From 7b9d8916e5dde83bb982e0950617738cbfc25f21 Mon Sep 17 00:00:00 2001 From: Vishwak Srinivasan Date: Fri, 20 Jul 2018 09:12:55 -0700 Subject: [PATCH 01/23] Fix integral type dispatch error message (#9625) Summary: This fix will prevent errors like (found in `bincount`) ``` RuntimeError: %s not implemented for '%s'bincounttorch.FloatTensor ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/9625 Differential Revision: D8932945 Pulled By: soumith fbshipit-source-id: 794e3b58d662779402ab318e274661826a5db8b2 --- aten/src/ATen/Dispatch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 63b2d37258d60..6cd8722316297 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -43,7 +43,7 @@ AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ default: \ - AT_ERROR("%s not implemented for '%s'", (NAME), the_type.toString()); \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ } \ }() From f84fdc7866a566d2a1c31d5006b24c6d6e841846 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Fri, 20 Jul 2018 09:36:50 -0700 Subject: [PATCH 02/23] Remove unnecessary functions from StorageDerived.h Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9553 Reviewed By: ezyang Differential Revision: D8915526 Pulled By: cpuhrsch fbshipit-source-id: 32013d3aa58a1a68637f99ee619d06e27fadaad6 --- aten/src/ATen/Storage.h | 54 ++++++++----- aten/src/ATen/templates/StorageDerived.cpp | 93 ++-------------------- aten/src/ATen/templates/StorageDerived.h | 28 +------ aten/src/ATen/test/scalar_test.cpp | 4 - aten/src/TH/TH.h | 2 +- aten/src/TH/THAllocator.h | 2 +- aten/src/TH/THBlas.h | 2 +- aten/src/TH/THDiskFile.cpp | 2 +- aten/src/TH/THFilePrivate.h | 2 +- aten/src/TH/THGeneral.cpp | 2 +- aten/src/TH/THHalf.h | 2 +- aten/src/TH/THLapack.h | 2 +- aten/src/TH/THLogAdd.h | 2 +- aten/src/TH/THRandom.cpp | 2 +- aten/src/TH/THRandom.h | 2 +- aten/src/TH/THSize.h | 2 +- aten/src/TH/THStorageFunctions.h | 2 +- aten/src/TH/THVector.h | 2 +- aten/src/TH/vector/AVX.cpp | 2 +- aten/src/TH/vector/AVX.h | 2 +- aten/src/TH/vector/AVX2.h | 2 +- aten/src/THC/THCGeneral.h.in | 2 +- 22 files changed, 62 insertions(+), 153 deletions(-) diff --git a/aten/src/ATen/Storage.h b/aten/src/ATen/Storage.h index 582a82a0c0aad..3239eb9d54adf 100644 --- a/aten/src/ATen/Storage.h +++ b/aten/src/ATen/Storage.h @@ -1,6 +1,7 @@ #pragma once #include "ATen/Scalar.h" +#include namespace at { @@ -10,32 +11,43 @@ struct Storage { static const char RESIZABLE = 2; Storage() {} + Storage(THStorage* storage) + : storage(storage) {} Storage(const Storage& other) = delete; + virtual ~Storage() { + THStorage_free(storage); + } void operator=(const Storage&) = delete; - virtual ~Storage() {}; virtual size_t elementSize() const = 0; - virtual size_t size() const = 0; - virtual void* data() = 0; - virtual const void* data() const = 0; - virtual Storage& retain() = 0; - virtual Storage& free() = 0; - virtual void * unsafeGetTH(bool retain) const = 0; - - virtual Storage& resize(int64_t new_size) = 0; - + size_t size() const { + return storage->size; + }; + void* data() { + return storage->data_ptr.get(); + }; + const void* data() const { + return storage->data_ptr.get(); + }; + void* unsafeGetTH(bool retain_) const { + if (retain_) { + THStorage_retain(storage); + } + return storage; + } + void retain() { + THStorage_retain(storage); + } virtual 
Type & type() const = 0; - virtual int getDevice() const = 0; - virtual const char * toString() const = 0; - - virtual Storage& fill(Scalar value) = 0; - virtual Storage& set(size_t ind, Scalar value) = 0; - virtual Storage& fast_set(size_t ind, Scalar value) = 0; - virtual Scalar get(size_t ind) = 0; - virtual Scalar fast_get(size_t ind) = 0; - - virtual void set_flag(char flag) = 0; - virtual void clear_flag(char flag) = 0; + int getDevice() const { + return storage->data_ptr.device().index(); + } + void clear_flag(char flag) { + THStorage_clearFlag(storage, flag); + } + + protected: + THStorage *storage; }; } // namespace at diff --git a/aten/src/ATen/templates/StorageDerived.cpp b/aten/src/ATen/templates/StorageDerived.cpp index 83e10b9986d56..28e17e0d0c168 100644 --- a/aten/src/ATen/templates/StorageDerived.cpp +++ b/aten/src/ATen/templates/StorageDerived.cpp @@ -11,17 +11,16 @@ namespace at { ${Storage}::${Storage}(Context* context): - storage(${THStorage}_new(${state})), context(context) {} + Storage(${THStorage}_new(${state})), context(context) {} ${Storage}::${Storage}(Context* context, THStorage* storage): - storage(storage), context(context) {} + Storage(storage), context(context) {} ${Storage}::${Storage}(Context* context, size_t storage_size) - : storage(${THStorage}_newWithSize(${state,} storage_size)), context(context) {} + : Storage(${THStorage}_newWithSize(${state,} storage_size)), context(context) {} ${Storage}::${Storage}(Context* context, size_t size, Allocator* allocator) - : storage(nullptr), - context(context) { + : Storage(nullptr), context(context) { storage = ${THStorage}_newWithAllocator(${state,} size, allocator); ${THStorage}_clearFlag(${state,} storage, TH_STORAGE_RESIZABLE); } @@ -38,7 +37,7 @@ static int getPointerDevice(void* ptr) { ${Storage}::${Storage}(Context* context, void * data, size_t size, const std::function & deleter) - : storage(${THStorage}_newWithDataAndAllocator(${state,} + : Storage(${THStorage}_newWithDataAndAllocator(${state,} InefficientStdFunctionContext::makeDataPtr(data, deleter, #if ${isCUDA} Device(kCUDA, getPointerDevice(data)) @@ -47,98 +46,20 @@ static int getPointerDevice(void* ptr) { #endif ), size, /* allocator */ nullptr - )), - context(context) { + )), context(context) { ${THStorage}_clearFlag(${state,} storage, TH_STORAGE_RESIZABLE); } -${Storage}::~${Storage}() { - ${THStorage}_free(${state,} storage); -} +${Storage}::~${Storage}() { } size_t ${Storage}::elementSize() const { return sizeof(${ScalarType}); } -size_t ${Storage}::size() const { - return storage->size; -} - -void* ${Storage}::data() { - return storage->data_ptr.get(); -} - -const void* ${Storage}::data() const { - return storage->data_ptr.get(); -} - -auto ${Storage}::retain() -> ${Storage}& { - ${THStorage}_retain(${state,} storage); - return *this; -} - -auto ${Storage}::free() -> ${Storage}& { - ${THStorage}_free(${state,} storage); - return *this; -} - -void* ${Storage}::unsafeGetTH(bool retain) const { - if (retain) { - ${THStorage}_retain(${state,} storage); - } - return storage; -} - -auto ${Storage}::resize(int64_t new_size) -> ${Storage}& { - ${THStorage}_resize(${state,} storage, new_size); - return *this; -} - -auto ${Storage}::fill(Scalar value) -> ${Storage}& { - ${THStorage}_fill(${state,} storage, ${to_th_type}(value.to${ScalarName}())); - return *this; -} - -auto ${Storage}::set(size_t ind, Scalar value) -> ${Storage}& { - ${THStorage}_set(${state,} storage, ind, ${to_th_type}(value.to${ScalarName}())); - return *this; -} - -auto 
${Storage}::fast_set(size_t ind, Scalar value) -> ${Storage}& { - throw std::runtime_error("unsupported operation 'fast_set'"); -} - -auto ${Storage}::get(size_t ind) -> Scalar { - // static cast to fix long -> int64_t issues - return static_cast<${ScalarType}>(${to_at_type}(${THStorage}_get(${state,} storage, ind))); -} - -auto ${Storage}::fast_get(size_t ind) -> Scalar { - if(${isCUDA}) - throw std::runtime_error("unsupported operation 'fast_get'"); - return static_cast<${ScalarType}>(${to_at_type}(storage->unsafe_data<${THScalarType}>()[ind])); -} - -void ${Storage}::set_flag(char flag) { - ${THStorage}_setFlag(${state,} storage, flag); -} - -void ${Storage}::clear_flag(char flag) { - ${THStorage}_clearFlag(${state,} storage, flag); -} - -int ${Storage}::getDevice() const { - return storage->data_ptr.device().index(); -} - Type& ${Storage}::type() const { return context->getType(Backend::${Backend},ScalarType::${ScalarName}); } -const char * ${Storage}::toString() const { - return "${Storage}"; -} - const char * ${Storage}::typeString() { return "${Type}"; } diff --git a/aten/src/ATen/templates/StorageDerived.h b/aten/src/ATen/templates/StorageDerived.h index d97d397c8e7ca..8cfa8c1d01d3b 100644 --- a/aten/src/ATen/templates/StorageDerived.h +++ b/aten/src/ATen/templates/StorageDerived.h @@ -21,36 +21,16 @@ struct ${Storage} final : public Storage { ${Storage}(Context* context, size_t size, Allocator* allocator); ${Storage}(Context* context, void * data, size_t size, const std::function & deleter); - virtual ~${Storage}(); - - virtual size_t elementSize() const override; - virtual size_t size() const override; - virtual void* data() override; - virtual const void* data() const override; - virtual ${Storage}& retain() override; - virtual ${Storage}& free() override; - virtual void * unsafeGetTH(bool retain) const override; - - virtual ${Storage}& resize(int64_t new_size) override; - virtual ${Storage}& fill(Scalar value) override; - virtual ${Storage}& set(size_t ind, Scalar value) override; - virtual ${Storage}& fast_set(size_t ind, Scalar value) override; - virtual Scalar get(size_t ind) override; - virtual Scalar fast_get(size_t ind) override; - - virtual void set_flag(char flag) override; - virtual void clear_flag(char flag) override; - - virtual Type& type() const override; - virtual int getDevice() const override; - virtual const char * toString() const override; + ~${Storage}(); + size_t elementSize() const final; + + Type& type() const final; static const char * typeString(); protected: friend struct ${Type}; - THStorage *storage; Context* context; }; diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index ccdab08dc8208..2880004555a74 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -88,10 +88,6 @@ TEST_CASE( "scalar test", "[]" ) { auto t2 = zeros({4,4}, CUDAFloat); cout << &t2 << "\n"; cout << "AFTER GET TYPE " << &CUDAFloat << "\n"; - auto s = CUDAFloat.storage(4); - REQUIRE( s->get(3).toFloat() == 0.0 ); - s->fill(7); - REQUIRE( s->get(3).toFloat() == 7.0 ); } auto t = ones({4,4}); diff --git a/aten/src/TH/TH.h b/aten/src/TH/TH.h index 1faf6e52b5a46..46aa394063fbd 100644 --- a/aten/src/TH/TH.h +++ b/aten/src/TH/TH.h @@ -1,7 +1,7 @@ #ifndef TH_INC #define TH_INC -#include "THGeneral.h" +#include #include "THBlas.h" #ifdef USE_LAPACK diff --git a/aten/src/TH/THAllocator.h b/aten/src/TH/THAllocator.h index 460f23873fff0..578d689400baf 100644 --- a/aten/src/TH/THAllocator.h +++ 
b/aten/src/TH/THAllocator.h @@ -1,6 +1,6 @@ #pragma once -#include "THGeneral.h" +#include #ifdef __cplusplus #include diff --git a/aten/src/TH/THBlas.h b/aten/src/TH/THBlas.h index 5fef0febcd5e9..284cd7c42c12c 100644 --- a/aten/src/TH/THBlas.h +++ b/aten/src/TH/THBlas.h @@ -1,7 +1,7 @@ #ifndef TH_BLAS_INC #define TH_BLAS_INC -#include "THGeneral.h" +#include #define THBlas_(NAME) TH_CONCAT_4(TH,Real,Blas_,NAME) diff --git a/aten/src/TH/THDiskFile.cpp b/aten/src/TH/THDiskFile.cpp index 258ad2cbca6a4..a806df6ac62eb 100644 --- a/aten/src/TH/THDiskFile.cpp +++ b/aten/src/TH/THDiskFile.cpp @@ -1,4 +1,4 @@ -#include "THGeneral.h" +#include #include "THDiskFile.h" #include "THFilePrivate.h" diff --git a/aten/src/TH/THFilePrivate.h b/aten/src/TH/THFilePrivate.h index 93bbaa0287ad8..d493510523f3f 100644 --- a/aten/src/TH/THFilePrivate.h +++ b/aten/src/TH/THFilePrivate.h @@ -1,4 +1,4 @@ -#include "THGeneral.h" +#include #include "THHalf.h" diff --git a/aten/src/TH/THGeneral.cpp b/aten/src/TH/THGeneral.cpp index 1b89e17dce997..b7575b6c8add1 100644 --- a/aten/src/TH/THGeneral.cpp +++ b/aten/src/TH/THGeneral.cpp @@ -1,4 +1,4 @@ -#include "THGeneral.h" +#include #ifdef _OPENMP #include diff --git a/aten/src/TH/THHalf.h b/aten/src/TH/THHalf.h index 0f9807b50263f..5ff85eb2c8f40 100644 --- a/aten/src/TH/THHalf.h +++ b/aten/src/TH/THHalf.h @@ -1,7 +1,7 @@ #ifndef TH_HALF_H #define TH_HALF_H -#include "THGeneral.h" +#include #include /* Neither built-in nor included from Cutorch, use our definition lifted from CUDA */ diff --git a/aten/src/TH/THLapack.h b/aten/src/TH/THLapack.h index 614d15f940f77..614e4becfa65c 100644 --- a/aten/src/TH/THLapack.h +++ b/aten/src/TH/THLapack.h @@ -1,7 +1,7 @@ #ifndef TH_LAPACK_INC #define TH_LAPACK_INC -#include "THGeneral.h" +#include #define THLapack_(NAME) TH_CONCAT_4(TH,Real,Lapack_,NAME) diff --git a/aten/src/TH/THLogAdd.h b/aten/src/TH/THLogAdd.h index 9319b8f4643be..60acaedc14ea3 100644 --- a/aten/src/TH/THLogAdd.h +++ b/aten/src/TH/THLogAdd.h @@ -1,7 +1,7 @@ #ifndef TH_LOG_ADD_INC #define TH_LOG_ADD_INC -#include "THGeneral.h" +#include TH_API const double THLog2Pi; TH_API const double THLogZero; diff --git a/aten/src/TH/THRandom.cpp b/aten/src/TH/THRandom.cpp index 8755f774f1d79..6477b13fc46b3 100644 --- a/aten/src/TH/THRandom.cpp +++ b/aten/src/TH/THRandom.cpp @@ -1,4 +1,4 @@ -#include "THGeneral.h" +#include #include "THRandom.h" #include "THGenerator.hpp" diff --git a/aten/src/TH/THRandom.h b/aten/src/TH/THRandom.h index 5460d330d14a2..3641397635521 100644 --- a/aten/src/TH/THRandom.h +++ b/aten/src/TH/THRandom.h @@ -1,7 +1,7 @@ #ifndef TH_RANDOM_INC #define TH_RANDOM_INC -#include "THGeneral.h" +#include #define _MERSENNE_STATE_N 624 #define _MERSENNE_STATE_M 397 diff --git a/aten/src/TH/THSize.h b/aten/src/TH/THSize.h index 2927f21d51262..c190a310772cf 100644 --- a/aten/src/TH/THSize.h +++ b/aten/src/TH/THSize.h @@ -1,7 +1,7 @@ #ifndef TH_SIZE_INC #define TH_SIZE_INC -#include "THGeneral.h" +#include #include // THTensor functions that would work on a THSize if we had such a class in C++, diff --git a/aten/src/TH/THStorageFunctions.h b/aten/src/TH/THStorageFunctions.h index ce53827b9f6fc..edd1b6b80c93f 100644 --- a/aten/src/TH/THStorageFunctions.h +++ b/aten/src/TH/THStorageFunctions.h @@ -1,6 +1,6 @@ #pragma once -#include "THGeneral.h" +#include #include "THAllocator.h" #define THStorage_(NAME) TH_CONCAT_4(TH,Real,Storage_,NAME) diff --git a/aten/src/TH/THVector.h b/aten/src/TH/THVector.h index 8054f648e8a4c..e5684dd5be7b7 100644 --- a/aten/src/TH/THVector.h 
+++ b/aten/src/TH/THVector.h @@ -1,7 +1,7 @@ #ifndef TH_VECTOR_INC #define TH_VECTOR_INC -#include "THGeneral.h" +#include #include "THMath.h" #define THVector_(NAME) TH_CONCAT_4(TH,Real,Vector_,NAME) diff --git a/aten/src/TH/vector/AVX.cpp b/aten/src/TH/vector/AVX.cpp index b39b803c86c69..507eb2276ecf7 100644 --- a/aten/src/TH/vector/AVX.cpp +++ b/aten/src/TH/vector/AVX.cpp @@ -6,7 +6,7 @@ #endif #include "AVX.h" -#include "THGeneral.h" +#include void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n) { ptrdiff_t i; diff --git a/aten/src/TH/vector/AVX.h b/aten/src/TH/vector/AVX.h index 6fd183c6efd90..c4e47cbfcfc00 100644 --- a/aten/src/TH/vector/AVX.h +++ b/aten/src/TH/vector/AVX.h @@ -1,7 +1,7 @@ #ifndef TH_AVX_H #define TH_AVX_H -#include "THGeneral.h" +#include #include TH_API void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n); diff --git a/aten/src/TH/vector/AVX2.h b/aten/src/TH/vector/AVX2.h index 1c281d8c50e50..c78cc49d2a9ed 100644 --- a/aten/src/TH/vector/AVX2.h +++ b/aten/src/TH/vector/AVX2.h @@ -1,7 +1,7 @@ #ifndef TH_AVX2_H #define TH_AVX2_H -#include "THGeneral.h" +#include #include #include diff --git a/aten/src/THC/THCGeneral.h.in b/aten/src/THC/THCGeneral.h.in index 427591684ea1f..3ba4ed9719b5e 100644 --- a/aten/src/THC/THCGeneral.h.in +++ b/aten/src/THC/THCGeneral.h.in @@ -1,7 +1,7 @@ #ifndef THC_GENERAL_INC #define THC_GENERAL_INC -#include "THGeneral.h" +#include #include "THAllocator.h" #include "THCThreadLocal.h" #undef log10 From bbb30ad4ab86f06f90408fdba2d311088be7d55d Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Fri, 20 Jul 2018 09:41:47 -0700 Subject: [PATCH 03/23] Use THTensor/Storage for THVoidTensor/Storage (#9588) Summary: Change akin to change for THVoidStorage. Pull Request resolved: https://github.com/pytorch/pytorch/pull/9588 Reviewed By: gchanan Differential Revision: D8915559 Pulled By: cpuhrsch fbshipit-source-id: 6cc69df0e29942c62750f990903dfd8e4d344581 --- torch/csrc/Types.h | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/torch/csrc/Types.h b/torch/csrc/Types.h index 20e5ceec5bdc7..8342c7a94b0aa 100644 --- a/torch/csrc/Types.h +++ b/torch/csrc/Types.h @@ -12,18 +12,9 @@ template struct THPTypeInfo {}; namespace torch { -typedef THFloatStorage THVoidStorage; // all THXXXStorage types are the same. - -typedef struct THVoidTensor -{ - int64_t *size; - int64_t *stride; - int nDimension; - THVoidStorage *storage; - ptrdiff_t storageOffset; - int refcount; - char flag; -} THVoidTensor; +typedef THStorage THVoidStorage; + +typedef THTensor THVoidTensor; } // namespace torch From b9e89cf9fdf134affc184d36296bb6ac321aec91 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 20 Jul 2018 10:18:02 -0700 Subject: [PATCH 04/23] Revert "Extend DispatchStub to support CUDA dispatch (#9579)" (#9614) Summary: This reverts commit bcf0bf42a1727c8ee788f733c28579d0e36a387c. The commit was causing issues for some internal FB projects. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9614 Reviewed By: Yangqing Differential Revision: D8929552 Pulled By: colesbury fbshipit-source-id: ae9026ad8762a4c5de401273694b4c878fc241a6 --- .jenkins/pytorch/test.sh | 9 ++- aten/src/ATen/native/DispatchStub.cpp | 44 ---------- aten/src/ATen/native/ReduceOps.cpp | 11 +-- aten/src/ATen/native/SoftMax.cpp | 14 +--- aten/src/ATen/native/UnaryOps.cpp | 28 +------ .../CapabilityDispatch.h} | 80 ++++++++----------- aten/src/ATen/native/cpu/ReduceOpsKernel.h | 2 +- aten/src/ATen/native/cpu/SoftmaxKernel.h | 2 +- aten/src/ATen/native/cpu/UnaryOpsKernel.cpp | 2 +- aten/src/ATen/native/cpu/UnaryOpsKernel.h | 2 +- 10 files changed, 55 insertions(+), 139 deletions(-) delete mode 100644 aten/src/ATen/native/DispatchStub.cpp rename aten/src/ATen/native/{DispatchStub.h => cpu/CapabilityDispatch.h} (51%) diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 7e5b98ee628cd..e4c553dd4e652 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -44,10 +44,13 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)") fi +export ATEN_DISABLE_AVX= +export ATEN_DISABLE_AVX2= if [[ "${JOB_BASE_NAME}" == *-NO_AVX-* ]]; then - export ATEN_CPU_CAPABILITY=default -elif [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then - export ATEN_CPU_CAPABILITY=avx + export ATEN_DISABLE_AVX=1 +fi +if [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then + export ATEN_DISABLE_AVX2=1 fi test_python_nn() { diff --git a/aten/src/ATen/native/DispatchStub.cpp b/aten/src/ATen/native/DispatchStub.cpp deleted file mode 100644 index 662ae580c599a..0000000000000 --- a/aten/src/ATen/native/DispatchStub.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include "DispatchStub.h" - -#include - -#include -#include -#include - -namespace at { namespace native { - -static CPUCapability compute_cpu_capability() { - auto envar = std::getenv("ATEN_CPU_CAPABILITY"); - if (envar) { - if (strcmp(envar, "avx2") == 0) { - return CPUCapability::AVX2; - } - if (strcmp(envar, "avx") == 0) { - return CPUCapability::AVX; - } - if (strcmp(envar, "default") == 0) { - return CPUCapability::DEFAULT; - } - AT_WARN("ignoring invalid value for ATEN_CPU_CAPABILITY: ", envar); - } - -#ifndef __powerpc__ - if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx2() && cpuinfo_has_x86_fma3()) { - return CPUCapability::AVX2; - } - if (cpuinfo_has_x86_avx()) { - return CPUCapability::AVX; - } - } -#endif - return CPUCapability::DEFAULT; -} - -CPUCapability get_cpu_capability() { - static CPUCapability capability = compute_cpu_capability(); - return capability; -} - -}} // namespace at::native diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index d055a91afa759..affa9d24059d9 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -17,9 +17,6 @@ namespace at { namespace native { -DispatchStub sum_kernel; -DispatchStub prod_kernel; - static inline Tensor integer_upcast(const Tensor& self, optional dtype) { ScalarType scalarType = self.type().scalarType(); ScalarType upcast_scalarType = dtype.value_or(at::isIntegralType(scalarType) ? 
ScalarType::Long : scalarType); @@ -130,7 +127,7 @@ Tensor sum(const Tensor &self) { Tensor _sum_cpu(const Tensor& self) { if (self.is_contiguous()) { Tensor result = at::empty({}, self.type()); - sum_kernel(kCPU, result, self, at::nullopt); + sum_kernel(result, self, at::nullopt); return result; } return self._sumall(); @@ -151,7 +148,7 @@ Tensor prod(const Tensor &self) { Tensor _prod_cpu(const Tensor &self) { if (self.is_contiguous()) { Tensor result = at::empty({}, self.type()); - prod_kernel(kCPU, result, self, at::nullopt); + prod_kernel(result, self, at::nullopt); return result; } return self._prodall(); @@ -225,7 +222,7 @@ Tensor &_sum_out_cpu(Tensor &result, const Tensor &self, int64_t dim_, return result; if (self.is_contiguous() && result.is_contiguous()) { _dimreduce_setup(result, self, dim); - sum_kernel(kCPU, result, self, dim); + sum_kernel(result, self, dim); if (!keepdim) result.squeeze_(dim); return result; } @@ -263,7 +260,7 @@ Tensor &_prod_out_cpu(Tensor &result, const Tensor &self, int64_t dim_, return result; if (self.is_contiguous() && result.is_contiguous()) { _dimreduce_setup(result, self, dim); - prod_kernel(kCPU, result, self, dim); + prod_kernel(result, self, dim); if (!keepdim) result.squeeze_(dim); return result; } diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index 56e0ab6ca6ba1..546c75829a6d5 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -128,7 +128,7 @@ Tensor softmax_cpu(const Tensor& input_, const int64_t dim_) { dim >= 0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); if (input.ndimension() > 0 && dim == input.ndimension() - 1) { - softmax_lastdim_kernel(kCPU, output, input); + softmax_lastdim_kernel(output, input); } else { AT_DISPATCH_FLOATING_TYPES(input.type(), "softmax", [&] { host_softmax(output, input, dim); @@ -147,7 +147,7 @@ Tensor log_softmax_cpu(const Tensor& input_, const int64_t dim_) { dim >= 0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); if (input.ndimension() > 0 && dim == input.ndimension() - 1) { - log_softmax_lastdim_kernel(kCPU, output, input); + log_softmax_lastdim_kernel(output, input); } else { AT_DISPATCH_FLOATING_TYPES(input.type(), "log_softmax", [&] { host_softmax(output, input, dim); @@ -176,7 +176,7 @@ Tensor softmax_backward_cpu( dim >= 0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { - softmax_backward_lastdim_kernel(kCPU, grad_input, grad, output); + softmax_backward_lastdim_kernel(grad_input, grad, output); } else { AT_DISPATCH_FLOATING_TYPES(grad.type(), "softmax_backward", [&] { host_softmax_backward(grad_input, grad, output, dim); @@ -205,7 +205,7 @@ Tensor log_softmax_backward_cpu( dim >= 0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { - log_softmax_backward_lastdim_kernel(kCPU, grad_input, grad, output); + log_softmax_backward_lastdim_kernel(grad_input, grad, output); } else { AT_DISPATCH_FLOATING_TYPES(grad.type(), "log_softmax_backward", [&] { host_softmax_backward(grad_input, grad, output, dim); @@ -213,11 +213,5 @@ Tensor log_softmax_backward_cpu( } return grad_input; } - -DispatchStub softmax_lastdim_kernel; -DispatchStub log_softmax_lastdim_kernel; -DispatchStub softmax_backward_lastdim_kernel; -DispatchStub log_softmax_backward_lastdim_kernel; - } } diff --git 
a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 17f5a437b0015..dbfc623b0ccba 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -92,14 +92,14 @@ Tensor& fill_(Tensor& self, const Tensor& value) { Tensor& _##op##__cpu(Tensor& self_) { \ if (self_.numel() > 0) { \ Tensor self = sort_strides(self_); \ - op##Impl(kCPU, self, self); \ + op##Impl(self, self); \ } \ return self_; \ } \ Tensor& _##op##_out_cpu(Tensor& result, const Tensor& self) { \ result.resize_(self.sizes()); \ if (result.numel() > 0) { \ - op##Impl(kCPU, result, self); \ + op##Impl(result, self); \ } \ return result; \ } @@ -145,29 +145,5 @@ IMPLEMENT_UNARY_OP_VEC(tan) IMPLEMENT_UNARY_OP_VEC(tanh) IMPLEMENT_UNARY_OP_VEC(trunc) -DispatchStub absImpl; -DispatchStub acosImpl; -DispatchStub asinImpl; -DispatchStub atanImpl; -DispatchStub ceilImpl; -DispatchStub cosImpl; -DispatchStub erfImpl; -DispatchStub erfcImpl; -DispatchStub expImpl; -DispatchStub expm1Impl; -DispatchStub floorImpl; -DispatchStub logImpl; -DispatchStub log10Impl; -DispatchStub log1pImpl; -DispatchStub log2Impl; -DispatchStub roundImpl; -DispatchStub rsqrtImpl; -DispatchStub sigmoidImpl; -DispatchStub sinImpl; -DispatchStub sqrtImpl; -DispatchStub tanImpl; -DispatchStub tanhImpl; -DispatchStub truncImpl; - } } // namespace at diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/cpu/CapabilityDispatch.h similarity index 51% rename from aten/src/ATen/native/DispatchStub.h rename to aten/src/ATen/native/cpu/CapabilityDispatch.h index bbdf07a8458bf..6cb0f279872d6 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/cpu/CapabilityDispatch.h @@ -1,8 +1,8 @@ #pragma once -#include -#include +#include #include +#include // Implements instruction set specific function dispatch. // @@ -23,82 +23,72 @@ // REGISTER_DISPATCH(stub, &kernel); // // To call: -// stub(kCPU, tensor); +// stub(tensor); // namespace at { namespace native { -enum class CPUCapability { - DEFAULT = 0, - AVX = 1, - AVX2 = 2, - NUM_OPTIONS -}; - -CPUCapability get_cpu_capability(); +enum class CPUCapability { DEFAULT, AVX, AVX2, NUM_OPTIONS }; template struct DispatchStub { static_assert(std::is_pointer::value, "FnPtr should be a pointer type"); template - void operator()(Backend backend, ArgTypes... args) { - if (backend == Backend::CPU) { - if (!dispatch_ptr) { - dispatch_ptr = choose_cpu_impl(); - } - (*dispatch_ptr)(args...); - } else if (backend == Backend::CUDA) { - AT_ASSERTM(cuda_dispatch_ptr, "DispatchStub: missing CUDA kernel"); - (*cuda_dispatch_ptr)(args...); - } else { - AT_ERROR("DispatchStub: unsupported backend", backend); + void operator()(ArgTypes... 
args) { + if (!dispatch_ptr) { + dispatch_ptr = choose_impl(); } + (*dispatch_ptr)(args...); } - FnPtr choose_cpu_impl() { - int def = static_cast(CPUCapability::DEFAULT); - int avx = static_cast(CPUCapability::AVX); - int avx2 = static_cast(CPUCapability::AVX2); - - auto capability = static_cast(get_cpu_capability()); - if (capability >= avx2 && table[avx2]) { - return table[avx2]; - } - if (capability >= avx && table[avx]) { - return table[avx]; + FnPtr choose_impl() { +// Do not use cpuinfo on PowerPC as it shows confusing errors when run on ppc +#ifndef __powerpc__ + if (cpuinfo_initialize()) { + int avx2 = static_cast(CPUCapability::AVX2); + if (!std::getenv("ATEN_DISABLE_AVX2") && cpuinfo_has_x86_avx2() && + cpuinfo_has_x86_fma3() && table[avx2]) { + return table[avx2]; + } + int avx = static_cast(CPUCapability::AVX); + if (!std::getenv("ATEN_DISABLE_AVX") && cpuinfo_has_x86_avx() && table[avx]) { + return table[avx]; + } } +#endif + int def = static_cast(CPUCapability::DEFAULT); AT_ASSERTM(table[def], "DispatchStub: missing default kernel"); return table[def]; } FnPtr dispatch_ptr = nullptr; - FnPtr cuda_dispatch_ptr = nullptr; FnPtr table[static_cast(CPUCapability::NUM_OPTIONS)]; }; -#if defined(CPU_CAPABILITY) || defined(__CUDACC__) +#if defined(CPU_CAPABILITY) -namespace { +constexpr CPUCapability CURRENT_CAPABILITY = CPUCapability::CPU_CAPABILITY; -template +// Registers an implementation a kernel for the current CPU capability. +template struct RegisterDispatch { RegisterDispatch(DispatchStub& stub, FnPtr value) { -#if defined(__CUDACC__) - stub.cuda_dispatch_ptr = value; -#else - int cap = static_cast(CPUCapability::CPU_CAPABILITY); - AT_ASSERT(!stub.table[cap]) - stub.table[cap] = value; -#endif + stub.table[static_cast(CURRENT_CAPABILITY)] = value; } }; -} // anonymous namespace +// We only define the stub once in the DEFAULT capability compilation +#if defined(CPU_CAPABILITY_DEFAULT) +#define _DEFINE_STUB(stub, fn) DispatchStub stub +#else +#define _DEFINE_STUB(stub, fn) +#endif #define REGISTER_DISPATCH(stub, fn) \ + _DEFINE_STUB(stub, fn); \ static RegisterDispatch stub ## __register(stub, fn); #endif diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.h b/aten/src/ATen/native/cpu/ReduceOpsKernel.h index 5a7854d0094cd..9481b90fe7696 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.h +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.h @@ -1,8 +1,8 @@ #pragma once #include -#include #include +#include "CapabilityDispatch.h" namespace at { namespace native { diff --git a/aten/src/ATen/native/cpu/SoftmaxKernel.h b/aten/src/ATen/native/cpu/SoftmaxKernel.h index 39d7e68a8b20c..dbd703b6d3c02 100644 --- a/aten/src/ATen/native/cpu/SoftmaxKernel.h +++ b/aten/src/ATen/native/cpu/SoftmaxKernel.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include "CapabilityDispatch.h" namespace at { namespace native { diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 459838a9b6c68..7416923cfd886 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -4,7 +4,7 @@ #include "ATen/Dispatch.h" #include "ATen/cpu/vml.h" #include "ATen/CPUApplyUtils.h" -#include "ATen/native/DispatchStub.h" +#include "ATen/native/cpu/CapabilityDispatch.h" #ifdef __AVX2__ #include "ATen/native/cpu/avx_mathfun.h" #endif diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.h b/aten/src/ATen/native/cpu/UnaryOpsKernel.h index d4845760f7248..d9bffadd1e1fb 100644 --- 
a/aten/src/ATen/native/cpu/UnaryOpsKernel.h +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.h @@ -1,8 +1,8 @@ #pragma once #include -#include #include +#include "CapabilityDispatch.h" namespace at { namespace native { From 1d4d9fc7da4c09c64fa276a47032291e3d0f77b6 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Fri, 20 Jul 2018 10:49:39 -0700 Subject: [PATCH 05/23] Prepare to stop using attributes in the JIT (#9505) Summary: This PR adds machinery to cache the schema in an IR node, and allows lookups of (possibly) constant inputs by their names (instead of position). The new methods are: - `at::optional get(Symbol name)` - if the argument called name is a constant, then casts it to type `T` and returns it. If it's not constant returns `nullopt`. Raises an error if there's no argument with that name. - `at::optional get(Symbol name)` - like above, but packs the result in an IValue - `Value* getValue(Symbol name)` - retrieves a `Value*` for an argument (no need to know its position). All above functions currently inspect the attributes as well, but that's only so that I could start using them in other places in the JIT without disrupting our current functionality. I wanted this diff to be a preparation that doesn't change the semantics too much, and so both the tracer and script create nodes with attributes. The next PR will put that to a stop, and hopefully the changes we need to make to other components will be simpler thanks to what I did here. One more thing I'd like to do before actually stopping creating the non-attributed nodes is to have a convenient way of creating a schema programmatically, matching nodes against it, and creating them without having to pack inputs into flat argument lists (which is quite error prone). zdevito Pull Request resolved: https://github.com/pytorch/pytorch/pull/9505 Reviewed By: ezyang Differential Revision: D8915496 Pulled By: apaszke fbshipit-source-id: 39d14fc9a9d73d8494f128367bf70357dbba83f5 --- test/expect/TestJit.test_alexnet.expect | 37 ++--- test/expect/TestJit.test_trace_size.expect | 16 +- .../TestJit.test_trace_size_with_grad.expect | 16 +- tools/jit/gen_jit_dispatch.py | 1 + torch/csrc/jit/autodiff.cpp | 53 ++++--- torch/csrc/jit/function_schema.h | 2 +- torch/csrc/jit/fusion_compiler.cpp | 24 +-- torch/csrc/jit/init.cpp | 2 + torch/csrc/jit/ir.cpp | 145 ++++++++++++++++++ torch/csrc/jit/ir.h | 48 +++++- torch/csrc/jit/passes/batch_mm.cpp | 10 +- .../common_subexpression_elimination.cpp | 8 +- torch/csrc/jit/passes/decompose_addmm.cpp | 11 +- torch/csrc/jit/passes/graph_fuser.cpp | 11 +- torch/csrc/jit/passes/peephole.cpp | 45 +++--- torch/csrc/jit/passes/remove_expands.cpp | 2 +- torch/csrc/jit/passes/shape_analysis.cpp | 31 ++-- torch/csrc/jit/symbolic_variable.h | 17 ++ torch/csrc/jit/tensor_conversions.h | 8 +- torch/csrc/jit/tracer.cpp | 5 +- torch/onnx/symbolic.py | 4 +- 21 files changed, 358 insertions(+), 138 deletions(-) diff --git a/test/expect/TestJit.test_alexnet.expect b/test/expect/TestJit.test_alexnet.expect index 0d3a5f53fd376..3c71802b9ff26 100644 --- a/test/expect/TestJit.test_alexnet.expect +++ b/test/expect/TestJit.test_alexnet.expect @@ -28,22 +28,23 @@ graph(%0 : Double(1, 3, 224, 224) %29 : Double(1, 256, 13, 13) = aten::_convolution[stride=[1, 1], padding=[1, 1], dilation=[1, 1], transposed=0, output_padding=[0, 0], groups=1, benchmark=0, deterministic=0, cudnn_enabled=1](%28, %9, %10), scope: AlexNet/Sequential[features]/Conv2d[10] %30 : Double(1, 256, 13, 13) = aten::threshold[threshold={0}, value={0}](%29), scope: 
AlexNet/Sequential[features]/ReLU[11] %31 : Double(1, 256, 6, 6), %32 : Long(1, 256, 6, 6) = aten::max_pool2d_with_indices[kernel_size=[3, 3], stride=[2, 2], padding=[0, 0], dilation=[1, 1], ceil_mode=0](%30), scope: AlexNet/Sequential[features]/MaxPool2d[12] - %33 : Long() = aten::size[dim=0](%31), scope: AlexNet - %34 : Long() = prim::Constant[value={9216}](), scope: AlexNet - %35 : Dynamic = aten::stack[dim=0](%33, %34), scope: AlexNet - %36 : Double(1, 9216) = aten::view(%31, %35), scope: AlexNet - %37 : Double(1, 9216) = ^Dropout(0.5, True, False)(%36), scope: AlexNet/Sequential[classifier]/Dropout[0] - %38 : Double(9216!, 4096!) = aten::t(%11), scope: AlexNet/Sequential[classifier]/Linear[1] - %39 : Double(1, 4096) = aten::expand[size=[1, 4096], implicit=1](%12), scope: AlexNet/Sequential[classifier]/Linear[1] - %40 : Double(1, 4096) = aten::addmm[beta={1}, alpha={1}](%39, %37, %38), scope: AlexNet/Sequential[classifier]/Linear[1] - %41 : Double(1, 4096) = aten::threshold[threshold={0}, value={0}](%40), scope: AlexNet/Sequential[classifier]/ReLU[2] - %42 : Double(1, 4096) = ^Dropout(0.5, True, False)(%41), scope: AlexNet/Sequential[classifier]/Dropout[3] - %43 : Double(4096!, 4096!) = aten::t(%13), scope: AlexNet/Sequential[classifier]/Linear[4] - %44 : Double(1, 4096) = aten::expand[size=[1, 4096], implicit=1](%14), scope: AlexNet/Sequential[classifier]/Linear[4] - %45 : Double(1, 4096) = aten::addmm[beta={1}, alpha={1}](%44, %42, %43), scope: AlexNet/Sequential[classifier]/Linear[4] - %46 : Double(1, 4096) = aten::threshold[threshold={0}, value={0}](%45), scope: AlexNet/Sequential[classifier]/ReLU[5] - %47 : Double(4096!, 1000!) = aten::t(%15), scope: AlexNet/Sequential[classifier]/Linear[6] - %48 : Double(1, 1000) = aten::expand[size=[1, 1000], implicit=1](%16), scope: AlexNet/Sequential[classifier]/Linear[6] - %49 : Double(1, 1000) = aten::addmm[beta={1}, alpha={1}](%48, %46, %47), scope: AlexNet/Sequential[classifier]/Linear[6] - return (%49); + %33 : Long() = prim::Constant[value={0}](), scope: AlexNet + %34 : Long() = aten::size(%31, %33), scope: AlexNet + %35 : Long() = prim::Constant[value={9216}](), scope: AlexNet + %36 : Dynamic = aten::stack[dim=0](%34, %35), scope: AlexNet + %37 : Double(1, 9216) = aten::view(%31, %36), scope: AlexNet + %38 : Double(1, 9216) = ^Dropout(0.5, True, False)(%37), scope: AlexNet/Sequential[classifier]/Dropout[0] + %39 : Double(9216!, 4096!) = aten::t(%11), scope: AlexNet/Sequential[classifier]/Linear[1] + %40 : Double(1, 4096) = aten::expand[size=[1, 4096], implicit=1](%12), scope: AlexNet/Sequential[classifier]/Linear[1] + %41 : Double(1, 4096) = aten::addmm[beta={1}, alpha={1}](%40, %38, %39), scope: AlexNet/Sequential[classifier]/Linear[1] + %42 : Double(1, 4096) = aten::threshold[threshold={0}, value={0}](%41), scope: AlexNet/Sequential[classifier]/ReLU[2] + %43 : Double(1, 4096) = ^Dropout(0.5, True, False)(%42), scope: AlexNet/Sequential[classifier]/Dropout[3] + %44 : Double(4096!, 4096!) = aten::t(%13), scope: AlexNet/Sequential[classifier]/Linear[4] + %45 : Double(1, 4096) = aten::expand[size=[1, 4096], implicit=1](%14), scope: AlexNet/Sequential[classifier]/Linear[4] + %46 : Double(1, 4096) = aten::addmm[beta={1}, alpha={1}](%45, %43, %44), scope: AlexNet/Sequential[classifier]/Linear[4] + %47 : Double(1, 4096) = aten::threshold[threshold={0}, value={0}](%46), scope: AlexNet/Sequential[classifier]/ReLU[5] + %48 : Double(4096!, 1000!) 
= aten::t(%15), scope: AlexNet/Sequential[classifier]/Linear[6] + %49 : Double(1, 1000) = aten::expand[size=[1, 1000], implicit=1](%16), scope: AlexNet/Sequential[classifier]/Linear[6] + %50 : Double(1, 1000) = aten::addmm[beta={1}, alpha={1}](%49, %47, %48), scope: AlexNet/Sequential[classifier]/Linear[6] + return (%50); } diff --git a/test/expect/TestJit.test_trace_size.expect b/test/expect/TestJit.test_trace_size.expect index 153104589ca5b..1c6fdcd6eba00 100644 --- a/test/expect/TestJit.test_trace_size.expect +++ b/test/expect/TestJit.test_trace_size.expect @@ -1,9 +1,11 @@ graph(%0 : Double(5, 2, 4)) { - %1 : Long() = aten::size[dim=1](%0) - %2 : Long() = aten::mul[other={2}](%1) - %3 : Long() = aten::size[dim=0](%0) - %4 : Long() = prim::Constant[value={2}]() - %5 : Dynamic = aten::stack[dim=0](%2, %3, %4) - %6 : Double(4, 5, 2) = aten::view(%0, %5) - return (%6); + %1 : Long() = prim::Constant[value={1}]() + %2 : Long() = aten::size(%0, %1) + %3 : Long() = aten::mul[other={2}](%2) + %4 : Long() = prim::Constant[value={0}]() + %5 : Long() = aten::size(%0, %4) + %6 : Long() = prim::Constant[value={2}]() + %7 : Dynamic = aten::stack[dim=0](%3, %5, %6) + %8 : Double(4, 5, 2) = aten::view(%0, %7) + return (%8); } diff --git a/test/expect/TestJit.test_trace_size_with_grad.expect b/test/expect/TestJit.test_trace_size_with_grad.expect index 153104589ca5b..1c6fdcd6eba00 100644 --- a/test/expect/TestJit.test_trace_size_with_grad.expect +++ b/test/expect/TestJit.test_trace_size_with_grad.expect @@ -1,9 +1,11 @@ graph(%0 : Double(5, 2, 4)) { - %1 : Long() = aten::size[dim=1](%0) - %2 : Long() = aten::mul[other={2}](%1) - %3 : Long() = aten::size[dim=0](%0) - %4 : Long() = prim::Constant[value={2}]() - %5 : Dynamic = aten::stack[dim=0](%2, %3, %4) - %6 : Double(4, 5, 2) = aten::view(%0, %5) - return (%6); + %1 : Long() = prim::Constant[value={1}]() + %2 : Long() = aten::size(%0, %1) + %3 : Long() = aten::mul[other={2}](%2) + %4 : Long() = prim::Constant[value={0}]() + %5 : Long() = aten::size(%0, %4) + %6 : Long() = prim::Constant[value={2}]() + %7 : Dynamic = aten::stack[dim=0](%3, %5, %6) + %8 : Double(4, 5, 2) = aten::view(%0, %7) + return (%8); } diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index d6458f9c2337e..75a14b2d5550f 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -127,6 +127,7 @@ def from_tensor(arg): auto result = torch::${name}(${args}, options); """) +# TODO (apaszke): remove the attributed codepath once we remove them CONSTRUCTOR = CodeTemplate("""\ [](Node *node) { ${kw_assignments} diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index fdeb0ef13a8c3..1b7784916910e 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -11,8 +11,10 @@ namespace torch { namespace jit { using value_map = std::unordered_map; using value_set = std::unordered_set; -bool hasOneValuedAttribute(Node *n, torch::jit::Symbol name) { - return n->hasAttribute(name) && at::Scalar(n->t(name)).toDouble() == 1.0; +bool hasOneValuedInput(Node *n, torch::jit::Symbol name) { + auto maybe_t = n->get(name); + if (!maybe_t) return false; + return at::Scalar(*maybe_t).toDouble() == 1.0; } bool isDifferentiable(Node * n) { @@ -28,7 +30,7 @@ bool isDifferentiable(Node * n) { if (n->kind() == aten::addmm) { if (n->inputs().size() > 3) return false; - if (!hasOneValuedAttribute(n, attr::alpha) || !hasOneValuedAttribute(n, attr::beta)) + if (!hasOneValuedInput(n, attr::alpha) || !hasOneValuedInput(n, attr::beta)) 
return false; } if (n->kind() == aten::type_as && !n->inputs().at(1)->isTensor()) { @@ -65,7 +67,8 @@ bool outputRequiresGrad(Node* node, std::function requires_grad) { case aten::eq: return false; case aten::type_as: - //type_as has two inputs, the second of which (setting type) might require grad, but it still won't affect the output of type_as requiring grad. + // type_as has two inputs, the second of which (setting type) might require grad, + // but it still won't affect the output of type_as requiring grad. return requires_grad(node->inputs().at(0)); default: return std::any_of(node->inputs().begin(), node->inputs().end(), requires_grad); @@ -80,23 +83,30 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val auto outputs = fmap(node->outputs()); switch(node->kind()) { case aten::add: - // o = a - alpha*other - if(inputs.size() == 1) + // TODO (apaszke): remove formulas for attributed nodes once they are removed + // o = self + alpha*other + if(inputs.size() == 1) { return { grads.at(0) }; - // o = a + alpha*b - return {grads.at(0), grads.at(0) * at::Scalar(node->t(attr::alpha)) }; + } else if (node->hasAttribute(attr::alpha)) { + return {grads.at(0), grads.at(0) * at::Scalar(node->t(attr::alpha))}; + } else { + return {grads.at(0), nullptr, grads.at(0) * node->getValue(attr::alpha)}; + } case aten::sub: - // o = a - alpha*other - if(inputs.size() == 1) + // o = self - alpha*other + if(inputs.size() == 1) { return {grads.at(0)}; - // o = a - alpha*b - return {grads.at(0), -grads.at(0) * at::Scalar(node->t(attr::alpha))}; + } else if (node->hasAttribute(attr::alpha)) { + return {grads.at(0), -grads.at(0) * at::Scalar(node->t(attr::alpha))}; + } else { + return {grads.at(0), nullptr, grads.at(0) * node->getValue(attr::alpha)}; + } case aten::mul: - // o = a * other + // o = self * other if(inputs.size() == 1) return {grads.at(0) * at::Scalar(node->t(attr::other))}; - // o = a * b - return {grads.at(0) * inputs.at(1), grads.at(0) * inputs.at(0)}; + else + return {grads.at(0) * inputs.at(1), grads.at(0) * inputs.at(0)}; case prim::Constant: return {}; case aten::sigmoid: @@ -109,17 +119,18 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val return {grads.at(0) * (outputs.at(0))}; case aten::chunk: case aten::split: - return {SymbolicVariable::cat(grads, node->i(attr::dim))}; + return {SymbolicVariable::cat(grads, node->getValue(attr::dim))}; case aten::t: return {grads.at(0).t()}; case aten::neg: return {-grads.at(0)}; case aten::view: + // TODO: if sizes are not available statically, add an operator that reutrns them as a tuple return {grads.at(0).view(inputs.at(0).sizes())}; case aten::type_as: return {grads.at(0).type_as(inputs.at(0))}; case aten::unsqueeze: - return {grads.at(0).squeeze(node->i(attr::dim))}; + return {grads.at(0).squeeze(node->getValue(attr::dim))}; case aten::mm: { SymbolicVariable dmat1, dmat2; if (auto type = inputs.at(0).value()->type()->cast()) { @@ -148,7 +159,7 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val const auto& input_sizes = inputs.at(0).sizes(); if (input_sizes.size() == 0) return {grads.at(0).sum()}; - auto grad_sizes = node->is(attr::size); + auto grad_sizes = node->get>(attr::size).value(); auto grad = grads.at(0); while (grad_sizes.size() > input_sizes.size()) { grad = grad.sum(0, false); @@ -163,6 +174,7 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val } case aten::squeeze: { const auto& sizes = inputs.at(0).sizes(); + // TODO (apaszke): need to select the right overload here if 
(node->hasAttribute(attr::dim)) { int dim = node->i(attr::dim); return {sizes.at(dim) > 1 ? grads.at(0) : grads.at(0).unsqueeze(dim)}; @@ -179,11 +191,12 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val } } case aten::cat: { - int dim = node->i(attr::dim); + int dim = node->get(attr::dim).value(); const auto& first_sizes = inputs.at(0).sizes(); const auto has_first_sizes = [&first_sizes](SymbolicVariable var) { return var.sizes() == first_sizes; }; + // TODO (apaszke): This will need an adjustment for the dim argument // NB: this is a specialization for the common case where all inputs are // of equal sizes. We can use a single split operation to handle that. if (std::all_of(inputs.begin(), inputs.end(), has_first_sizes)) { @@ -339,6 +352,8 @@ static ReverseDetails addReverseInline(Gradient& grad_desc, value_list grad_inputs = linearGradientForNode(node, fmap(node->outputs(), get_grad)); JIT_ASSERT(grad_inputs.size() == node->inputs().size()); for (size_t i = 0, num_inputs = grad_inputs.size(); i < num_inputs; ++i) { + if (!requires_grad(inputs[i])) continue; + JIT_ASSERT(grad_inputs[i]); set_grad(inputs[i], grad_inputs[i]); } } diff --git a/torch/csrc/jit/function_schema.h b/torch/csrc/jit/function_schema.h index 13c81dc296cf5..99222e88af25f 100644 --- a/torch/csrc/jit/function_schema.h +++ b/torch/csrc/jit/function_schema.h @@ -1,6 +1,6 @@ #pragma once #include "ATen/ATen.h" -#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/type.h" namespace torch { namespace jit { diff --git a/torch/csrc/jit/fusion_compiler.cpp b/torch/csrc/jit/fusion_compiler.cpp index 78087f8d3118c..889a8c3fcfacc 100644 --- a/torch/csrc/jit/fusion_compiler.cpp +++ b/torch/csrc/jit/fusion_compiler.cpp @@ -3,6 +3,7 @@ #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/code_template.h" #include "torch/csrc/jit/resource_guard.h" +#include "torch/csrc/jit/tensor_conversions.h" #include "torch/csrc/utils/disallow_copy.h" #include "torch/csrc/variable_tensor_functions.h" @@ -125,9 +126,9 @@ void ${kernelName}(IndexType totalElements, void ** args) { // This snippet enables half support in the jit. Following the pattern for // reductions, fp16 input data is immediately upconverted to float // with __half2float(). All mathematical operations are done on float -// values, and if needed the intermediate float representation is +// values, and if needed the intermediate float representation is // converted to half with __float2half() when writing to a half tensor. 
-constexpr auto half_support_literal = R"( +constexpr auto half_support_literal = R"( #define __HALF_TO_US(var) *(reinterpret_cast(&(var))) #define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) #if defined(__cplusplus) @@ -169,7 +170,7 @@ size_t ${tensor}_dimIndex${d} = ${tensor}_linearIndex ${mod_sizes}; ${tensor}_offset += ${tensor}_dimIndex${d} ${times_stride}; )"); -void emitIndexingFor(std::ostream & out, const std::string & tensor, int ndim, bool last_is_cont) { +static void emitIndexingFor(std::ostream & out, const std::string & tensor, int ndim, bool last_is_cont) { TemplateEnv env; env.s("tensor",tensor); out << format("IndexType ${tensor}_offset = 0;\n",env); @@ -186,14 +187,14 @@ void emitIndexingFor(std::ostream & out, const std::string & tensor, int ndim, b } } -std::string valueName(Value * n) { +static std::string valueName(Value * n) { return "n" + std::to_string(n->unique()); } -std::string scalarValue(const at::Tensor & t) { +static std::string scalarValue(const at::Tensor & t) { auto s = at::Scalar(t); - if (s.isIntegral()){ - return std::to_string(s.toLong()); + if (s.isIntegral()){ + return std::to_string(s.toLong()); } else { std::ostringstream out; out << std::scientific << s.toDouble() << "f"; @@ -201,7 +202,7 @@ std::string scalarValue(const at::Tensor & t) { } } -const char * scalarTypeName(at::ScalarType type) { +static const char * scalarTypeName(at::ScalarType type) { if (type == at::ScalarType::Half) { return "half"; } @@ -291,8 +292,9 @@ std::string encodeRHS(Node * n) { TemplateEnv env; size_t i = 0; for(auto in : n->inputs()) { - env.s(std::to_string(i++),valueName(in)); + env.s(std::to_string(i++), valueName(in)); } + // TODO (apaszke): remove once we get rid of attributes // ops like div have a / b or a / 2 with the constant having the attribute other // so we add other as an input if it is present // 'pow' is the same but uses exponent as the attribute, so we handle that here as well @@ -356,7 +358,7 @@ std::vector emitCompilationUnit(std::ostream & out, } else { auto cat = o->node(); size_t nInputs = cat->inputs().size(); - concat_desc.emplace_back(desc, nInputs, cat->i(attr::dim)); + concat_desc.emplace_back(desc, nInputs, cat->get(attr::dim).value()); for(auto c : cat->inputs()) { emitFormal(c, *concat_desc.back().subtensorDesc); flat_output_nodes.push_back(c); @@ -381,7 +383,7 @@ std::vector emitCompilationUnit(std::ostream & out, } else { env.s("access", format("t${formal}.data[t${formal}_offset]", env)); } - + //TODO: actual type propagation rather than relying on auto.. 
body << format("auto ${node} = ${access};\n",env); } diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index e1b9ac512cdb7..527a53795fb90 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -6,6 +6,7 @@ #include "torch/csrc/jit/python_arg_flatten.h" #include "torch/csrc/jit/export.h" #include "torch/csrc/jit/argument_spec.h" +#include "torch/csrc/jit/passes/remove_expands.h" #include "torch/csrc/jit/passes/graph_fuser.h" #include "torch/csrc/jit/passes/onnx.h" #include "torch/csrc/jit/passes/dead_code_elimination.h" @@ -68,6 +69,7 @@ void initJITBindings(PyObject *module) { auto tensor_inputs = createVariableTensorList(inputs); PropagateInputShapes(graph, ArgumentSpec(with_grad, tensor_inputs)); }) + .def("_jit_pass_remove_expands", RemoveExpands) .def("_jit_pass_erase_number_types", EraseNumberTypes) .def("_jit_pass_loop_unrolling", UnrollLoops) .def("_jit_run_cpp_tests", [] { diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index a340ddec6fc23..39de66680b9ca 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -1,5 +1,7 @@ #include "ir.h" +#include "torch/csrc/jit/tensor_conversions.h" +#include "torch/csrc/jit/operator.h" #include "torch/csrc/autograd/function.h" #include @@ -566,6 +568,149 @@ Value* Value::setUniqueName(const std::string & name) { return this; } +template +Value* Graph::insertConstant(T value) { + Node *n = create(prim::Constant); + insertNode(n); + auto t_value = as_tensor(value); + n->t_(attr::value, t_value.clone()); + n->output()->inferTypeFrom(t_value); + return n->output(); +} + +// This is necessary, because integral literals are of type int by default, +// and will dispatch to this function. +template<> +Value * Graph::insertConstant(int value) { + return insertConstant(static_cast(value)); +} + +template Value* Graph::insertConstant(int64_t value); +template Value* Graph::insertConstant(double value); +template Value* Graph::insertConstant(at::Tensor value); +template Value* Graph::insertConstant(at::IntList value); +template Value* Graph::insertConstant(at::Scalar value); + +namespace { + +// Of course any sane person would define this thing as a templated function, but +// it so happens that clang 3.8 has a pretty annoying bug which makes it complain that +// specializations are redefinitions of themselves, and so here we are. +template +struct getattr {}; + +template<> +struct getattr { + int64_t operator()(Node *n, Symbol name) { + return n->i(name); + } +}; + +template<> +struct getattr { + double operator()(Node *n, Symbol name) { + return n->f(name); + } +}; + +template<> +struct getattr { + at::Tensor operator()(Node *n, Symbol name) { + return n->t(name); + } +}; + +template<> +struct getattr> { + std::vector operator()(Node *n, Symbol name) { + return n->is(name); + } +}; + +} // anonymous namespace + +template +at::optional Node::get(Symbol name) { + // TODO (apaszke): remove. this is in here for now just so that we can ensure + // we always use this in places where the node has a valid schema already + // (will make next commits easier). + if (!schema_) findSchema(); + // TODO (apaszke): remove once tracer and compiler stop emitting attributes + if (hasAttributes()) { + // If it has an attribute, then it is a constant. If it's missing, it means we're + // doing an invalid lookup and it should throw anyway. 
+ return getattr()(this, name); + } + auto inp = findInput(name); + Node *producer = inp.first->node(); + if (producer->kind() != prim::Constant) return at::nullopt; + auto value = producer->t(attr::value); + return tensor_as(std::move(value)); +} + +template at::optional Node::get(Symbol name); +template at::optional Node::get(Symbol name); +template at::optional Node::get(Symbol name); +template at::optional> Node::get(Symbol name); + +at::optional Node::get(Symbol name) { + // TODO (apaszke): remove once tracer and compiler stop emitting attributes + if (hasAttributes()) { + throw std::runtime_error("IValue Node::get() not implemented for the attribute case"); + } + auto inp = findInput(name); + Node * producer = inp.first->node(); + if (producer->kind() != prim::Constant) return at::nullopt; + auto value = producer->t(attr::value); + const Argument & arg = inp.second; + if (arg.type->isSubtypeOf(*DynamicType::get())) { + return IValue{std::move(value)}; + } else if (arg.type->isSubtypeOf(*IntType::get())) { + return IValue{tensor_as(std::move(value))}; + } else if (arg.type->isSubtypeOf(*FloatType::get())) { + return IValue{tensor_as(std::move(value))}; + } + throw std::runtime_error("Unsupported case in Node::get! File a bug report."); +} + +Value* Node::getValue(Symbol name) { + // TODO (apaszke): remove once tracer and compiler stop emitting attributes + if (hasAttribute(name)) { + switch (kindOf(name)) { + case AttributeKind::i: + return owningGraph()->insertConstant(i(name)); + case AttributeKind::is: + return owningGraph()->insertConstant(is(name)); + case AttributeKind::t: + return owningGraph()->insertConstant(t(name)); + default: + throw std::runtime_error("getValue() NYI"); + } + } + return findInput(name).first; +} + +std::pair Node::findInput(Symbol name) { + if (!schema_) { + findSchema(); + } + auto name_str = name.toUnqualString(); + size_t input_i = 0; + for (size_t i = 0; i < schema_->arguments.size(); ++i) { + const auto & arg = schema_->arguments[i]; + if (hasAttributeS(arg.name)) continue; + if (arg.name == name_str) { + return std::pair(input(input_i), arg); + } + input_i++; + } + throw std::runtime_error(std::string("Couldn't find an argument called ") + name.toQualString()); +} + +void Node::findSchema() { + schema_ = &getOperatorFor(this).schema; +} + PythonOp* defaultAllocPythonOp(Graph*g) { throw std::runtime_error("Trying to allocate a Python object without python bindings loaded"); } diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 2b555029f16b5..3595b83aefda6 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -6,6 +6,8 @@ #include "torch/csrc/jit/interned_strings.h" #include "torch/csrc/jit/resource_guard.h" #include "torch/csrc/jit/source_location.h" +#include "torch/csrc/jit/function_schema.h" +#include "torch/csrc/jit/ivalue.h" #include "torch/csrc/jit/type.h" #include "torch/csrc/utils/disallow_copy.h" @@ -176,11 +178,7 @@ struct Value { std::string unique_name_; TypePtr type_; public: - Value* setType(const TypePtr type) { - JIT_ASSERT(type); - type_ = type; - return this; - } + Value* setType(const TypePtr type); void inferTypeFrom(const at::Tensor& output) { setType(std::make_shared(output)); } @@ -287,6 +285,11 @@ struct Node : public Attributes { std::shared_ptr source_location_; size_t stage_; Scope* scope_; + // Assumes FunctionSchemas are persistent, so we don't manage their lifetime. 
+ // This field is effective a cache that's populated on attribute lookups and + // invalidated every time we perform an operation that could potentially change + // the schema. + const FunctionSchema* schema_; protected: Node(Graph * graph_, NodeKind kind_); //defined after graph public: @@ -370,7 +373,7 @@ struct Node : public Attributes { outputs()[i]->replaceAllUsesWith(n->outputs()[i]); } } - // lots of things like chunk have a single input or singel output, so we have a + // lots of things like chunk have a single input or single output, so we have a // helper to make accessing it easier Value * input() { JIT_ASSERT(inputs_.size() == 1); @@ -392,6 +395,11 @@ struct Node : public Attributes { return inputs_.at(i); } + template + at::optional get(Symbol name); + at::optional get(Symbol name); + Value* getValue(Symbol name); + // Graphs // Note [Topological invariant] @@ -414,6 +422,7 @@ struct Node : public Attributes { // Result: %3 = f(%1, %2, %4) Value* addInput(Value * node) { JIT_ASSERT(graph_ == node->owningGraph()); + schema_ = nullptr; node->uses_.emplace_back(this, inputs_.size()); inputs_.push_back(node); return node; @@ -423,6 +432,7 @@ struct Node : public Attributes { // arguments. Returns the added node for ease of chaining. Value* insertInput(size_t i, Value* node) { JIT_ASSERT(graph_ == node->owningGraph()); + schema_ = nullptr; // First we update the offsets for all existing inputs that will reside // after the one we're inserting. Concretely, these are the inputs at // indices [i, # input). Since we're inserting one input before all of @@ -447,6 +457,7 @@ struct Node : public Attributes { // Result: %3 = f(%1, %4) Value * replaceInput(size_t i, Value * newValue) { JIT_ASSERT(newValue->owningGraph() == graph_); + schema_ = nullptr; Value * old = dropInput(i); inputs_[i] = newValue; newValue->uses_.emplace_back(this, i); @@ -462,6 +473,7 @@ struct Node : public Attributes { void replaceInputWith(Value * from, Value * to) { JIT_ASSERT(from->owningGraph() == graph_); JIT_ASSERT(to->owningGraph() == graph_); + schema_ = nullptr; size_t i = 0; for(auto input : inputs()) { if(input == from) @@ -472,10 +484,12 @@ struct Node : public Attributes { Value* addOutput() { outputs_.push_back(new Value(this, outputs_.size())); + schema_ = nullptr; return outputs_.back(); } Value* insertOutput(size_t i) { + schema_ = nullptr; outputs_.insert(outputs_.begin() + i, new Value(this, i)); for (size_t itr = i + 1; itr < outputs_.size(); ++itr) { outputs_[itr]->setOffset(outputs_[itr]->offset() + 1); @@ -585,6 +599,7 @@ struct Node : public Attributes { // Execute: %3.removeInput(1) // Result: %3 = f(%1) void removeInput(size_t i) { + schema_ = nullptr; dropInput(i); // everything after this input shifts left, // so we need to update their use offsets to match @@ -601,6 +616,7 @@ struct Node : public Attributes { // Execute: %3.removeAllInputs() // Result: %3 = f() void removeAllInputs() { + schema_ = nullptr; for(size_t i = 0; i < inputs().size(); ++i) dropInput(i); inputs_.clear(); @@ -643,6 +659,8 @@ struct Node : public Attributes { virtual ~Node() {} private: + std::pair findInput(Symbol name); + void findSchema(); // Lookup iterator in use list of _input i_ that corresponds to its use of _this_ use_list::iterator findUseForInput(size_t i) { auto & input_uses = inputs_[i]->uses_; @@ -996,6 +1014,9 @@ friend struct Block; return r; } + template + Value * insertConstant(T value); + Node * appendNode(Node * n) { return block_->appendNode(n); } @@ -1115,6 +1136,15 @@ inline 
Value::Value(Node * node_, size_t offset_) node_->graph_->all_values.emplace(this); } +inline Value* Value::setType(const TypePtr type) { + JIT_ASSERT(type); + type_ = type; + for (Use & use : uses_) { + use.user->schema_ = nullptr; + } + return this; +} + inline Graph * Value::owningGraph() { return node()->owningGraph(); } @@ -1142,13 +1172,15 @@ inline Node::Node(Graph * graph_, NodeKind kind_) : graph_(graph_), owning_block_(nullptr), stage_(graph_->new_node_stage_), - scope_(graph_->current_scope_) { + scope_(graph_->current_scope_), + schema_(nullptr) { graph_->all_nodes.emplace(this); } inline void Node::eraseOutput(size_t i) { JIT_ASSERT(i < outputs_.size()); JIT_ASSERT(outputs_[i]->uses().size() == 0); + schema_ = nullptr; Value * n = outputs_[i]; outputs_.erase(outputs_.begin() + i); owningGraph()->freeValue(n); @@ -1158,12 +1190,14 @@ inline void Node::eraseOutput(size_t i) { } inline Block * Node::addBlock() { + schema_ = nullptr; blocks_.push_back(new Block(owningGraph(), this)); return blocks_.back(); } inline void Node::eraseBlock(size_t i) { JIT_ASSERT(i < blocks_.size()); + schema_ = nullptr; Block * n = blocks_[i]; blocks_.erase(blocks_.begin() + i); n->destroy(); diff --git a/torch/csrc/jit/passes/batch_mm.cpp b/torch/csrc/jit/passes/batch_mm.cpp index 15926fdab850a..0d182bd8fd37c 100644 --- a/torch/csrc/jit/passes/batch_mm.cpp +++ b/torch/csrc/jit/passes/batch_mm.cpp @@ -146,7 +146,9 @@ void BatchMMBlock(Block* block) { // Look for trees in the block std::unordered_map tokens; for (auto node : block->nodes()) { - if (node->kind() == aten::mm) { + if (node->kind() == aten::mm && + node->input(0)->type()->cast() && + node->input(1)->type()->cast()) { tokens[node] = TreeToken::fromMM(node); } else if (node->kind() == aten::add) { // NOTE: x + 2 is add[other={2}](%x) @@ -187,9 +189,9 @@ void BatchMMBlock(Block* block) { cat_sizes[cat_dim] *= matmuls.size(); // make them really cat_sizes auto inputs = fmap(matmuls, [=](Node *mm) { return mm->inputs()[inputs_off]; }); - Node *cat = graph->create(aten::cat, inputs) - ->i_(attr::dim, cat_dim); - cat->insertBefore(root.node); + WithInsertPoint iguard { root.node }; + inputs.push_back(graph->insertConstant(cat_dim)); + Node *cat = graph->insertNode(graph->create(aten::cat, inputs)); cat->output()->setType(type->withSizes(cat_sizes)); return cat->output(); }; diff --git a/torch/csrc/jit/passes/common_subexpression_elimination.cpp b/torch/csrc/jit/passes/common_subexpression_elimination.cpp index f6164024f2365..e05f986e2a8bd 100644 --- a/torch/csrc/jit/passes/common_subexpression_elimination.cpp +++ b/torch/csrc/jit/passes/common_subexpression_elimination.cpp @@ -87,21 +87,15 @@ struct EqualNodeCSE { if (lhs == nullptr && rhs == nullptr) return true; if (lhs == nullptr || rhs == nullptr) return false; - // Check whether two nodes are the same kind. if (lhs->kind() != rhs->kind()) return false; - - // Check the stage. if (lhs->stage() != rhs->stage()) return false; // Check whether the inputs are the same. auto lhs_inputs = lhs->inputs(); auto rhs_inputs = rhs->inputs(); - if (lhs_inputs.size() != rhs_inputs.size()) return false; - if (!std::equal(lhs_inputs.begin(), lhs_inputs.end(), rhs_inputs.begin())) return false; - // Check the attributes. 
if (!attributesEqualCSE(lhs, rhs)) return false; return true; @@ -117,7 +111,7 @@ void EliminateCommonSubexpression(Block * block) { for (auto it = block->nodes().begin(); it != block->nodes().end(); ++ it) { auto node = *it; if (node->kind() == prim::PythonOp - || node->kind() == prim::Eval + || node->kind() == prim::Print || node->blocks().size() > 0 ) { // Do NOT have enough information to do CSE on these nodes. diff --git a/torch/csrc/jit/passes/decompose_addmm.cpp b/torch/csrc/jit/passes/decompose_addmm.cpp index de9a3f666b7f7..85ad0770df77b 100644 --- a/torch/csrc/jit/passes/decompose_addmm.cpp +++ b/torch/csrc/jit/passes/decompose_addmm.cpp @@ -1,5 +1,6 @@ #include "torch/csrc/jit/passes/dead_code_elimination.h" #include "torch/csrc/jit/symbolic_variable.h" +#include "torch/csrc/jit/tensor_conversions.h" namespace torch { namespace jit { @@ -14,12 +15,10 @@ static void DecomposeAddmm(Block* block) { // shape analysis and differentiation passes for those two individual ops. // Later, we will fuse together those two ops into a single addmm. if (it->kind() == aten::addmm && it->inputs().size() == 3) { - auto alpha = at::Scalar(it->t(attr::alpha)); - auto beta = at::Scalar(it->t(attr::beta)); - - if (alpha.to() != 1.0 || beta.to() != 1.0) { - continue; - } + auto alpha = it->get(attr::alpha); + auto beta = it->get(attr::beta); + if (!alpha || !beta) continue; + if (tensor_as(*alpha) != 1.0 || tensor_as(*beta) != 1.0) continue; WithInsertPoint guard(*it); diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index 712c88d0548b8..6b98e7f1a4695 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -5,7 +5,7 @@ #ifdef USE_CUDA #include "cuda.h" // for CUDA_VERSION -#endif +#endif namespace torch { namespace jit { @@ -136,13 +136,13 @@ struct GraphFuser { #ifdef USE_CUDA // Checks for half tensor on GPU // const auto device = tt->device(); - if (tt->device() != kCPUDevice + if (tt->device() != kCPUDevice && CUDA_VERSION >= 9 && tt->scalarType() == at::ScalarType::Half) { return true; } - #endif - } + #endif + } return false; } @@ -204,8 +204,9 @@ struct GraphFuser { if(isFusable(node)) return true; // this concat fusion only works when all the inputs are the same size + // and we can statically infer the dimension along which we should concat // otherwise they cannot partipate in the same map - if(node->kind() == aten::cat && allOutputsHaveSameSize(node)) + if(node->kind() == aten::cat && node->get(attr::dim) && allOutputsHaveSameSize(node)) return true; return false; diff --git a/torch/csrc/jit/passes/peephole.cpp b/torch/csrc/jit/passes/peephole.cpp index 4d997bb8017a0..5fd438280a35e 100644 --- a/torch/csrc/jit/passes/peephole.cpp +++ b/torch/csrc/jit/passes/peephole.cpp @@ -1,5 +1,7 @@ #include "torch/csrc/jit/passes/peephole.h" + #include "torch/csrc/jit/symbolic_variable.h" +#include "torch/csrc/jit/tensor_conversions.h" #include "torch/csrc/jit/passes/dead_code_elimination.h" namespace torch { namespace jit { @@ -28,7 +30,7 @@ void PeepholeOptimize(Block * block) { if (!n->input()->isTensor()) break; // the sizes are dynamic if(n->inputs().size() != 1) break; - if (n->is(attr::size) == n->input()->type()->expect()->sizes()) { + if (n->get>(attr::size) == n->input()->type()->expect()->sizes()) { n->output()->replaceAllUsesWith(n->input()); // Let DCE clean up any unused nodes at this point } @@ -56,35 +58,24 @@ void PeepholeOptimize(Block * block) { } } } break; - // Fuse mm + add into addmm case aten::add: 
{ - // Must have two inputs - if (n->inputs().size() != 2) { - continue; - } - // Alpha parameter must be 1.0 - auto alpha = at::Scalar(it->t(attr::alpha)); - if (alpha.to() != 1.0) { - continue; - } + // mm + add == addmm + if (n->inputs().size() == 2 && + n->get(attr::alpha) && + tensor_as(*n->get(attr::alpha)) == 1. && + n->input(1)->node()->kind() == aten::mm) { + WithInsertPoint guard(n); - auto input_node = n->input(1)->node(); - // Input must be an mm node - if (input_node->kind() != aten::mm) { - continue; - } + auto input_node = n->input(1)->node(); + SymbolicVariable mat(n->input(0)); + SymbolicVariable mat1(input_node->input(0)); + SymbolicVariable mat2(input_node->input(1)); + SymbolicVariable addmm_value = mat.addmm(mat1, mat2); - WithInsertPoint guard(n); - - SymbolicVariable mat(n->input(0)); - SymbolicVariable mat1(input_node->input(0)); - SymbolicVariable mat2(input_node->input(1)); - SymbolicVariable addmm_value = mat.addmm(mat1, mat2); - - // Copy shape information from output node - ((Value*)addmm_value)->copyMetadata(n->output()); - n->output()->replaceAllUsesWith(addmm_value); - // Let DCE clean up any unused nodes at this point + // Copy shape information from output node + ((Value*)addmm_value)->copyMetadata(n->output()); + n->output()->replaceAllUsesWith(addmm_value); + } } break; } } diff --git a/torch/csrc/jit/passes/remove_expands.cpp b/torch/csrc/jit/passes/remove_expands.cpp index 25286e9ed8f9a..8c679db8c2e52 100644 --- a/torch/csrc/jit/passes/remove_expands.cpp +++ b/torch/csrc/jit/passes/remove_expands.cpp @@ -7,7 +7,7 @@ static void RemoveExpands(Block* block) { ++it) { for (auto sub : it->blocks()) RemoveExpands(sub); - if (it->kind() == aten::expand && it->hasAttribute(attr::implicit) && it->i(attr::implicit)) { + if (it->kind() == aten::expand && it->get(attr::implicit) != static_cast(0)) { it->output()->replaceAllUsesWith(it->input()); it.destroyCurrent(); } diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index f8239c5a6457c..5fe1358a0010b 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -150,6 +150,7 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { } std::vector types; bool present; + // TODO (apaszke): This currently assumes that inputs are pretty much tensor-only. Need to fix that. std::tie(types, present) = gatherTypes(node->inputs()); if(!present) { return setDynamicType(node); @@ -171,10 +172,12 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { return true; }; + // TODO (apaszke): Those currently assume that a lot of stuff is constant, which might + // not be the case if they are produced by script. switch(node->kind()) { - //TODO: for expensive ops we can directly encode their shape propagation + // For expensive ops we can directly encode their shape propagation // here, otherwise we fallback to running a fake version of the op - // to get a quick and dirty propagation + // to get a quick and dirty propagation. 
case aten::add: case aten::sub: case aten::mul: @@ -236,12 +239,16 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { {AKind::i, attr::keepdim}})) { auto tp = types.at(0); auto sizes = tp->sizes(); - int64_t dim = node->is(attr::dim).at(0); - SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); - if (node->i(attr::keepdim)) { - sizes.at(dim) = 1; - } else { - sizes.erase(sizes.begin() + dim); + auto dims = node->get>(attr::dim).value(); + bool keepdim = node->get(attr::keepdim).value(); + std::reverse(dims.begin(), dims.end()); + for (int64_t dim : dims) { + SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); + if (keepdim) { + sizes.at(dim) = 1; + } else { + sizes.erase(sizes.begin() + dim); + } } node->output()->setType(tp->withSizes(sizes)); } else if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, {})) { @@ -254,7 +261,7 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { auto tp = types.at(0); auto sizes = tp->sizes(); auto strides = tp->strides(); - int64_t dim = node->i(attr::dim); + int64_t dim = node->get(attr::dim).value(); SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); if (sizes.at(dim) == 1) { sizes.erase(sizes.begin() + dim); @@ -269,7 +276,7 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { auto tp = types.at(0); auto sizes = tp->sizes(); auto strides = tp->strides(); - int64_t dim = node->i(attr::dim); + int64_t dim = node->get(attr::dim).value(); SHAPE_ASSERT(dim >= 0 && static_cast(dim) <= sizes.size()); sizes.insert(sizes.begin() + dim, 1); strides.insert(strides.begin() + dim, 1); @@ -279,7 +286,7 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { case aten::view: { if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, {{AKind::is, attr::size}})) { - auto sizes = node->is(attr::size); + auto sizes = node->get>(attr::size).value(); bool inferred = false; size_t inferred_idx; int64_t size_product = 1; @@ -314,7 +321,7 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { {{AKind::i, attr::dim}})) { auto ten = types.at(0); auto index = types.at(1); - int64_t dim = node->i(attr::dim); + int64_t dim = node->get(attr::dim).value(); SHAPE_ASSERT(index->sizes().size() == 1); SHAPE_ASSERT(dim >= 0 && static_cast(dim) < ten->sizes().size()); std::vector sizes = ten->sizes(); diff --git a/torch/csrc/jit/symbolic_variable.h b/torch/csrc/jit/symbolic_variable.h index dfbeb3361b3d6..12417390478a4 100644 --- a/torch/csrc/jit/symbolic_variable.h +++ b/torch/csrc/jit/symbolic_variable.h @@ -52,6 +52,11 @@ struct SymbolicVariable { return (int64_t) i == s.toLong(); } } + // TODO (apaszke): Use this instead of attribute setters + template + SymbolicVariable insertConstant(T value) const { + return v->owningGraph()->insertConstant(std::move(value)); + } SymbolicVariable operator*(const SymbolicVariable rhs) const { return create(aten::mul, {*this, rhs})[0].typeLike(*this); } @@ -165,6 +170,13 @@ struct SymbolicVariable { ->i_(a("length"), length); return r; } + static SymbolicVariable cat(ArrayRef inputs, Value* dim) { + Node* n; + std::vector all_inputs = inputs; + all_inputs.push_back(dim); + auto r = create(aten::cat, all_inputs, 1, &n)[0]; + return r; + } static SymbolicVariable cat(ArrayRef inputs, int32_t dim) { Node* n; auto r = create(aten::cat, inputs, 1, &n)[0]; @@ -188,6 +200,11 @@ struct SymbolicVariable { ->i_(a("keepdim"), keepdim); return r; } + SymbolicVariable squeeze(Value* dim) const { + Node * n; + auto r = create(t("squeeze"), {*this, dim}, 1, &n)[0]; + return r; + } 
SymbolicVariable squeeze(int dim) const { Node * n; auto r = create(t("squeeze"), {*this}, 1, &n)[0]; diff --git a/torch/csrc/jit/tensor_conversions.h b/torch/csrc/jit/tensor_conversions.h index 84162a445a910..36bfda79056a4 100644 --- a/torch/csrc/jit/tensor_conversions.h +++ b/torch/csrc/jit/tensor_conversions.h @@ -15,7 +15,7 @@ struct tensor_conversion_error : public std::runtime_error { }; template -inline T tensor_as(at::Tensor&& t); +inline T tensor_as(at::Tensor t); namespace detail { @@ -79,7 +79,7 @@ struct tensor_as_impl { } template -inline T tensor_as(at::Tensor&& t) { +inline T tensor_as(at::Tensor t) { return detail::tensor_as_impl()(std::move(t)); } @@ -109,6 +109,10 @@ inline at::Tensor as_tensor(const at::Scalar& s) { return s.toTensor(); } +inline at::Tensor as_tensor(at::Tensor t) { + return t; +} + template inline at::Tensor as_tensor(std::array&& bools) { auto r = at::empty({N}, at::kByte); diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index a86059f300495..e93b3b1aeadec 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -62,10 +62,9 @@ autograd::Variable getSizeOf(const autograd::Variable& var, int64_t dim) { auto size_var = autograd::make_variable(at::Scalar(var.size(dim)).toTensor()); auto* value = getValueTrace(var); - auto* node = graph->create(aten::size, {value}) - ->i_(attr::dim, dim); + WithInsertPoint ipoint { graph->block() }; + auto* node = graph->insertNode(graph->create(aten::size, {value, graph->insertConstant(dim)})); node->output()->inferTypeFrom(size_var); - graph->appendNode(node); setValueTrace(size_var, node->output()); return size_var; diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index a88739c1cc490..0b72453cdc1e2 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -259,7 +259,9 @@ def embedding_bag(g, def size(g, self, dim): if _is_value(dim): - raise RuntimeError("ONNX export only supports constant dim values in .size()") + if dim.node().kind() != 'onnx::Constant': + raise RuntimeError("ONNX export only supports constant dim values in .size()") + dim = int(dim.node().t('value')) full_shape = g.op("Shape", self) return select(g, full_shape, dim=0, index=dim) From a3a6ab60cdb17845a4f46d56451638277559cc6a Mon Sep 17 00:00:00 2001 From: Zhaoheng Ni Date: Fri, 20 Jul 2018 10:59:22 -0700 Subject: [PATCH 06/23] Fix the error in UnpackSegmentsOp when calculating the gradient with "max_length" argument (#9598) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9598 The "max_length" should be passed to UnPackSegmentsOp if "max_length" is given when calling PackSegmentsOp. 
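
A minimal sketch of the intended usage, mirroring the updated pack_ops_test below (blob names come from that test; the shapes and values here are only illustrative): the same `max_length` must be forwarded to UnpackSegments, since it now enforces that the argument matches the packed dimension.

```
from caffe2.python import core, workspace
import numpy as np

# Illustrative data: two segments of lengths 2 and 4, packed to max_length 5.
workspace.FeedBlob('d', np.random.rand(6, 2).astype(np.float32))
workspace.FeedBlob('l', np.array([2, 4], dtype=np.int32))

max_length = 5
workspace.RunOperatorOnce(core.CreateOperator(
    'PackSegments', ['l', 'd'], ['t'], max_length=max_length))
# max_length has to be passed here as well, matching t.shape[1].
workspace.RunOperatorOnce(core.CreateOperator(
    'UnpackSegments', ['l', 't'], ['newd'], max_length=max_length))

assert workspace.FetchBlob('t').shape[1] == max_length
assert (workspace.FetchBlob('newd') == workspace.FetchBlob('d')).all()
```
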
Reviewed By: jerryzh168 Differential Revision: D8919799 fbshipit-source-id: 8c97aa717b69177b8a5d5d56892817d488853840 --- caffe2/operators/pack_segments.cc | 11 +++++++++-- caffe2/operators/pack_segments.cu | 7 ++++++- caffe2/operators/pack_segments.h | 7 +++++-- caffe2/python/operator_test/pack_ops_test.py | 1 + 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/caffe2/operators/pack_segments.cc b/caffe2/operators/pack_segments.cc index 7fcbf917e28fa..2c2f3fdc4fafc 100644 --- a/caffe2/operators/pack_segments.cc +++ b/caffe2/operators/pack_segments.cc @@ -119,7 +119,12 @@ bool UnpackSegmentsOp::DoRunWithType2() { CAFFE_ENFORCE_GE(data.ndim(), 2, "DATA should be at least 2-D"); CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTH should be 1-D"); - + if (max_length_ != -1) { + CAFFE_ENFORCE_EQ( + max_length_, + data.dim(1), + "max_length should be equal to the second dimension of the packed segments"); + } const T* l = lengths.template data(); TIndex total_l = std::accumulate(l, l + lengths.dim(0), (TIndex)0); @@ -175,6 +180,7 @@ OPERATOR_SCHEMA(PackSegments) "presence_mask", "2 dim boolean tensor" ", false where packed_tensor is padded, true otherwise.") + .Arg("max_length", "The pre-defined max_length for the packed segments") .Arg( "pad_minf", "Padding number in the packed segments. Use true to pad \ @@ -191,7 +197,8 @@ OPERATOR_SCHEMA(UnpackSegments) "lengths", "1-d int/long tensor contains the length in each of the input.") .Input(1, "tensor", "N+1 dim Tensor.") - .Output(0, "packed_tensor", "N dim Tensor"); + .Output(0, "packed_tensor", "N dim Tensor") + .Arg("max_length", "The pre-defined max_length for the packed segments"); class GetPackSegmentsGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; diff --git a/caffe2/operators/pack_segments.cu b/caffe2/operators/pack_segments.cu index 374266cc66f52..8e4bdc49339c1 100644 --- a/caffe2/operators/pack_segments.cu +++ b/caffe2/operators/pack_segments.cu @@ -256,7 +256,12 @@ bool UnpackSegmentsOp::DoRunWithType2() { CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTH should be 1-D"); - + if (max_length_ != -1) { + CAFFE_ENFORCE_EQ( + max_length_, + data.dim(1), + "max_length should be equal to the packed segments"); + } // Compute prefix sum over the lengths array_prefix_sum_exclusive( lengths_ptr, num_seq, dev_buffer_, dev_lengths_prefix_sum_, context_); diff --git a/caffe2/operators/pack_segments.h b/caffe2/operators/pack_segments.h index 091d7852a013a..c35299d66d1a2 100644 --- a/caffe2/operators/pack_segments.h +++ b/caffe2/operators/pack_segments.h @@ -16,7 +16,6 @@ template class PackSegmentsOp final : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; - // USE_SIMPLE_CTOR_DTOR(PackSegmentsOp) USE_DISPATCH_HELPER; PackSegmentsOp(const OperatorDef& operator_def, Workspace* ws) @@ -62,9 +61,12 @@ template class UnpackSegmentsOp final : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; - USE_SIMPLE_CTOR_DTOR(UnpackSegmentsOp) USE_DISPATCH_HELPER; + UnpackSegmentsOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + max_length_(OperatorBase::GetSingleArgument("max_length", -1)) {} + bool RunOnDevice() override { return DispatchHelper>::call(this, Input(LENGTHS)); } @@ -78,6 +80,7 @@ class UnpackSegmentsOp final : public Operator { INPUT_TAGS(LENGTHS, DATA); private: + TIndex max_length_; Tensor dev_buffer_; Tensor dev_lengths_prefix_sum_; Tensor dev_max_length_; diff --git 
a/caffe2/python/operator_test/pack_ops_test.py b/caffe2/python/operator_test/pack_ops_test.py index 97b2eaf99dd71..3935ca8c8f17a 100644 --- a/caffe2/python/operator_test/pack_ops_test.py +++ b/caffe2/python/operator_test/pack_ops_test.py @@ -101,6 +101,7 @@ def test_pack_with_max_length_ops(self, num_seq, cell_size, gc, dc): 'UnpackSegments', ['l', 't'], ['newd'], + max_length=max_length, device_option=gc)) assert(workspace.FetchBlob('t').shape[1] == max_length) assert((workspace.FetchBlob('newd') == workspace.FetchBlob('d')).all()) From d3688861ec8607af0e7bf4119911053f13aee500 Mon Sep 17 00:00:00 2001 From: vmirly Date: Fri, 20 Jul 2018 11:13:07 -0700 Subject: [PATCH 07/23] Fixed a missing '=' in LPPoolNd repr function (#9629) Summary: In the repr funciton of LPPoolNd(..) class, there was a missing '='. (`kernel_size{kernel_size}`) Link to line in the code: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/pooling.py#L694 Original: return 'norm_type={norm_type}, kernel_size{kernel_size}, stride={stride}, ' \ 'ceil_mode={ceil_mode}'.format(**self.__dict__) Fixed: return 'norm_type={norm_type}, kernel_size={kernel_size}, stride={stride}, ' \ 'ceil_mode={ceil_mode}'.format(**self.__dict__) Pull Request resolved: https://github.com/pytorch/pytorch/pull/9629 Differential Revision: D8932913 Pulled By: soumith fbshipit-source-id: 9030dff6b14659b5c7b6992d87ef53ec8891f674 --- torch/nn/modules/pooling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py index 1f00b1cb10977..75ab843e5413b 100644 --- a/torch/nn/modules/pooling.py +++ b/torch/nn/modules/pooling.py @@ -691,7 +691,7 @@ def __init__(self, norm_type, kernel_size, stride=None, ceil_mode=False): self.ceil_mode = ceil_mode def extra_repr(self): - return 'norm_type={norm_type}, kernel_size{kernel_size}, stride={stride}, ' \ + return 'norm_type={norm_type}, kernel_size={kernel_size}, stride={stride}, ' \ 'ceil_mode={ceil_mode}'.format(**self.__dict__) From bae156a481351a7f2d0f5f2dbced5dfab2c58f19 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Fri, 20 Jul 2018 11:27:59 -0700 Subject: [PATCH 08/23] Support (some) CUDA Lapack on n-dimensional empty tensors. 
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9631 Reviewed By: ezyang Differential Revision: D8933202 Pulled By: gchanan fbshipit-source-id: 1ade4ca439bf26aa921df1da83a827d860f8f48f --- aten/src/THC/generic/THCTensorMathMagma.cu | 92 ++++++++++++---------- test/test_torch.py | 4 +- 2 files changed, 50 insertions(+), 46 deletions(-) diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index 0de79233c122b..c942a17c4c0df 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -26,7 +26,7 @@ static void THCTensor_(copyArray2d)(THCState *state, THCTensor *self, real *src, static void THCTensor_(copyTensor2d)(THCState *state, real *dst, THCTensor *self) { - THAssert(self->_dim() == 2); + THAssert(self->dim() == 2); size_t len = THCTensor_(nElement)(state, self)*sizeof(real); THCTensor *temp = THCTensor_(newTranspose)(state, self, 0, 1); THCTensor *selfc = THCTensor_(newContiguous)(state, temp); @@ -39,7 +39,7 @@ static void THCTensor_(copyTensor2d)(THCState *state, real *dst, THCTensor *self static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, THCTensor *src) { - THAssert(src->_dim() == 2); + THAssert(src->dim() == 2); if (self == src && self->stride(0) == 1 && self->stride(1) == self->size(0)) { THCTensor_(retain)(state, self); @@ -194,46 +194,47 @@ THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, T THCTensor *input = THCTensor_(newColumnMajor)(state, rv_, a); real *input_data = THCTensor_(data)(state, input); - // eigen values and workspace - real *w = th_magma_malloc_pinned(n); - real *wA = th_magma_malloc_pinned(lda * n); + if (n > 0) { + // eigen values and workspace + real *w = th_magma_malloc_pinned(n); + real *wA = th_magma_malloc_pinned(lda * n); - // compute optimal size of work array - int info; - real lwork; - int liwork; + // compute optimal size of work array + int info; + real lwork; + int liwork; #if defined(THC_REAL_IS_FLOAT) - magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info); + magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info); #else - magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info); + magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info); #endif - real *work = th_magma_malloc_pinned((size_t)lwork); - int *iwork = th_magma_malloc_pinned(liwork); + real *work = th_magma_malloc_pinned((size_t)lwork); + int *iwork = th_magma_malloc_pinned(liwork); // compute eigenvalues and, optionally, eigenvectors #if defined(THC_REAL_IS_FLOAT) - magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info); + magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info); #else - magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info); + magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info); #endif - // copy eigen values from w to re_ - if (info == 0) - THCTensor_(copyArray1d)(state, re_, w, n); - - magma_free_pinned(iwork); - magma_free_pinned(work); - magma_free_pinned(wA); - magma_free_pinned(w); + // copy eigen values from w to re_ + if (info == 0) + THCTensor_(copyArray1d)(state, re_, w, n); - // check error value - if (info > 0) - THError("MAGMA syev : Failed to converge. 
%d off-diagonal elements of an didn't converge to zero", info); - else if (info < 0) - THError("MAGMA syev : Argument %d : illegal value", -info); + magma_free_pinned(iwork); + magma_free_pinned(work); + magma_free_pinned(wA); + magma_free_pinned(w); + // check error value + if (info > 0) + THError("MAGMA syev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info); + else if (info < 0) + THError("MAGMA syev : Argument %d : illegal value", -info); + } THCTensor_(freeCopyTo)(state, input, rv_); #else THError(NoMagma(syev)); @@ -243,7 +244,7 @@ THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, T THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvrs) { #ifdef USE_MAGMA - THArgCheck(!a_->is_empty() && a_->dim() == 2, 3, "A should be (non-empty) 2 dimensional"); + THArgCheck(a_->dim() == 2, 3, "A should be 2 dimensional"); THArgCheck(a_->size(0) == a_->size(1), 3, "A should be square"); magma_vec_t jobvr = jobvrs[0] == 'N' ? MagmaNoVec : MagmaVec; @@ -263,34 +264,39 @@ THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, T ldvr = n; } - real wkopt; - int info; + real *work_data = nullptr; + if (n > 0) { + int info; + real wkopt; #if defined(THC_REAL_IS_FLOAT) - magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info); + magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info); #else - magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info); + magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info); #endif - int lwork = (int) wkopt; - real *work_data = th_magma_malloc_pinned(lwork); + int lwork = (int) wkopt; + work_data = th_magma_malloc_pinned(lwork); #if defined(THC_REAL_IS_FLOAT) - magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info); + magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info); #else - magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info); + magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info); #endif - if (info > 0) - THError("MAGMA geev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info); - else if (info < 0) - THError("MAGMA geev : Argument %d : illegal value", -info); + if (info > 0) + THError("MAGMA geev : Failed to converge. 
%d off-diagonal elements of an didn't converge to zero", info); + else if (info < 0) + THError("MAGMA geev : Argument %d : illegal value", -info); + } { THCTensor_(resize2d)(state, re_, 2, n); THCTensor *re = THCTensor_(newContiguous)(state, re_); - THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset, wr, n*sizeof(real), cudaMemcpyHostToDevice)); - THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset + n, wi, n*sizeof(real), cudaMemcpyHostToDevice)); + if (n > 0) { + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset, wr, n*sizeof(real), cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset + n, wi, n*sizeof(real), cudaMemcpyHostToDevice)); + } THCTensor_(freeCopyTo)(state, re, re_); THCTensor_(transpose)(state, re_, NULL, 0, 1); } diff --git a/test/test_torch.py b/test/test_torch.py index bf8f910252959..d1370e30cb826 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6320,9 +6320,7 @@ def test_lapack_empty(self): # numpy/sci often has a direct wrapper (e.g. lu_factor) and a wrapper that "does the right thing" # (e.g. lu). We often name our functions identically to the lapack function, so it will take work # to name / migrate-to better wrappers. - - # FIXME: enable CUDA tests. - devices = ['cpu'] # if not torch.cuda.is_available() else ['cpu', 'cuda'] + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: def fn(torchfn, *args): From 8a0fe0a5889bd8c7404102dfd007bc4c2ffc3512 Mon Sep 17 00:00:00 2001 From: Kittipat Virochsiri Date: Fri, 20 Jul 2018 11:45:41 -0700 Subject: [PATCH 09/23] set_input_record() should always add external input (#9636) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9636 Make sure that the blobs are registered to the net Reviewed By: pjh5 Differential Revision: D8924883 fbshipit-source-id: f09422a2d4d5ba8bf6cfbfd00172097b5ab1fcd6 --- caffe2/python/core.py | 7 ++++--- caffe2/python/core_test.py | 20 +++++++++++++++++++- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/caffe2/python/core.py b/caffe2/python/core.py index 9fef5724ad2f6..1423cfea3a6c0 100644 --- a/caffe2/python/core.py +++ b/caffe2/python/core.py @@ -1947,9 +1947,10 @@ def set_input_record(self, input_record): self._input_record = schema.NewRecord(self, input_record) else: self._input_record = input_record - for blob in input_record.field_blobs(): - if blob not in self.external_inputs: - self.AddExternalInput(blob) + + for blob in self._input_record.field_blobs(): + if blob not in self.external_inputs: + self.AddExternalInput(blob) return self._input_record def recover_input_record_by_prefix(self, prefix): diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index 2a4afc82c8067..b7099d20eae86 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -9,7 +9,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace, test_util +from caffe2.python import core, workspace, schema, test_util from caffe2.python.task import Node, Task @@ -199,6 +199,24 @@ def net_assert(net, num_ops, inputs, outputs, internals): n._CheckLookupTables() +class TestExternalInputs(test_util.TestCase): + def testSetInputRecordWithBlobs(self): + net = core.Net("test") + record = schema.NewRecord(net, schema.Struct( + ("x", schema.Scalar(np.float)), + )) + input_record = net.set_input_record(record) + 
self.assertTrue(net.BlobIsDefined(input_record.x())) + self.assertIn(input_record.x(), net.external_inputs) + + def testSetInputRecordWithoutBlobs(self): + net = core.Net("test") + record = schema.Struct(("x", schema.Scalar(np.float))) + input_record = net.set_input_record(record) + self.assertTrue(net.BlobIsDefined(input_record.x())) + self.assertIn(input_record.x(), net.external_inputs) + + class TestCreateOperator(test_util.TestCase): def testCreate(self): device_option = caffe2_pb2.DeviceOption() From 1003ccfa15e944251a65ba2289f25e8f1ed14a46 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Fri, 20 Jul 2018 12:42:48 -0700 Subject: [PATCH 10/23] Creates CUDAContext (#9435) Summary: ezyang noticed that the CUDAStream files lived under ATen/ despite being CUDA-specific, and suggested porting them to ATen/cuda and exposing them with a new CUDAContext. This PR does that. It also: - Moves ATen's CUDA-specific exceptions for ATen/cudnn to ATen/cuda for consistency - Moves getDeviceProperties() and getCurrentCUDASparseHandle() to CUDAContext from CUDAHooks The separation between CUDAContext and CUDAHooks is straightforward. Files that are in CUDA-only builds should rely on CUDAContext, while CUDAHooks is for runtime dispatch in files that can be included in CPU-only builds. A comment in CUDAContext.h explains this pattern. Acquiring device properties and CUDA-specific handles is something only done in builds with CUDA, for example, so I moved them from CUDAHooks to CUDAContext. This PR will conflict with #9277 and I will merge with master after #9277 goes in. Pull Request resolved: https://github.com/pytorch/pytorch/pull/9435 Reviewed By: soumith Differential Revision: D8917236 Pulled By: ezyang fbshipit-source-id: 219718864234fdd21a2baff1dd3932ff289b5751 --- aten/src/ATen/CUDAGuard.h | 110 ---------- aten/src/ATen/CUDAStream.cpp | 197 ----------------- aten/src/ATen/CUDAStream.h | 109 ---------- aten/src/ATen/Context.h | 52 ----- aten/src/ATen/cuda/CUDAApplyUtils.cuh | 33 +-- aten/src/ATen/cuda/CUDAContext.cpp | 76 +++++++ aten/src/ATen/cuda/CUDAContext.h | 70 +++++++ aten/src/ATen/cuda/CUDAGuard.h | 108 ++++++++++ aten/src/ATen/cuda/CUDAStream.cpp | 198 ++++++++++++++++++ aten/src/ATen/cuda/CUDAStream.h | 108 ++++++++++ aten/src/ATen/cuda/Exceptions.h | 20 ++ aten/src/ATen/cuda/detail/CUDAHooks.cpp | 42 +--- aten/src/ATen/cuda/detail/CUDAHooks.h | 5 - aten/src/ATen/cudnn/Descriptors.h | 5 +- aten/src/ATen/cudnn/Exceptions.h | 17 -- aten/src/ATen/cudnn/Handles.cpp | 2 +- aten/src/ATen/cudnn/Utils.h | 1 + aten/src/ATen/detail/CUDAHooksInterface.cpp | 26 --- aten/src/ATen/detail/CUDAHooksInterface.h | 40 +--- aten/src/ATen/native/cuda/CuFFTPlanCache.h | 3 +- aten/src/ATen/native/cuda/Embedding.cu | 6 +- aten/src/ATen/native/cuda/EmbeddingBag.cu | 7 +- aten/src/ATen/native/cuda/Gesv.cu | 3 +- aten/src/ATen/native/cuda/RoiPooling.cu | 5 +- aten/src/ATen/native/cuda/SoftMax.cu | 7 +- aten/src/ATen/native/cuda/SpectralOps.cu | 5 +- aten/src/ATen/native/cuda/SummaryOps.cu | 6 +- .../ATen/native/cuda/TensorTransformations.cu | 5 +- aten/src/ATen/native/cuda/Unique.cu | 4 +- .../ATen/native/cudnn/AffineGridGenerator.cpp | 1 + aten/src/ATen/native/cudnn/BatchNorm.cpp | 1 + aten/src/ATen/native/cudnn/Conv.cpp | 1 + aten/src/ATen/native/cudnn/GridSampler.cpp | 1 + aten/src/ATen/native/cudnn/RNN.cpp | 1 + .../ATen/native/sparse/cuda/SparseCUDABlas.cu | 5 +- .../native/sparse/cuda/SparseCUDATensor.cu | 3 +- .../sparse/cuda/SparseCUDATensorMath.cu | 7 +- aten/src/ATen/test/stream_test.cpp | 109 +++++----- 
aten/src/THC/THCCachingAllocator.cpp | 5 +- aten/src/THC/THCGeneral.cpp | 18 +- aten/src/THC/THCStream.cpp | 16 +- test/cpp_extensions/cudnn_extension.cpp | 2 +- torch/csrc/autograd/functions/comm.cpp | 2 +- torch/csrc/autograd/functions/comm.h | 4 +- torch/csrc/autograd/profiler.h | 3 +- torch/csrc/cuda/Module.cpp | 3 +- torch/csrc/cuda/comm.cpp | 8 +- torch/csrc/cuda/comm.h | 3 +- torch/csrc/cuda/python_comm.cpp | 6 +- torch/csrc/jit/fusion_compiler.cpp | 3 +- 50 files changed, 742 insertions(+), 730 deletions(-) create mode 100644 aten/src/ATen/cuda/CUDAContext.cpp create mode 100644 aten/src/ATen/cuda/CUDAContext.h create mode 100644 aten/src/ATen/cuda/CUDAGuard.h create mode 100644 aten/src/ATen/cuda/CUDAStream.cpp create mode 100644 aten/src/ATen/cuda/CUDAStream.h create mode 100644 aten/src/ATen/cuda/Exceptions.h diff --git a/aten/src/ATen/CUDAGuard.h b/aten/src/ATen/CUDAGuard.h index 8027084caa7c3..e69de29bb2d1d 100644 --- a/aten/src/ATen/CUDAGuard.h +++ b/aten/src/ATen/CUDAGuard.h @@ -1,110 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include -#include - -namespace at { - -/// A variant of `DeviceGuard` that augments it with an understanding of CUDA -/// streams. This guard can not only set and reset the current CUDA device, but -/// also set and reset the current CUDA stream. It is important to note that -/// because a CUDA stream is intrinsically associated with the CUDA device to -/// which it is bound, setting the CUDA stream *also* sets the current CUDA -/// device to that of the stream. -struct CUDAGuard { - /// Default constructor, does nothing and causes no change in the current - /// stream or device until `set_stream` or `set_device` is called. - CUDAGuard() = default; - - /// Sets the CUDA stream and its associated device as the current one (calls - /// `set_stream`). - explicit CUDAGuard(const CUDAStream& stream) { - set_stream(stream); - } - - /// Calls `set_device` with the given index. - explicit CUDAGuard(int32_t device) { - set_device(device); - } - - CUDAGuard(const CUDAGuard&) = delete; - CUDAGuard& operator=(const CUDAGuard&) = delete; - - /// Move-constructs this `CUDAGuard` from another `CUDAGuard`. The - /// moved-from `CUDAGuard` is modified such that its destruction has no - /// effect (does not reset the stream or device). - CUDAGuard(CUDAGuard&& other) noexcept = default; - - /// Move-assigns this `CUDAGuard` from another `CUDAGuard`. The - /// moved-from `CUDAGuard` is modified such that its destruction has no - /// effect (does not reset the stream or device). - CUDAGuard& operator=(CUDAGuard&& other) { - device_guard_ = std::move(other.device_guard_); - original_streams_ = std::move(other.original_streams_); - other.original_streams_.clear(); - return *this; - } - - /// Resets the CUDA stream on each device to the one that was active upon - /// construction. - ~CUDAGuard() { - if (!original_streams_.empty()) { - for (size_t device = 0; device < original_streams_.size(); ++device) { - globalContext().uncheckedSetCurrentCUDAStreamOnDevice( - device, original_streams_[device]); - } - } - } - - /// Sets the current CUDA device to the device associated with the given - /// stream, and then sets the current stream on that device to the one given. - void set_stream(const CUDAStream& stream) { - device_guard_.set_index(stream.device()); - // If we haven't stored the current stream yet, store it now. 
- if (original_streams_.empty()) { - const size_t device_count = globalContext().getNumGPUs(); - original_streams_.reserve(device_count); - for (size_t device = 0; device < device_count; ++device) { - original_streams_.push_back( - globalContext().getCurrentCUDAStreamOnDevice(device)); - } - } - globalContext().setCurrentCUDAStreamOnDevice( - device_guard_.last_index(), stream); - } - - /// Sets the CUDA device to the given one. - void set_device(int32_t device) { - device_guard_.set_index(device); - } - - /// Returns the CUDA streams that were active in the first call to - /// `set_stream`. If there was no such call, the returned container is - /// empty. - ArrayRef original_streams() const noexcept { - return original_streams_; - } - - /// Returns the device that was set upon construction of the guard. - int32_t original_device() const noexcept { - return device_guard_.original_index(); - } - - /// Returns the last device that was set via `set_device`, if any. - int32_t last_device() const noexcept { - return device_guard_.last_index(); - } - - private: - /// The guard for the current device. - DeviceGuard device_guard_; - /// The original streams that were active on all devices. - std::vector original_streams_; -}; - -} // namespace at diff --git a/aten/src/ATen/CUDAStream.cpp b/aten/src/ATen/CUDAStream.cpp index b8b8d588ffbfc..e69de29bb2d1d 100644 --- a/aten/src/ATen/CUDAStream.cpp +++ b/aten/src/ATen/CUDAStream.cpp @@ -1,197 +0,0 @@ - #include "ATen/CUDAStream.h" -#include "ATen/Error.h" -#include "ATen/detail/CUDAHooksInterface.h" - -#include - -// Internal implementation is entirely hidden -struct CUDAStreamInternals { - bool is_destructible; - std::atomic refcount; - int64_t device; // Note: cudaGetDevice works with int32_t, not int64_t - cudaStream_t stream; -}; - -namespace at { - -namespace detail { - - /* - * Stream state - */ - static constexpr cudaStream_t DEFAULT_STREAM = 0; - - static std::once_flag init_flag; - static int64_t num_gpus; - static CUDAStreamInternals* default_streams; - static thread_local CUDAStreamInternals** current_streams = nullptr; - - // Creates a(n indestructible) default stream for each device - // Note: the default stream on each device is signified by a zero - // value for the pointer, and so is not actually created as usual. - // In particular, we don't need to switch devices when creating the - // streams. 
- static void initDefaultCUDAStreams() { - num_gpus = getCUDAHooks().getNumGPUs(); - default_streams = (CUDAStreamInternals*) malloc(num_gpus * sizeof(CUDAStreamInternals)); - for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { - default_streams[i].is_destructible = false; - default_streams[i].refcount = 0; - default_streams[i].device = i; - default_streams[i].stream = DEFAULT_STREAM; - } - } - - // Init front-end to ensure initialization only occurs once - static void initCUDAStreamsOnce() { - // Inits default streams (once, globally) - std::call_once(init_flag, initDefaultCUDAStreams); - - // Inits current streams (thread local) to default streams - if (current_streams) return; - current_streams = (CUDAStreamInternals**) malloc(num_gpus * sizeof(CUDAStreamInternals*)); - for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { - current_streams[i] = &default_streams[i]; - } - } - - /* - * Pointer-based stream API - */ - - // Helper to return the current device - static inline int64_t current_device() { - int cur_device; - DynamicCUDAInterface::get_device(&cur_device); - return cur_device; - } - - // Helper to verify the GPU index is valid - static inline void check_gpu(int64_t device) { - AT_ASSERT(device >= 0 && device < num_gpus); - } - - CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device) { - initCUDAStreamsOnce(); - check_gpu(device); - return &default_streams[device]; - } - CUDAStreamInternals* CUDAStream_getDefaultStream() { - return CUDAStream_getDefaultStreamOnDevice(current_device()); - } - - // Creates (and retains) and new cuda stream - CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority) { - CUDAStreamInternals* internals = (CUDAStreamInternals*) malloc(sizeof(CUDAStreamInternals)); - internals->is_destructible = true; - internals->refcount = 1; - internals->device = current_device(); - DynamicCUDAInterface::cuda_stream_create_with_priority(&internals->stream, flags, priority); - return internals; - } - - // Note: despite not being "unsafe," is using these methods in a multithreaded - // environment then the caller must be sure that streams are valid - // when they're requested. These methods will throw an error if an - // invalid stream is requested. - CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device) { - initCUDAStreamsOnce(); - check_gpu(device); - auto cur = current_streams[device]; - AT_ASSERT(CUDAStream_retain(cur)); - return cur; - } - CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream() { - return CUDAStream_getAndRetainCurrentStreamOnDevice(current_device()); - } - - // Note: these unsafe methods do not retain the stream before returning it. - // This is unsafe behavior and these methods SHOULD NOT BE USED. - // They are here only for legacy compatibility. 
- CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device) { - initCUDAStreamsOnce(); - check_gpu(device); - return current_streams[device]; - } - CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe() { - return CUDAStream_getCurrentStreamOnDeviceUnsafe(current_device()); - } - - void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { - initCUDAStreamsOnce(); - check_gpu(device); - AT_ASSERT(ptr); - AT_ASSERT(ptr->device == device); - AT_ASSERT(CUDAStream_retain(ptr)); - - CUDAStream_free(current_streams[device]); - current_streams[device] = ptr; - } - - void CUDAStream_uncheckedSetStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { - initCUDAStreamsOnce(); - CUDAStream_uncheckedFree(current_streams[device]); - current_streams[device] = ptr; - } - - void CUDAStream_setStream(CUDAStreamInternals* ptr) { - CUDAStream_setStreamOnDevice(current_device(), ptr); - } - - // Getters - cudaStream_t CUDAStream_stream(CUDAStreamInternals* ptr) { - AT_ASSERT(ptr); - return ptr->stream; - } - - int64_t CUDAStream_device(CUDAStreamInternals* ptr) { - AT_ASSERT(ptr); - return ptr->device; - } - - // Memory management - // Note: only destructible (non-default) streams are ref counted - bool CUDAStream_retain(CUDAStreamInternals* ptr) { - AT_ASSERT(ptr); - if (ptr->is_destructible) return(++ptr->refcount > 1); - return true; - } - - void CUDAStream_free(CUDAStreamInternals*& ptr) { - if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) { - AT_ASSERT(ptr->refcount == 0); - DynamicCUDAInterface::cuda_stream_destroy(ptr->stream); - free(ptr); - ptr = nullptr; - } - } - void CUDAStream_uncheckedFree(CUDAStreamInternals*& ptr) { - if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) { - DynamicCUDAInterface::unchecked_cuda_stream_destroy(ptr->stream); - free(ptr); - ptr = nullptr; - } - } - -} // namespace detail - - /* - * CUDAStream functions - */ - - // Copy constructor - CUDAStream::CUDAStream(const CUDAStream& other) { - AT_ASSERT(other.internals_); - AT_ASSERT(detail::CUDAStream_retain(other.internals_)); - - internals_ = other.internals_; - } - - // Move constructor - CUDAStream::CUDAStream(CUDAStream&& other) { - AT_ASSERT(other.internals_); - - std::swap(internals_, other.internals_); - } - -} // namespace at diff --git a/aten/src/ATen/CUDAStream.h b/aten/src/ATen/CUDAStream.h index c5fc8111e13be..e69de29bb2d1d 100644 --- a/aten/src/ATen/CUDAStream.h +++ b/aten/src/ATen/CUDAStream.h @@ -1,109 +0,0 @@ -#pragma once - -#include -#include - -/* -* A CUDA stream interface with no CUDA build dependency. -* -* Includes the CUDAStream RAII class and a pointer-based stream API. -* -* The ATen Context interface should be preferred when working with streams. 
-*/ - -// Forward-declares cudaStream_t to avoid depending on CUDA in CPU builds -// Note: this is the internal CUDA runtime typedef for cudaStream_t -struct CUstream_st; -typedef struct CUstream_st* cudaStream_t; - -// Forward-declares internals -struct CUDAStreamInternals; - -namespace at { - -namespace detail { - -// Pointer-based API (for internal use) -// Note: ATen/Context is preferred to work with streams safely -CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device); -CUDAStreamInternals* CUDAStream_getDefaultStream(); - -CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority); - -CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device); -CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream(); - -// Note: these Unsafe gets should NEVER be used and are only here for legacy -// purposes. Once those uses are gone they should be removed. -CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device); -CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe(); - -void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* internals); -void CUDAStream_uncheckedSetStreamOnDevice( - int64_t device, - CUDAStreamInternals* internals); -void CUDAStream_setStream(CUDAStreamInternals* internals); - -cudaStream_t CUDAStream_stream(CUDAStreamInternals*); -int64_t CUDAStream_device(CUDAStreamInternals*); - -bool CUDAStream_retain(CUDAStreamInternals*); -void CUDAStream_free(CUDAStreamInternals*&); -void CUDAStream_uncheckedFree(CUDAStreamInternals*&); - -} // namespace detail - -// RAII for a CUDA stream -// Allows use as a cudaStream_t, copying, moving, and metadata access. -struct CUDAStream { - // Constants - static constexpr int32_t DEFAULT_FLAGS = 1; // = cudaStreamNonBlocking; - static constexpr int32_t DEFAULT_PRIORITY = 0; - - // Constructors - CUDAStream() = default; - /* implicit */ CUDAStream(CUDAStreamInternals* internals, bool retain = false) - : internals_{internals} { - if (retain) { - detail::CUDAStream_retain(internals_); - } - } - - // Destructor - ~CUDAStream() { detail::CUDAStream_uncheckedFree(internals_); } - - // Copy constructor - CUDAStream(const CUDAStream& other); - - // Move constructor - CUDAStream(CUDAStream&& other); - - // Assignment operator - CUDAStream& operator=(CUDAStream other) noexcept { - std::swap(internals_, other.internals_); - return *this; - } - - // Returns true if the CUDAStream is not null. 
- explicit operator bool() const noexcept { - return internals_ != nullptr; - } - - // Implicit conversion to cudaStream_t - operator cudaStream_t() const { return detail::CUDAStream_stream(internals_); } - - // Less than operator (to allow use in sets) - friend bool operator<(const CUDAStream& left, const CUDAStream& right) { - return left.internals_ < right.internals_; - } - - // Getters - int64_t device() const { return detail::CUDAStream_device(internals_); } - cudaStream_t stream() const { return detail::CUDAStream_stream(internals_); } - CUDAStreamInternals* internals() const { return internals_; } - -private: - CUDAStreamInternals* internals_ = nullptr; -}; - -} // namespace at diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 9a9125ccedfe0..969c0dcc6c037 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -79,58 +79,6 @@ class AT_API Context { return thc_state.get(); } - CUDAStream createCUDAStream() const { - return detail::CUDAStream_createAndRetainWithOptions( - CUDAStream::DEFAULT_FLAGS - , CUDAStream::DEFAULT_PRIORITY - ); - } - - CUDAStream createCUDAStreamWithOptions(int32_t flags, int32_t priority) const { - return detail::CUDAStream_createAndRetainWithOptions(flags, priority); - } - - CUDAStream getDefaultCUDAStream() const { - return detail::CUDAStream_getDefaultStream(); - } - - CUDAStream getDefaultCUDAStreamOnDevice(int64_t device) const { - return detail::CUDAStream_getDefaultStreamOnDevice(device); - } - - CUDAStream getCurrentCUDAStream() const { - return detail::CUDAStream_getAndRetainCurrentStream(); - } - - CUDAStream getCurrentCUDAStreamOnDevice(int64_t device) const { - return detail::CUDAStream_getAndRetainCurrentStreamOnDevice(device); - } - - void setCurrentCUDAStream(CUDAStream stream) const { - return detail::CUDAStream_setStream(stream.internals()); - } - - void setCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream) const { - return detail::CUDAStream_setStreamOnDevice(device, stream.internals()); - } - - void uncheckedSetCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream) - const { - return detail::CUDAStream_uncheckedSetStreamOnDevice( - device, stream.internals()); - } - -#ifndef __HIP_PLATFORM_HCC__ - cusparseHandle_t getCurrentCUDASparseHandle() const { - return detail::getCUDAHooks().getCurrentCUDASparseHandle(thc_state.get()); - } -#endif - cudaDeviceProp* getCurrentDeviceProperties() const { - return detail::getCUDAHooks().getCurrentDeviceProperties(thc_state.get()); - } - cudaDeviceProp* getDeviceProperties(int device) const { - return detail::getCUDAHooks().getDeviceProperties(thc_state.get(), device); - } int getNumGPUs() const { return detail::getCUDAHooks().getNumGPUs(); } diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh index e34cd14548451..a30bed575d3d3 100644 --- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -3,6 +3,7 @@ #include "detail/IndexUtils.cuh" #include "ATen/TensorUtils.h" #include "THC/THCAtomics.cuh" +#include "ATen/cuda/CUDAContext.h" // // This file contains pointwise operation functions and kernels that @@ -239,7 +240,7 @@ __host__ __device__ __forceinline__ T ATenCeilDiv(T a, T b) { inline bool getApplyGrid(uint64_t totalElements, dim3& grid, int64_t curDevice) { if (curDevice == -1) return false; uint64_t numBlocks = ATenCeilDiv(totalElements, static_cast(AT_APPLY_THREADS_PER_BLOCK)); - uint64_t maxGridX = at::globalContext().getDeviceProperties(curDevice)->maxGridSize[0]; + uint64_t 
maxGridX = at::cuda::getDeviceProperties(curDevice)->maxGridSize[0]; if (numBlocks > maxGridX) numBlocks = maxGridX; grid = dim3(numBlocks); @@ -322,7 +323,7 @@ bool CUDA_tensor_apply2(at::Tensor a, scalar1, \ scalar2, \ TYPE, A, B> \ - <<>>( \ + <<>>( \ aInfo, bInfo, (TYPE) totalElements, op); #define HANDLE_B_CASE(TYPE, A, B) { \ @@ -365,7 +366,7 @@ bool CUDA_tensor_apply2(at::Tensor a, bInfo.collapseDims(); #if CUDA_VERSION < 9000 if (!(aInfo.isContiguous() && bInfo.isContiguous())) - grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); + grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); #endif HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims); @@ -388,17 +389,17 @@ bool CUDA_tensor_apply2(at::Tensor a, scalar1, scalar2, uint64_t, 1, 1> - <<>>( + <<>>( aInfo, bInfo, (uint64_t) totalElements, op); } else { #if CUDA_VERSION < 9000 - grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); + grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); #endif kernelPointwiseApply2 - <<>>( + <<>>( aInfo, bInfo, (uint64_t) totalElements, op); } } @@ -502,7 +503,7 @@ bool CUDA_tensor_apply3(at::Tensor a, scalar2, \ scalar3, \ TYPE, A, B, C> \ - <<>>( \ + <<>>( \ aInfo, bInfo, cInfo, (TYPE) totalElements, op); #define HANDLE_C_CASE(TYPE, A, B, C) { \ @@ -566,7 +567,7 @@ bool CUDA_tensor_apply3(at::Tensor a, #if CUDA_VERSION < 9000 if (!(aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous())) - grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); + grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); #endif HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims); } else { @@ -594,11 +595,11 @@ bool CUDA_tensor_apply3(at::Tensor a, scalar2, scalar3, uint64_t, 1, 1, 1> - <<>>( + <<>>( aInfo, bInfo, cInfo, (uint64_t) totalElements, op); } else { #if CUDA_VERSION < 9000 - grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); + grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); #endif kernelPointwiseApply3 - <<>>( + <<>>( aInfo, bInfo, cInfo, (uint64_t) totalElements, op); } } @@ -730,7 +731,7 @@ bool CUDA_tensor_apply4(at::Tensor a, scalar3, \ scalar4, \ TYPE, A, B, C, D> \ - <<>>( \ + <<>>( \ aInfo, bInfo, cInfo, dInfo, (TYPE) totalElements, op); #define HANDLE_D_CASE(TYPE, A, B, C, D) { \ @@ -813,7 +814,7 @@ bool CUDA_tensor_apply4(at::Tensor a, #if CUDA_VERSION < 9000 if (!(aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous() && dInfo.isContiguous())) - grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); + grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); #endif HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims, dInfo.dims); } else { @@ -846,11 +847,11 @@ bool CUDA_tensor_apply4(at::Tensor a, scalar3, scalar4, uint64_t, 1, 1, 1, 1> - <<>>( + <<>>( aInfo, bInfo, 
cInfo, dInfo, (uint64_t) totalElements, op); } else { #if CUDA_VERSION < 9000 - grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); + grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); #endif kernelPointwiseApply4 - <<>>( + <<>>( aInfo, bInfo, cInfo, dInfo, (uint64_t) totalElements, op); } } diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp new file mode 100644 index 0000000000000..794cbb44ddd81 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAContext.cpp @@ -0,0 +1,76 @@ +#include "ATen/cuda/CUDAContext.h" +#include "THC/THCGeneral.h" + +namespace at { namespace cuda { + +/* Device info */ +int64_t getNumGPUs() { + int count; + AT_CUDA_CHECK(cudaGetDeviceCount(&count)); + return count; +} + +int64_t current_device() { + int cur_device; + AT_CUDA_CHECK(cudaGetDevice(&cur_device)); + return cur_device; +} + +cudaDeviceProp* getCurrentDeviceProperties() { + return THCState_getCurrentDeviceProperties(at::globalContext().getTHCState()); +} + +cudaDeviceProp* getDeviceProperties(int64_t device) { + return THCState_getDeviceProperties(at::globalContext().getTHCState(), (int)device); +} + +/* Streams */ +CUDAStream createCUDAStream() { + return detail::CUDAStream_createAndRetainWithOptions( + CUDAStream::DEFAULT_FLAGS + , CUDAStream::DEFAULT_PRIORITY + ); +} + +CUDAStream createCUDAStreamWithOptions(int32_t flags, int32_t priority) { + return detail::CUDAStream_createAndRetainWithOptions(flags, priority); +} + +CUDAStream getDefaultCUDAStream() { + return detail::CUDAStream_getDefaultStream(); +} + +CUDAStream getDefaultCUDAStreamOnDevice(int64_t device) { + return detail::CUDAStream_getDefaultStreamOnDevice(device); +} + +CUDAStream getCurrentCUDAStream() { + return detail::CUDAStream_getAndRetainCurrentStream(); +} + +CUDAStream getCurrentCUDAStreamOnDevice(int64_t device) { + return detail::CUDAStream_getAndRetainCurrentStreamOnDevice(device); +} + +void setCurrentCUDAStream(CUDAStream stream) { + return detail::CUDAStream_setStream(stream.internals()); +} + +void setCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream) { + return detail::CUDAStream_setStreamOnDevice(device, stream.internals()); +} + +void uncheckedSetCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream) { + return detail::CUDAStream_uncheckedSetStreamOnDevice(device, stream.internals()); +} + +/* Handles */ +#ifndef __HIP_PLATFORM_HCC__ + cusparseHandle_t getCurrentCUDASparseHandle() { + return THCState_getCurrentSparseHandle(at::globalContext().getTHCState()); + } +#endif + +} // namespace cuda + +} // namespace at \ No newline at end of file diff --git a/aten/src/ATen/cuda/CUDAContext.h b/aten/src/ATen/cuda/CUDAContext.h new file mode 100644 index 0000000000000..b8fbe67b1ce20 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAContext.h @@ -0,0 +1,70 @@ +#pragma once + +#include "ATen/Context.h" +#include "ATen/cuda/CUDAStream.h" +#include "ATen/cuda/Exceptions.h" +#include "ATen/ATenGeneral.h" + +#include + +#include "cuda_runtime_api.h" +#include "cusparse.h" + +namespace at { namespace cuda { + +/* +A common CUDA interface for ATen. + +This interface is distinct from CUDAHooks, which defines an interface that links +to both CPU-only and CUDA builds. That interface is intended for runtime dispatch +and should be used from files that are included in both CPU-only and CUDA builds. 
+ +CUDAContext, on the other hand, should be preferred by files only included in +CUDA builds. It is intended to expose CUDA functionality in a consistent +manner. + +This means there is some overlap between the CUDAContext and CUDAHooks, but +the choice of which to use is simple: use CUDAContext when in a CUDA-only file, +use CUDAHooks otherwise. + +Note that CUDAContext simply defines an interface with no associated class. +It is expected that the modules whose functions compose this interface will +manage their own state. There is only a single CUDA context/state. +*/ + +/* Device info */ +AT_API int64_t getNumGPUs(); + +AT_API int64_t current_device(); + +AT_API cudaDeviceProp* getCurrentDeviceProperties(); + +AT_API cudaDeviceProp* getDeviceProperties(int64_t device); + +/* Streams */ +AT_API CUDAStream createCUDAStream(); + +AT_API CUDAStream createCUDAStreamWithOptions(int32_t flags, int32_t priority); + +AT_API CUDAStream getDefaultCUDAStream(); + +AT_API CUDAStream getDefaultCUDAStreamOnDevice(int64_t device); + +AT_API CUDAStream getCurrentCUDAStream(); + +AT_API CUDAStream getCurrentCUDAStreamOnDevice(int64_t device); + +AT_API void setCurrentCUDAStream(CUDAStream stream); + +AT_API void setCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream); + +AT_API void uncheckedSetCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream); + +/* Handles */ +#ifndef __HIP_PLATFORM_HCC__ + AT_API cusparseHandle_t getCurrentCUDASparseHandle(); +#endif + + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/CUDAGuard.h b/aten/src/ATen/cuda/CUDAGuard.h new file mode 100644 index 0000000000000..db006625221cf --- /dev/null +++ b/aten/src/ATen/cuda/CUDAGuard.h @@ -0,0 +1,108 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace at { namespace cuda { + +/// A variant of `DeviceGuard` that augments it with an understanding of CUDA +/// streams. This guard can not only set and reset the current CUDA device, but +/// also set and reset the current CUDA stream. It is important to note that +/// because a CUDA stream is intrinsically associated with the CUDA device to +/// which it is bound, setting the CUDA stream *also* sets the current CUDA +/// device to that of the stream. +struct CUDAGuard { + /// Default constructor, does nothing and causes no change in the current + /// stream or device until `set_stream` or `set_device` is called. + CUDAGuard() = default; + + /// Sets the CUDA stream and its associated device as the current one (calls + /// `set_stream`). + explicit CUDAGuard(const CUDAStream& stream) { + set_stream(stream); + } + + /// Calls `set_device` with the given index. + explicit CUDAGuard(int32_t device) { + set_device(device); + } + + CUDAGuard(const CUDAGuard&) = delete; + CUDAGuard& operator=(const CUDAGuard&) = delete; + + /// Move-constructs this `CUDAGuard` from another `CUDAGuard`. The + /// moved-from `CUDAGuard` is modified such that its destruction has no + /// effect (does not reset the stream or device). + CUDAGuard(CUDAGuard&& other) noexcept = default; + + /// Move-assigns this `CUDAGuard` from another `CUDAGuard`. The + /// moved-from `CUDAGuard` is modified such that its destruction has no + /// effect (does not reset the stream or device). 
+ CUDAGuard& operator=(CUDAGuard&& other) { + device_guard_ = std::move(other.device_guard_); + original_streams_ = std::move(other.original_streams_); + other.original_streams_.clear(); + return *this; + } + + /// Resets the CUDA stream on each device to the one that was active upon + /// construction. + ~CUDAGuard() { + if (!original_streams_.empty()) { + for (size_t device = 0; device < original_streams_.size(); ++device) { + uncheckedSetCurrentCUDAStreamOnDevice(device, original_streams_[device]); + } + } + } + + /// Sets the current CUDA device to the device associated with the given + /// stream, and then sets the current stream on that device to the one given. + void set_stream(const CUDAStream& stream) { + device_guard_.set_index(stream.device()); + // If we haven't stored the current stream yet, store it now. + if (original_streams_.empty()) { + const size_t device_count = getNumGPUs(); + original_streams_.reserve(device_count); + for (size_t device = 0; device < device_count; ++device) { + original_streams_.push_back(getCurrentCUDAStreamOnDevice(device)); + } + } + setCurrentCUDAStreamOnDevice(device_guard_.last_index(), stream); + } + + /// Sets the CUDA device to the given one. + void set_device(int32_t device) { + device_guard_.set_index(device); + } + + /// Returns the CUDA streams that were active in the first call to + /// `set_stream`. If there was no such call, the returned container is + /// empty. + ArrayRef original_streams() const noexcept { + return original_streams_; + } + + /// Returns the device that was set upon construction of the guard. + int32_t original_device() const noexcept { + return device_guard_.original_index(); + } + + /// Returns the last device that was set via `set_device`, if any. + int32_t last_device() const noexcept { + return device_guard_.last_index(); + } + + private: + /// The guard for the current device. + at::DeviceGuard device_guard_; + /// The original streams that were active on all devices. + std::vector original_streams_; +}; + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/CUDAStream.cpp b/aten/src/ATen/cuda/CUDAStream.cpp new file mode 100644 index 0000000000000..4fb945a9920f8 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAStream.cpp @@ -0,0 +1,198 @@ +#include "ATen/cuda/CUDAStream.h" +#include "ATen/cuda/CUDAContext.h" +#include "ATen/cuda/Exceptions.h" +#include "ATen/Error.h" + +#include +#include + +// Internal implementation is entirely hidden +struct CUDAStreamInternals { + bool is_destructible; + std::atomic refcount; + int64_t device; // Note: cudaGetDevice works with int32_t, not int64_t + cudaStream_t stream; +}; + +namespace at { +namespace cuda { + +namespace detail { + + /* + * Stream state + */ + static constexpr cudaStream_t DEFAULT_STREAM = 0; + + static std::once_flag init_flag; + static int64_t num_gpus; + static CUDAStreamInternals* default_streams; + static thread_local CUDAStreamInternals** current_streams = nullptr; + + // Creates a(n indestructible) default stream for each device + // Note: the default stream on each device is signified by a zero + // value for the pointer, and so is not actually created as usual. + // In particular, we don't need to switch devices when creating the + // streams. 
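// ----------------------------------------------------------------------------
// [Editor's sketch -- not part of the patch] A minimal illustration of how the
// CUDAGuard defined above and the new at::cuda:: free functions are meant to be
// used together; these free functions replace the former
// at::globalContext().getCurrentCUDAStream()-style accessors seen elsewhere in
// this diff. The function name and the "work" inside the guarded scope are
// hypothetical placeholders.
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/CUDAGuard.h"

static void run_on_side_stream_sketch() {
  // A new, non-default stream bound to the current device.
  at::cuda::CUDAStream side_stream = at::cuda::createCUDAStream();
  {
    // Entering the guard makes side_stream's device current and installs
    // side_stream as the current stream on that device.
    at::cuda::CUDAGuard guard(side_stream);
    // Work issued via at::cuda::getCurrentCUDAStream() now runs on side_stream;
    // device properties can be queried the same way, e.g.:
    cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
    (void)prop;  // e.g. inspect prop->multiProcessorCount before launching work
  }
  // Leaving the scope restores the streams (and the device) that were current
  // when the guard first changed a stream.
}
// ----------------------------------------------------------------------------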
+ static void initDefaultCUDAStreams() { + num_gpus = getNumGPUs(); + default_streams = (CUDAStreamInternals*) malloc(num_gpus * sizeof(CUDAStreamInternals)); + for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { + default_streams[i].is_destructible = false; + default_streams[i].refcount = 0; + default_streams[i].device = i; + default_streams[i].stream = DEFAULT_STREAM; + } + } + + // Init front-end to ensure initialization only occurs once + static void initCUDAStreamsOnce() { + // Inits default streams (once, globally) + std::call_once(init_flag, initDefaultCUDAStreams); + + // Inits current streams (thread local) to default streams + if (current_streams) return; + current_streams = (CUDAStreamInternals**) malloc(num_gpus * sizeof(CUDAStreamInternals*)); + for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { + current_streams[i] = &default_streams[i]; + } + } + + /* + * Pointer-based stream API + */ + + // Helper to verify the GPU index is valid + static inline void check_gpu(int64_t device) { + AT_ASSERT(device >= 0 && device < num_gpus); + } + + CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device) { + initCUDAStreamsOnce(); + check_gpu(device); + return &default_streams[device]; + } + CUDAStreamInternals* CUDAStream_getDefaultStream() { + return CUDAStream_getDefaultStreamOnDevice(current_device()); + } + + // Creates (and retains) and new cuda stream + CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority) { + CUDAStreamInternals* internals = (CUDAStreamInternals*) malloc(sizeof(CUDAStreamInternals)); + internals->is_destructible = true; + internals->refcount = 1; + internals->device = current_device(); + #ifndef __HIP_PLATFORM_HCC__ + AT_CUDA_CHECK(cudaStreamCreateWithPriority(&internals->stream, flags, priority)); + #else + AT_CUDA_CHECK(cudaStreamCreateWithFlags(&internals->stream, flags)); + #endif // __HIP_PLATFORM_HCC__ + return internals; + } + + // Note: despite not being "unsafe," is using these methods in a multithreaded + // environment then the caller must be sure that streams are valid + // when they're requested. These methods will throw an error if an + // invalid stream is requested. + CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device) { + initCUDAStreamsOnce(); + check_gpu(device); + auto cur = current_streams[device]; + AT_ASSERT(CUDAStream_retain(cur)); + return cur; + } + CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream() { + return CUDAStream_getAndRetainCurrentStreamOnDevice(current_device()); + } + + // Note: these unsafe methods do not retain the stream before returning it. + // This is unsafe behavior and these methods SHOULD NOT BE USED. + // They are here only for legacy compatibility. 
+ CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device) { + initCUDAStreamsOnce(); + check_gpu(device); + return current_streams[device]; + } + CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe() { + return CUDAStream_getCurrentStreamOnDeviceUnsafe(current_device()); + } + + void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { + initCUDAStreamsOnce(); + check_gpu(device); + AT_ASSERT(ptr); + AT_ASSERT(ptr->device == device); + AT_ASSERT(CUDAStream_retain(ptr)); + + CUDAStream_free(current_streams[device]); + current_streams[device] = ptr; + } + + void CUDAStream_uncheckedSetStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { + initCUDAStreamsOnce(); + CUDAStream_uncheckedFree(current_streams[device]); + current_streams[device] = ptr; + } + + void CUDAStream_setStream(CUDAStreamInternals* ptr) { + CUDAStream_setStreamOnDevice(current_device(), ptr); + } + + // Getters + cudaStream_t CUDAStream_stream(CUDAStreamInternals* ptr) { + AT_ASSERT(ptr); + return ptr->stream; + } + + int64_t CUDAStream_device(CUDAStreamInternals* ptr) { + AT_ASSERT(ptr); + return ptr->device; + } + + // Memory management + // Note: only destructible (non-default) streams are ref counted + bool CUDAStream_retain(CUDAStreamInternals* ptr) { + AT_ASSERT(ptr); + if (ptr->is_destructible) return(++ptr->refcount > 1); + return true; + } + + void CUDAStream_free(CUDAStreamInternals*& ptr) { + if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) { + AT_ASSERT(ptr->refcount == 0); + AT_CUDA_CHECK(cudaStreamDestroy(ptr->stream)); + free(ptr); + ptr = nullptr; + } + } + void CUDAStream_uncheckedFree(CUDAStreamInternals*& ptr) { + if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) { + cudaStreamDestroy(ptr->stream); + free(ptr); + ptr = nullptr; + } + } + +} // namespace detail + + /* + * CUDAStream functions + */ + + // Copy constructor + CUDAStream::CUDAStream(const CUDAStream& other) { + AT_ASSERT(other.internals_); + AT_ASSERT(detail::CUDAStream_retain(other.internals_)); + + internals_ = other.internals_; + } + + // Move constructor + CUDAStream::CUDAStream(CUDAStream&& other) { + AT_ASSERT(other.internals_); + + std::swap(internals_, other.internals_); + } + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/CUDAStream.h b/aten/src/ATen/cuda/CUDAStream.h new file mode 100644 index 0000000000000..72fc752b717a4 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAStream.h @@ -0,0 +1,108 @@ +#pragma once + +#include +#include + +#include "cuda_runtime_api.h" + +/* +* A CUDA stream interface with no CUDA build dependency. +* +* Includes the CUDAStream RAII class and a pointer-based stream API. +* +* The ATen Context interface should be preferred when working with streams. +*/ + +// Forward-declares internals +struct CUDAStreamInternals; + +namespace at { +namespace cuda { + +namespace detail { + +// Pointer-based API (for internal use) +// Note: ATen/Context is preferred to work with streams safely +CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device); +CUDAStreamInternals* CUDAStream_getDefaultStream(); + +CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority); + +CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device); +CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream(); + +// Note: these Unsafe gets should NEVER be used and are only here for legacy +// purposes. Once those uses are gone they should be removed. 
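// ----------------------------------------------------------------------------
// [Editor's sketch -- not part of the patch] The refcounting contract of the
// pointer-based detail API declared above, as also exercised by the stream
// tests later in this diff: a created stream starts with a refcount of one, and
// CUDAStream_free() both releases it and nulls the pointer. The function name
// is hypothetical; the RAII CUDAStream class below is the preferred interface.
#include "ATen/cuda/CUDAStream.h"

inline void stream_refcount_sketch() {
  CUDAStreamInternals* internals =
      at::cuda::detail::CUDAStream_createAndRetainWithOptions(
          /*flags=*/cudaStreamNonBlocking, /*priority=*/0);
  cudaStream_t raw = at::cuda::detail::CUDAStream_stream(internals);
  (void)raw;  // e.g. hand the raw cudaStream_t to legacy THC code
  at::cuda::detail::CUDAStream_free(internals);  // frees and nulls the pointer
}
// ----------------------------------------------------------------------------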
+CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device); +CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe(); + +void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* internals); +void CUDAStream_uncheckedSetStreamOnDevice( + int64_t device, + CUDAStreamInternals* internals); +void CUDAStream_setStream(CUDAStreamInternals* internals); + +cudaStream_t CUDAStream_stream(CUDAStreamInternals*); +int64_t CUDAStream_device(CUDAStreamInternals*); + +bool CUDAStream_retain(CUDAStreamInternals*); +void CUDAStream_free(CUDAStreamInternals*&); +void CUDAStream_uncheckedFree(CUDAStreamInternals*&); + +} // namespace detail + +// RAII for a CUDA stream +// Allows use as a cudaStream_t, copying, moving, and metadata access. +struct CUDAStream { + // Constants + static constexpr int32_t DEFAULT_FLAGS = cudaStreamNonBlocking; + static constexpr int32_t DEFAULT_PRIORITY = 0; + + // Constructors + CUDAStream() = default; + /* implicit */ CUDAStream(CUDAStreamInternals* internals, bool retain = false) + : internals_{internals} { + if (retain) { + detail::CUDAStream_retain(internals_); + } + } + + // Destructor + ~CUDAStream() { detail::CUDAStream_uncheckedFree(internals_); } + + // Copy constructor + CUDAStream(const CUDAStream& other); + + // Move constructor + CUDAStream(CUDAStream&& other); + + // Assignment operator + CUDAStream& operator=(CUDAStream other) noexcept { + std::swap(internals_, other.internals_); + return *this; + } + + // Returns true if the CUDAStream is not null. + explicit operator bool() const noexcept { + return internals_ != nullptr; + } + + // Implicit conversion to cudaStream_t + operator cudaStream_t() const { return detail::CUDAStream_stream(internals_); } + + // Less than operator (to allow use in sets) + friend bool operator<(const CUDAStream& left, const CUDAStream& right) { + return left.internals_ < right.internals_; + } + + // Getters + int64_t device() const { return detail::CUDAStream_device(internals_); } + cudaStream_t stream() const { return detail::CUDAStream_stream(internals_); } + CUDAStreamInternals* internals() const { return internals_; } + +private: + CUDAStreamInternals* internals_ = nullptr; +}; + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/Exceptions.h b/aten/src/ATen/cuda/Exceptions.h new file mode 100644 index 0000000000000..dbacf35079272 --- /dev/null +++ b/aten/src/ATen/cuda/Exceptions.h @@ -0,0 +1,20 @@ +#pragma once + +#include "ATen/Error.h" + +#define AT_CUDNN_CHECK(STATUS) \ + if (STATUS != CUDNN_STATUS_SUCCESS) { \ + if (STATUS == CUDNN_STATUS_NOT_SUPPORTED) { \ + AT_ERROR( \ + "CuDNN error: ", \ + cudnnGetErrorString(STATUS), \ + ". 
This error may appear if you passed in a non-contiguous input."); \ + } else { \ + AT_ERROR("CuDNN error: ", cudnnGetErrorString(STATUS)); \ + } \ + } + +#define AT_CUDA_CHECK(STATUS) \ + if (STATUS != cudaSuccess) { \ + AT_ERROR("CUDA error: ", cudaGetErrorString(STATUS)); \ + } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 5872764a905ce..09da44f342b69 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -49,37 +49,12 @@ void unchecked_set_device(int32_t device) { (void)return_code; } -void cuda_stream_create_with_priority( - cudaStream_t* pStream -, int32_t flags -, int32_t priority) { -#ifndef __HIP_PLATFORM_HCC__ - check_status(cudaStreamCreateWithPriority(pStream, flags, priority)); -#else - check_status(cudaStreamCreateWithFlags(pStream, flags)); -#endif -} - -void cuda_stream_destroy(cudaStream_t stream) { - check_status(cudaStreamDestroy(stream)); -} - -void unchecked_cuda_stream_destroy(cudaStream_t stream) { - const auto return_code = cudaStreamDestroy(stream); - (void)return_code; -} - struct DynamicCUDAInterfaceSetter { DynamicCUDAInterfaceSetter() { using at::detail::DynamicCUDAInterface; DynamicCUDAInterface::set_device = set_device; DynamicCUDAInterface::get_device = get_device; DynamicCUDAInterface::unchecked_set_device = unchecked_set_device; - DynamicCUDAInterface::cuda_stream_create_with_priority = - cuda_stream_create_with_priority; - DynamicCUDAInterface::cuda_stream_destroy = cuda_stream_destroy; - DynamicCUDAInterface::unchecked_cuda_stream_destroy = - unchecked_cuda_stream_destroy; } }; @@ -123,21 +98,6 @@ bool CUDAHooks::hasCuDNN() const { return AT_CUDNN_ENABLED(); } -#ifndef __HIP_PLATFORM_HCC__ -cusparseHandle_t CUDAHooks::getCurrentCUDASparseHandle(THCState* thc_state) const { - return THCState_getCurrentSparseHandle(thc_state); -} -#endif -struct cudaDeviceProp* CUDAHooks::getCurrentDeviceProperties( - THCState* thc_state) const { - return THCState_getCurrentDeviceProperties(thc_state); -} -struct cudaDeviceProp* CUDAHooks::getDeviceProperties( - THCState* thc_state, - int device) const { - return THCState_getDeviceProperties(thc_state, device); -} - int64_t CUDAHooks::current_device() const { int device; cudaError_t err = cudaGetDevice(&device); @@ -162,7 +122,7 @@ bool CUDAHooks::compiledWithCuDNN() const { bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { #if AT_CUDNN_ENABLED() cudaDeviceProp* prop = - getCurrentDeviceProperties(globalContext().getTHCState()); + THCState_getCurrentDeviceProperties(globalContext().getTHCState()); // NOTE: extra parenthesis around numbers disable clang warnings about // dead code return ( diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index d88ac0d334d5d..eae1a802a5cb0 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -14,11 +14,6 @@ struct CUDAHooks : public at::CUDAHooksInterface { std::unique_ptr initCUDAGenerator(Context*) const override; bool hasCUDA() const override; bool hasCuDNN() const override; -#ifndef __HIP_PLATFORM_HCC__ - cusparseHandle_t getCurrentCUDASparseHandle(THCState*) const override; -#endif - struct cudaDeviceProp* getCurrentDeviceProperties(THCState*) const override; - struct cudaDeviceProp* getDeviceProperties(THCState*, int device) const override; int64_t current_device() const override; Allocator* getPinnedMemoryAllocator() const override; void registerCUDATypes(Context*) const override; diff 
--git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 2bf7f0ae9bc42..085f2723bf045 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -1,6 +1,7 @@ #pragma once -#include "Exceptions.h" +#include "ATen/cuda/CUDAContext.h" +#include "ATen/cuda/Exceptions.h" #include "cudnn-wrapper.h" #include @@ -304,7 +305,7 @@ struct AT_CUDA_API RNNDescriptor CUDNN_RNN_ALGO_STANDARD, datatype)); #if CUDNN_VERSION >= 7000 && CUDA_VERSION >= 9000 - cudaDeviceProp* prop = globalContext().getCurrentDeviceProperties(); + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major >= 7) { if (datatype == CUDNN_DATA_HALF) { cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_TENSOR_OP_MATH); diff --git a/aten/src/ATen/cudnn/Exceptions.h b/aten/src/ATen/cudnn/Exceptions.h index b59127e64a690..e69de29bb2d1d 100644 --- a/aten/src/ATen/cudnn/Exceptions.h +++ b/aten/src/ATen/cudnn/Exceptions.h @@ -1,17 +0,0 @@ -#pragma once -#include -#define AT_CUDNN_CHECK(STATUS) \ - if (STATUS != CUDNN_STATUS_SUCCESS) { \ - if (STATUS == CUDNN_STATUS_NOT_SUPPORTED) { \ - AT_ERROR( \ - "CuDNN error: ", \ - cudnnGetErrorString(STATUS), \ - ". This error may appear if you passed in a non-contiguous input."); \ - } else { \ - AT_ERROR("CuDNN error: ", cudnnGetErrorString(STATUS)); \ - } \ - } -#define AT_CUDA_CHECK(STATUS) \ - if (STATUS != cudaSuccess) { \ - AT_ERROR("CUDA error: ", cudaGetErrorString(STATUS)); \ - } diff --git a/aten/src/ATen/cudnn/Handles.cpp b/aten/src/ATen/cudnn/Handles.cpp index 7aae86d18b0b8..4848d2aca2412 100644 --- a/aten/src/ATen/cudnn/Handles.cpp +++ b/aten/src/ATen/cudnn/Handles.cpp @@ -1,6 +1,6 @@ #include "Handles.h" -#include "Exceptions.h" +#include "ATen/cuda/Exceptions.h" #include #include diff --git a/aten/src/ATen/cudnn/Utils.h b/aten/src/ATen/cudnn/Utils.h index c2e5dcbce70e9..264bf580f8a4d 100644 --- a/aten/src/ATen/cudnn/Utils.h +++ b/aten/src/ATen/cudnn/Utils.h @@ -1,6 +1,7 @@ #pragma once #include +#include "ATen/cuda/Exceptions.h" #include "THC/THC.h" #include "cudnn-wrapper.h" #include "Handles.h" diff --git a/aten/src/ATen/detail/CUDAHooksInterface.cpp b/aten/src/ATen/detail/CUDAHooksInterface.cpp index 288b066feafeb..b6c31bed6eda4 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.cpp +++ b/aten/src/ATen/detail/CUDAHooksInterface.cpp @@ -27,37 +27,11 @@ void default_unchecked_set_device(int32_t) { "before CUDA library was loaded"); } -void default_cuda_stream_create_with_priority(cudaStream_t*, int32_t, int32_t) { - AT_ERROR( - "DynamicCUDAInterface::cuda_stream_create_with_priority called " - "before CUDA library was loaded"); -} - -void default_cuda_stream_destroy(cudaStream_t) { - AT_ERROR( - "DynamicCUDAInterface::cuda_stream_destroy called " - "before CUDA library was loaded"); -} - -void default_unchecked_cuda_stream_destroy(cudaStream_t) { - AT_ERROR( - "DynamicCUDAInterface::unchecked_cuda_stream_destroy called " - "before CUDA library was loaded"); -} - // Default the static members of DynamicCUDAInterface. 
void (*DynamicCUDAInterface::set_device)(int32_t) = default_set_device; void (*DynamicCUDAInterface::get_device)(int32_t*) = default_get_device; void (*DynamicCUDAInterface::unchecked_set_device)(int32_t) = default_unchecked_set_device; -void (*DynamicCUDAInterface::cuda_stream_create_with_priority)( - cudaStream_t*, - int32_t, - int32_t) = default_cuda_stream_create_with_priority; -void (*DynamicCUDAInterface::cuda_stream_destroy)(cudaStream_t) = - default_cuda_stream_destroy; -void (*DynamicCUDAInterface::unchecked_cuda_stream_destroy)(cudaStream_t) = - default_unchecked_cuda_stream_destroy; const CUDAHooksInterface& getCUDAHooks() { static std::unique_ptr cuda_hooks; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index f0596d01949d8..401cbc76e5471 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -9,28 +9,8 @@ #include #include -// Forward declare these CUDA types here to avoid including CUDA headers in -// ATen headers, which would make ATen always require CUDA to build. +// Forward-declares THCState struct THCState; -struct cudaDeviceProp; -struct CUstream_st; -typedef struct CUstream_st* cudaStream_t; - -#ifndef __HIP_PLATFORM_HCC__ -// pyHIPIFY rewrites this as: -// -// struct cusparseContext; -// typedef struct cusparseContext *hipsparseHandle_t; -// -// however, this forward declaration is wrong -// the way that the HIP headers define hipsparseHandle_t is -// -// typedef cusparseHandle_t hipsparseHandle_t -// -// so the rewrite is wrong. -struct cusparseContext; -typedef struct cusparseContext *cusparseHandle_t; -#endif namespace at { class Context; @@ -89,21 +69,6 @@ struct AT_API CUDAHooksInterface { return false; } -#ifndef __HIP_PLATFORM_HCC__ - virtual cusparseHandle_t getCurrentCUDASparseHandle(THCState*) const { - AT_ERROR("Cannot getCurrentCUDASparseHandle() without ATen_cuda library. ", CUDA_HELP); - } -#endif - - virtual struct cudaDeviceProp* getCurrentDeviceProperties(THCState*) const { - AT_ERROR("Cannot getCurrentDeviceProperties() without ATen_cuda library. ", CUDA_HELP); - } - - virtual struct cudaDeviceProp* getDeviceProperties(THCState*, int device) - const { - AT_ERROR("Cannot getDeviceProperties() without ATen_cuda library. ", CUDA_HELP); - } - virtual int64_t current_device() const { return -1; } @@ -175,9 +140,6 @@ struct AT_API DynamicCUDAInterface { static void (*set_device)(int32_t); static void (*get_device)(int32_t*); static void (*unchecked_set_device)(int32_t); - static void (*cuda_stream_create_with_priority)(cudaStream_t*, int32_t, int32_t); - static void (*cuda_stream_destroy)(cudaStream_t); - static void (*unchecked_cuda_stream_destroy)(cudaStream_t); }; } // namespace detail } // namespace at diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index 49c56cbf6e02c..fdc95e546005c 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -1,4 +1,5 @@ #include "ATen/ATen.h" +#include "ATen/cuda/CUDAContext.h" #include "ATen/Config.h" #include "ATen/native/cuda/CuFFTUtils.h" #include "ATen/native/utils/ParamsHash.h" @@ -105,7 +106,7 @@ class CuFFTConfig { // contiguous, only need to check real-to-complex case. 
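// ----------------------------------------------------------------------------
// [Editor's sketch -- not part of the patch] The gating pattern used by the
// cuFFT half-precision check below and the cuDNN RNN TensorOp check earlier in
// this diff: query at::cuda::getCurrentDeviceProperties() and branch on the
// device's compute capability. The helper name is hypothetical; SM_53 is the
// half-precision cuFFT requirement cited in the surrounding code.
#include "ATen/cuda/CUDAContext.h"

static inline bool device_supports_half_fft_sketch() {
  cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
  return prop->major > 5 || (prop->major == 5 && prop->minor >= 3);  // >= SM_53
}
// ----------------------------------------------------------------------------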
if (input.type().scalarType() == ScalarType::Half) { // cuFFT on half requires compute capability of at least SM_53 - auto dev_prop = at::globalContext().getCurrentDeviceProperties(); + auto dev_prop = at::cuda::getCurrentDeviceProperties(); if (dev_prop->major < 5 || (dev_prop->major == 5 && dev_prop->minor < 3)) { std::ostringstream ss; ss << "cuFFT doesn't support signals of half type with compute " diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index affe20d71c791..e3625762d6c25 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -1,7 +1,7 @@ #include "ATen/ATen.h" #include "ATen/TensorUtils.h" #include "ATen/Error.h" - +#include "ATen/cuda/CUDAContext.h" #include "ATen/AccumulateType.h" #include @@ -217,7 +217,7 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice auto grad_weight = at::zeros({num_weights, grad_.size(-1)}, grad_.options()); int64_t stride = grad_weight.stride(0); - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); if (num_indices <= 768 && !scale_grad_by_freq) { auto indices_contig = indices.contiguous(); @@ -332,7 +332,7 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, checkDim("embedding_renorm_", self_arg, 2); checkSameGPU("embedding_renorm", self_arg, indices_arg); - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); auto policy = thrust::cuda::par(allocator).on(stream); diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 9169cb0375c55..4f989e3db5c56 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -1,4 +1,5 @@ #include "ATen/ATen.h" +#include "ATen/cuda/CUDAContext.h" #include "ATen/TensorUtils.h" #include "ATen/NativeFunctions.h" @@ -176,7 +177,7 @@ Tensor embedding_bag_backward_cuda_sum_avg( auto grad_weight = at::zeros({num_weights, grad.size(1)}, grad.type()); - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); ptrdiff_t numel = indices.numel(); int64_t stride = grad_weight.stride(0); @@ -283,7 +284,7 @@ Tensor embedding_bag_backward_cuda_max(const Tensor &grad, int64_t numBags = grad.size(0); - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 block = dim3(32, 8); int grid = 1024; @@ -323,7 +324,7 @@ _embedding_bag_cuda(const Tensor &weight, const Tensor &indices, auto offset2bag = at::zeros({indices.size(0)}, indices.options()); // offset2bag = [0 0 0 0 0] - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto output = at::zeros({offsets.size(0), weight.size(1)}, weight.options()); diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu index c31d5572bf8a4..cdf0f2f931a05 100644 --- a/aten/src/ATen/native/cuda/Gesv.cu +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -1,4 +1,5 @@ #include "ATen/Context.h" +#include "ATen/cuda/CUDAContext.h" #include "ATen/Dispatch.h" #include "ATen/NativeFunctions.h" #include "ATen/cuda/PinnedMemoryAllocator.h" @@ -51,7 +52,7 @@ static magma_queue_t createMagmaQueue(const Tensor& tensor) { magma_queue_t magma_queue; 
magma_queue_create_from_cuda( tensor.get_device(), - context.getCurrentCUDAStream(), + at::cuda::getCurrentCUDAStream(), THCState_getCurrentBlasHandle(context.getTHCState()), THCState_getCurrentSparseHandle(context.getTHCState()), &magma_queue); diff --git a/aten/src/ATen/native/cuda/RoiPooling.cu b/aten/src/ATen/native/cuda/RoiPooling.cu index 63f1f26a979f9..ef2fba982ad10 100644 --- a/aten/src/ATen/native/cuda/RoiPooling.cu +++ b/aten/src/ATen/native/cuda/RoiPooling.cu @@ -1,6 +1,7 @@ #include "ATen/ATen.h" #include "ATen/NativeFunctions.h" #include "ATen/Error.h" +#include "ATen/cuda/CUDAContext.h" #include #include @@ -134,7 +135,7 @@ std::tuple RoiPooling2d_forward_cuda( dim3 block(512); dim3 grid((output.numel() + 512 - 1) / 512); - RoiPooling2d_forward_kernel<<>>( + RoiPooling2d_forward_kernel<<>>( output.numel(), input.data(), rois.data(), static_cast(spatialScale), inputChannels, inputHeight, inputWidth, pooledHeight, pooledWidth, output.data(), argmaxes.data()); AT_CHECK(cudaGetLastError() == cudaSuccess, "RoiPooling2d_forward_kernel failed with error code ", cudaGetLastError()); @@ -201,7 +202,7 @@ Tensor RoiPooling2d_backward_cuda( dim3 block(512); dim3 grid((gradInput.numel() + 512 - 1) / 512); - RoiPooling2d_backward_kernel<<>>( + RoiPooling2d_backward_kernel<<>>( gradOutput.numel(), gradOutput.data(), argmaxes.data(), proposals, static_cast(spatialScale), inputChannels, inputHeight, inputWidth, pooledHeight, pooledWidth, gradInput.data(), rois.data()); diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index 0ee5d18d1e2bd..8599d17611bd7 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -1,4 +1,5 @@ #include "ATen/ATen.h" +#include "ATen/cuda/CUDAContext.h" #include "ATen/TensorUtils.h" #include "ATen/NativeFunctions.h" #include "ATen/WrapDimUtils.h" @@ -124,7 +125,7 @@ void SpatialSoftMax_getLaunchSizes( cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, k, block_threads, smem_size); #endif - max_active_blocks *= at::globalContext().getCurrentDeviceProperties()->multiProcessorCount; + max_active_blocks *= at::cuda::getCurrentDeviceProperties()->multiProcessorCount; grid = SpatialSoftMax_getGridSize(block, max_active_blocks, outer_size, dim_size, inner_size); } @@ -483,7 +484,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_){ int64_t outer_size = 1; int64_t dim_size = input.size(dim); int64_t inner_size = 1; - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); for (int64_t i = 0; i < dim; ++i) outer_size *= input.size(i); for (int64_t i = dim + 1; i < input.dim(); ++i) @@ -540,7 +541,7 @@ Tensor host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t for (int64_t i = dim + 1; i < output.dim(); ++i) inner_size *= output.size(i); // See descriptions of kernels above. 
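// ----------------------------------------------------------------------------
// [Editor's sketch -- not part of the patch] The "<<>>" launch sites in this
// copy of the diff appear to have lost their launch configuration between the
// angle brackets. The intended CUDA form is
//   kernel<<<grid, block, sharedMemBytes, stream>>>(args...);
// with the stream now obtained from at::cuda::getCurrentCUDAStream(). Below is
// a hypothetical, self-contained example of that pattern; the kernel, helper
// name, and block size of 512 are illustrative only.
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/Exceptions.h"

__global__ void scale_kernel_sketch(float* data, float alpha, int64_t n) {
  int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
  if (i < n) data[i] *= alpha;
}

static void launch_scale_sketch(float* data, float alpha, int64_t n) {
  dim3 block(512);
  dim3 grid(static_cast<unsigned int>((n + block.x - 1) / block.x));
  // CUDAStream converts implicitly to cudaStream_t in the launch config.
  scale_kernel_sketch<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
      data, alpha, n);
  AT_CUDA_CHECK(cudaGetLastError());  // surface launch errors via AT_ERROR
}
// ----------------------------------------------------------------------------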
- cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); if (inner_size == 1) { const int ILP = 2; dim3 grid(outer_size); diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index 7266ebdd998d5..c2ad676c56329 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -1,4 +1,5 @@ #include "ATen/ATen.h" +#include "ATen/cuda/CUDAContext.h" #include "ATen/Config.h" #include "ATen/Dispatch.h" #include "ATen/Utils.h" @@ -104,7 +105,7 @@ static void _fft_fill_with_conjugate_symmetry_(Tensor& input, // copy int64_t n = input.numel() / size_last_dim * (size_last_dim - last_dim_start_slice); - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); auto policy = thrust::cuda::par(allocator).on(stream); AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "_fft_fill_with_conjugate_symmetry_", [&] { @@ -180,7 +181,7 @@ static inline Tensor _run_cufft( auto output = input.type().tensor(output_sizes); // set to current stream - CUFFT_CHECK(cufftSetStream(plan, ctx.getCurrentCUDAStream())); + CUFFT_CHECK(cufftSetStream(plan, at::cuda::getCurrentCUDAStream())); auto ws = ctx.getType(at::Backend::CUDA, at::ScalarType::Byte).tensor({ config.workspace_size() }); CUFFT_CHECK(cufftSetWorkArea(plan, ws.data_ptr())); diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu index 46c812cc91617..c55fc0262a178 100644 --- a/aten/src/ATen/native/cuda/SummaryOps.cu +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -1,4 +1,5 @@ #include "ATen/ATen.h" +#include "ATen/cuda/CUDAContext.h" #include "ATen/cuda/CUDAApplyUtils.cuh" namespace at { @@ -116,7 +117,7 @@ __global__ void kernelHistogram1D( <<>>( \ + getCurrentCUDAStream()>>>( \ aInfo, pInfo, bInfo, binsize, totalElements, WEIGHTS_OP); \ AT_ASSERTM(cudaGetLastError() == cudaSuccess, "kernelHistogram1D failed"); @@ -184,8 +185,7 @@ bool CUDA_tensor_histogram( } CUDAHistogramMemoryType memType = CUDAHistogramMemoryType::GLOBAL; - auto maxSharedMem = - at::globalContext().getCurrentDeviceProperties()->sharedMemPerBlock; + auto maxSharedMem = getCurrentDeviceProperties()->sharedMemPerBlock; auto sharedMem = nbins * sizeof(output_t) + 8; // 8 guard bytes auto maxGlobalMem = getFreeGlobalMemory(); auto multiBlockMem = nbins * grid.x * sizeof(output_t) + 8; // 8 guard bytes diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index cc8e78c292dbc..c1c0e943fde7c 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -2,6 +2,7 @@ #include "ATen/cuda/detail/IndexUtils.cuh" #include "ATen/NativeFunctions.h" +#include "ATen/cuda/CUDAContext.h" #include #include @@ -83,7 +84,7 @@ Tensor flip_cuda(const Tensor& self, IntList dims) { int flip_dim = in_tensor_info.collapseDims(dims[0]); out_tensor_info.collapseDims(dims[0]); kernel_pointwise_flip_apply2 - <<>>( + <<>>( in_tensor_info, out_tensor_info, N, flip_dim, total_dims); }); return out_tensor; @@ -113,7 +114,7 @@ Tensor flip_cuda(const Tensor& self, IntList dims) { } AT_DISPATCH_ALL_TYPES_AND_HALF(in_tensor.type(), "flip_cuda", [&] { - flip_cuda_kernel<<>>( + flip_cuda_kernel<<>>( in_tensor.data(), out_tensor.data(), N, flip_dims_t.toType(CUDA(kLong)).data(), flip_dims_size, 
strides_t.toType(CUDA(kLong)).data(), stride_contiguous.toType(CUDA(kLong)).data(), shape_t.toType(CUDA(kLong)).data(), total_dims); }); diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu index 32dc7d35380bd..f2e13b4c708b6 100644 --- a/aten/src/ATen/native/cuda/Unique.cu +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -1,5 +1,5 @@ #include "ATen/ATen.h" - +#include "ATen/cuda/CUDAContext.h" #include #include #include @@ -37,7 +37,7 @@ template const Tensor& self, const bool return_inverse) { - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); auto policy = thrust::cuda::par(allocator).on(stream); diff --git a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp index f73a2ad0dfd90..7f0a0f86524c6 100644 --- a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp +++ b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp @@ -30,6 +30,7 @@ Tensor cudnn_affine_grid_generator_backward( #include #include #include +#include #include diff --git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp index 9b2a25670d4b3..7cd7466a28503 100644 --- a/aten/src/ATen/native/cudnn/BatchNorm.cpp +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -31,6 +31,7 @@ std::tuple cudnn_batch_norm_backward( #include #include #include +#include #include diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp index b3ee01603b24f..a2be3e507cbd0 100644 --- a/aten/src/ATen/native/cudnn/Conv.cpp +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #if !AT_CUDNN_ENABLED() diff --git a/aten/src/ATen/native/cudnn/GridSampler.cpp b/aten/src/ATen/native/cudnn/GridSampler.cpp index c6b7ffc1cb6a0..1ce92cf7e18d2 100644 --- a/aten/src/ATen/native/cudnn/GridSampler.cpp +++ b/aten/src/ATen/native/cudnn/GridSampler.cpp @@ -27,6 +27,7 @@ std::tuple cudnn_grid_sampler_backward( #include #include #include +#include #include diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index aced0a0ff3ab9..63f0d7a29578f 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #if !AT_CUDNN_ENABLED() diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu index 0ed53befb27d0..25d639f8bd828 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu @@ -1,6 +1,7 @@ #include #include #include +#include #include @@ -60,8 +61,8 @@ inline void CUSPARSE_CHECK(cusparseStatus_t status) } inline cusparseHandle_t setCUDASparseStream() { - cusparseHandle_t handle = globalContext().getCurrentCUDASparseHandle(); - cusparseSetStream(handle, globalContext().getCurrentCUDAStream()); + cusparseHandle_t handle = at::cuda::getCurrentCUDASparseHandle(); + cusparseSetStream(handle, at::cuda::getCurrentCUDAStream()); return handle; } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index a12edc9a77423..b4617f23eb6cc 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -33,7 +34,7 @@ SparseTensor 
coalesce_sparse_cuda(const SparseTensor& self) { return self; } - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); auto policy = thrust::cuda::par(allocator).on(stream); // Replace instances with diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 3521fc340ac00..ddeae3bd5707a 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -197,7 +198,7 @@ SparseTensor& hspmm_out_sparse_cuda(SparseTensor& r_, const SparseTensor& sparse _get_sparse_impl(r_)->raw_resize_(1, 1, {m, n}); - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); auto policy = thrust::cuda::par(allocator).on(stream); @@ -282,7 +283,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR dim3 grid; int curDevice = -1; cudaGetDevice(&curDevice); - cudaStream_t stream = globalContext().getCurrentCUDAStreamOnDevice(curDevice); + cudaStream_t stream = at::cuda::getCurrentCUDAStreamOnDevice(curDevice); if (sparse._denseDims() == 0) { AT_CHECK(cuda::getApplyGrid(nnz, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); @@ -482,7 +483,7 @@ SparseTensor& s_mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, co dim3 grid; int curDevice = -1; cudaGetDevice(&curDevice); - cudaStream_t stream = globalContext().getCurrentCUDAStreamOnDevice(curDevice); + cudaStream_t stream = at::cuda::getCurrentCUDAStreamOnDevice(curDevice); AT_CHECK(cuda::getApplyGrid(valueSize, grid, curDevice), "mul: Argument #0: tensor too large or too many dimensions"); LongTensor resultNnz = at::empty({1}, CUDA(kLong)); diff --git a/aten/src/ATen/test/stream_test.cpp b/aten/src/ATen/test/stream_test.cpp index 1f877e3e8b598..ba645de1f17ef 100644 --- a/aten/src/ATen/test/stream_test.cpp +++ b/aten/src/ATen/test/stream_test.cpp @@ -1,7 +1,8 @@ #define CATCH_CONFIG_MAIN #include "catch.hpp" -#include "ATen/ATen.h" +#include "ATen/cuda/CUDAContext.h" +#include "ATen/cuda/CUDAGuard.h" #include "cuda_runtime.h" @@ -18,9 +19,9 @@ TEST_CASE( cudaStream_t cuda_stream; // Tests that copying works as expected and preserves the stream - at::CUDAStream copyStream; + at::cuda::CUDAStream copyStream; { - auto s = at::globalContext().createCUDAStream(); + auto s = at::cuda::createCUDAStream(); device = s.device(); cuda_stream = s.stream(); @@ -36,9 +37,9 @@ TEST_CASE( REQUIRE(copyStream.stream() == cuda_stream); // Tests that moving works as expected and preserves the stream - at::CUDAStream moveStream; + at::cuda::CUDAStream moveStream; { - auto s = at::globalContext().createCUDAStream(); + auto s = at::cuda::createCUDAStream(); device = s.device(); cuda_stream = s.stream(); @@ -54,50 +55,50 @@ TEST_CASE( } TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { - at::CUDAStream myStream = at::globalContext().createCUDAStream(); + at::cuda::CUDAStream myStream = at::cuda::createCUDAStream(); // Sets and gets - at::globalContext().setCurrentCUDAStream(myStream); - at::CUDAStream curStream = at::globalContext().getCurrentCUDAStream(); + at::cuda::setCurrentCUDAStream(myStream); + at::cuda::CUDAStream 
curStream = at::cuda::getCurrentCUDAStream(); REQUIRE(myStream == curStream); // Gets, sets, and gets default stream - at::CUDAStream defaultStream = at::globalContext().getDefaultCUDAStream(); - at::globalContext().setCurrentCUDAStream(defaultStream); - curStream = at::globalContext().getCurrentCUDAStream(); + at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream(); + at::cuda::setCurrentCUDAStream(defaultStream); + curStream = at::cuda::getCurrentCUDAStream(); REQUIRE(defaultStream != myStream); REQUIRE(curStream == defaultStream); } TEST_CASE("Stream API retain/free", "Ensures streams are destroyed properly") { - auto ptr = at::detail::CUDAStream_createAndRetainWithOptions( - at::CUDAStream::DEFAULT_FLAGS, at::CUDAStream::DEFAULT_PRIORITY); + auto ptr = at::cuda::detail::CUDAStream_createAndRetainWithOptions( + at::cuda::CUDAStream::DEFAULT_FLAGS, at::cuda::CUDAStream::DEFAULT_PRIORITY); - at::detail::CUDAStream_free(ptr); + at::cuda::detail::CUDAStream_free(ptr); REQUIRE(ptr == nullptr); } -void thread_fun(at::CUDAStream& cur_thread_stream) { - auto new_stream = at::globalContext().createCUDAStream(); - at::globalContext().setCurrentCUDAStream(new_stream); - cur_thread_stream = at::globalContext().getCurrentCUDAStream(); +void thread_fun(at::cuda::CUDAStream& cur_thread_stream) { + auto new_stream = at::cuda::createCUDAStream(); + at::cuda::setCurrentCUDAStream(new_stream); + cur_thread_stream = at::cuda::getCurrentCUDAStream(); REQUIRE(cur_thread_stream == new_stream); } TEST_CASE( "Multithread Getting and Setting", "Ensures streams are thread local") { - at::CUDAStream s0, s1; + at::cuda::CUDAStream s0, s1; std::thread t0{thread_fun, std::ref(s0)}; std::thread t1{thread_fun, std::ref(s1)}; t0.join(); t1.join(); - at::CUDAStream cur_stream = at::globalContext().getCurrentCUDAStream(); - at::CUDAStream default_stream = at::globalContext().getDefaultCUDAStream(); + at::cuda::CUDAStream cur_stream = at::cuda::getCurrentCUDAStream(); + at::cuda::CUDAStream default_stream = at::cuda::getDefaultCUDAStream(); REQUIRE(cur_stream == default_stream); REQUIRE(cur_stream != s0); @@ -106,94 +107,94 @@ TEST_CASE( } TEST_CASE("CUDAGuard") { - if (at::globalContext().getNumGPUs() < 2) { + if (at::cuda::getNumGPUs() < 2) { return; } // -- begin setup - REQUIRE(at::current_device() == 0); - std::vector streams0 = { - at::globalContext().getDefaultCUDAStream(), - at::globalContext().createCUDAStream()}; + REQUIRE(at::cuda::current_device() == 0); + std::vector streams0 = { + at::cuda::getDefaultCUDAStream(), + at::cuda::createCUDAStream()}; REQUIRE(streams0[0].device() == 0); REQUIRE(streams0[1].device() == 0); - at::globalContext().setCurrentCUDAStreamOnDevice(0, streams0[0]); + at::cuda::setCurrentCUDAStreamOnDevice(0, streams0[0]); - std::vector streams1; + std::vector streams1; { at::DeviceGuard device_guard(1); - streams1.push_back(at::globalContext().getDefaultCUDAStream()); - streams1.push_back(at::globalContext().createCUDAStream()); + streams1.push_back(at::cuda::getDefaultCUDAStream()); + streams1.push_back(at::cuda::createCUDAStream()); } REQUIRE(streams1[0].device() == 1); REQUIRE(streams1[1].device() == 1); - at::globalContext().setCurrentCUDAStreamOnDevice(1, streams1[0]); + at::cuda::setCurrentCUDAStreamOnDevice(1, streams1[0]); - REQUIRE(at::current_device() == 0); + REQUIRE(at::cuda::current_device() == 0); // -- end setup // Test that all original streams are recorded. 
{ - at::CUDAGuard guard; + at::cuda::CUDAGuard guard; REQUIRE(guard.original_streams().empty()); guard.set_stream(streams0[0]); REQUIRE( - guard.original_streams().size() == at::globalContext().getNumGPUs()); + guard.original_streams().size() == at::cuda::getNumGPUs()); REQUIRE(guard.original_streams()[0] == streams0[0]); REQUIRE(guard.original_streams()[1] == streams1[0]); } // Setting a stream changes the current device and the stream on that device { - at::CUDAGuard guard(streams1[1]); + at::cuda::CUDAGuard guard(streams1[1]); REQUIRE(guard.last_device() == 1); - REQUIRE(at::current_device() == 1); - REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[1]); + REQUIRE(at::cuda::current_device() == 1); + REQUIRE(at::cuda::getCurrentCUDAStreamOnDevice(1) == streams1[1]); } // Device and stream are now reset - REQUIRE(at::current_device() == 0); - REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[0]); + REQUIRE(at::cuda::current_device() == 0); + REQUIRE(at::cuda::getCurrentCUDAStreamOnDevice(1) == streams1[0]); // Setting only the device changes only the current device and not the stream { - at::CUDAGuard guard(/*device=*/1); + at::cuda::CUDAGuard guard(/*device=*/1); REQUIRE(guard.last_device() == 1); - REQUIRE(at::current_device() == 1); - REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[0]); + REQUIRE(at::cuda::current_device() == 1); + REQUIRE(at::cuda::getCurrentCUDAStreamOnDevice(1) == streams1[0]); } - REQUIRE(at::current_device() == 0); - REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(0) == streams0[0]); + REQUIRE(at::cuda::current_device() == 0); + REQUIRE(at::cuda::getCurrentCUDAStreamOnDevice(0) == streams0[0]); // Setting the stream first, and then the device, first changes the devices // back, and then resets the stream on the initial device. 
{ - at::CUDAGuard guard(streams0[1]); + at::cuda::CUDAGuard guard(streams0[1]); guard.set_device(1); } - REQUIRE(at::current_device() == 0); - REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(0) == streams0[0]); - REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[0]); + REQUIRE(at::cuda::current_device() == 0); + REQUIRE(at::cuda::getCurrentCUDAStreamOnDevice(0) == streams0[0]); + REQUIRE(at::cuda::getCurrentCUDAStreamOnDevice(1) == streams1[0]); } TEST_CASE("CUDAGuardIsMovable") { - if (at::globalContext().getNumGPUs() < 2) { + if (at::cuda::getNumGPUs() < 2) { return; } - const auto stream = at::globalContext().createCUDAStream(); - const auto device_count = at::globalContext().getNumGPUs(); - at::CUDAGuard first(stream); + const auto stream = at::cuda::createCUDAStream(); + const auto device_count = at::cuda::getNumGPUs(); + at::cuda::CUDAGuard first(stream); first.set_device(1); - at::CUDAGuard second(std::move(first)); + at::cuda::CUDAGuard second(std::move(first)); REQUIRE(second.original_streams().size() == device_count); REQUIRE(second.original_device() == 0); REQUIRE(second.last_device() == 1); - at::CUDAGuard third; + at::cuda::CUDAGuard third; third = std::move(second); REQUIRE(third.original_streams().size() == device_count); REQUIRE(third.original_device() == 0); diff --git a/aten/src/THC/THCCachingAllocator.cpp b/aten/src/THC/THCCachingAllocator.cpp index 7d400a296102f..b63e47d86eac7 100644 --- a/aten/src/THC/THCCachingAllocator.cpp +++ b/aten/src/THC/THCCachingAllocator.cpp @@ -1,7 +1,8 @@ #include "THCCachingAllocator.h" #include -#include +#include +#include #include #include @@ -507,7 +508,7 @@ struct CudaCachingAllocator : public at::Allocator { THCudaCheck(cudaGetDevice(&device)); void* r = nullptr; if (size != 0) { - AT_CUDA_CHECK(caching_allocator.malloc(&r, size, at::globalContext().getCurrentCUDAStreamOnDevice(device))); + AT_CUDA_CHECK(caching_allocator.malloc(&r, size, at::cuda::getCurrentCUDAStreamOnDevice(device))); } return {r, r, &CudaCachingDeleter, at::Device(at::kCUDA, device)}; } diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp index bde2c393a6619..7449fcfe2df92 100644 --- a/aten/src/THC/THCGeneral.cpp +++ b/aten/src/THC/THCGeneral.cpp @@ -6,7 +6,7 @@ #include "THCTensorRandom.h" #include "THCGeneral.hpp" -#include "ATen/CUDAStream.h" +#include "ATen/cuda/CUDAStream.h" #include "THCCachingAllocator.h" #include @@ -426,29 +426,29 @@ cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int } THCStream* THCState_getStreamOnDevice(THCState* state, int device) { - return at::detail::CUDAStream_getCurrentStreamOnDeviceUnsafe(device); + return at::cuda::detail::CUDAStream_getCurrentStreamOnDeviceUnsafe(device); } void THCState_setStreamOnDevice(THCState *state, int device, THCStream *stream) { - at::detail::CUDAStream_setStreamOnDevice(device, stream); + at::cuda::detail::CUDAStream_setStreamOnDevice(device, stream); } cudaStream_t THCState_getCurrentStreamOnDevice(THCState *state, int device) { - return at::detail::CUDAStream_stream( - at::detail::CUDAStream_getCurrentStreamOnDeviceUnsafe(device)); + return at::cuda::detail::CUDAStream_stream( + at::cuda::detail::CUDAStream_getCurrentStreamOnDeviceUnsafe(device)); } cudaStream_t THCState_getCurrentStream(THCState *state) { - return at::detail::CUDAStream_stream( - at::detail::CUDAStream_getCurrentStreamUnsafe()); + return at::cuda::detail::CUDAStream_stream( + at::cuda::detail::CUDAStream_getCurrentStreamUnsafe()); } THCStream* 
THCState_getStream(THCState *state) { - return at::detail::CUDAStream_getCurrentStreamUnsafe(); + return at::cuda::detail::CUDAStream_getCurrentStreamUnsafe(); } void THCState_setStream(THCState *state, THCStream *stream) { - at::detail::CUDAStream_setStream(stream); + at::cuda::detail::CUDAStream_setStream(stream); } cublasHandle_t THCState_getCurrentBlasHandle(THCState *state) diff --git a/aten/src/THC/THCStream.cpp b/aten/src/THC/THCStream.cpp index 01fc9b00274bc..b997c380c6606 100644 --- a/aten/src/THC/THCStream.cpp +++ b/aten/src/THC/THCStream.cpp @@ -1,32 +1,32 @@ #include "THCStream.h" -#include "ATen/CUDAStream.h" +#include "ATen/cuda/CUDAStream.h" THC_API THCStream* THCStream_defaultStream(int device) { - return at::detail::CUDAStream_getDefaultStreamOnDevice(device); + return at::cuda::detail::CUDAStream_getDefaultStreamOnDevice(device); } THC_API THCStream* THCStream_new(int flags) { - return THCStream_newWithPriority(flags, at::CUDAStream::DEFAULT_PRIORITY); + return THCStream_newWithPriority(flags, at::cuda::CUDAStream::DEFAULT_PRIORITY); } THC_API THCStream* THCStream_newWithPriority(int flags, int priority) { - return at::detail::CUDAStream_createAndRetainWithOptions(flags, priority); + return at::cuda::detail::CUDAStream_createAndRetainWithOptions(flags, priority); } THC_API cudaStream_t THCStream_stream(THCStream* stream) { - return at::detail::CUDAStream_stream(stream); + return at::cuda::detail::CUDAStream_stream(stream); } THC_API int THCStream_device(THCStream* stream) { - return at::detail::CUDAStream_device(stream); + return at::cuda::detail::CUDAStream_device(stream); } THC_API void THCStream_retain(THCStream* stream) { - at::detail::CUDAStream_retain(stream); + at::cuda::detail::CUDAStream_retain(stream); } THC_API void THCStream_free(THCStream* stream) { - at::detail::CUDAStream_free(stream); + at::cuda::detail::CUDAStream_free(stream); } diff --git a/test/cpp_extensions/cudnn_extension.cpp b/test/cpp_extensions/cudnn_extension.cpp index 0a7bc0f9d8f8a..dbb662425922d 100644 --- a/test/cpp_extensions/cudnn_extension.cpp +++ b/test/cpp_extensions/cudnn_extension.cpp @@ -13,7 +13,7 @@ #include #include // for TensorDescriptor -#include // for CUDNN_CHECK +#include // for CUDNN_CHECK #include // for getCudnnHandle // Name of function in python module and name used for error messages by diff --git a/torch/csrc/autograd/functions/comm.cpp b/torch/csrc/autograd/functions/comm.cpp index 00e140e81b083..052aba90dcc4f 100644 --- a/torch/csrc/autograd/functions/comm.cpp +++ b/torch/csrc/autograd/functions/comm.cpp @@ -19,7 +19,7 @@ Scatter::Scatter( std::vector devices, const at::optional>& chunk_sizes, int64_t dim, - const at::optional>& streams, + const at::optional>& streams, bool unsqueeze_scalars) : devices_(std::move(devices)), chunk_sizes_(chunk_sizes), diff --git a/torch/csrc/autograd/functions/comm.h b/torch/csrc/autograd/functions/comm.h index 7bbd24a169dcb..a7c24748ca01f 100644 --- a/torch/csrc/autograd/functions/comm.h +++ b/torch/csrc/autograd/functions/comm.h @@ -16,7 +16,7 @@ struct Scatter : public Function { std::vector devices, const at::optional>& chunk_sizes = at::nullopt, int64_t dim = 0, - const at::optional>& streams = at::nullopt, + const at::optional>& streams = at::nullopt, bool unsqueeze_scalars = false); variable_list apply(variable_list&& inputs) override; @@ -24,7 +24,7 @@ struct Scatter : public Function { std::vector devices_; at::optional> chunk_sizes_; int64_t dim_; - at::optional> streams_; + at::optional> streams_; bool unsqueeze_scalars_; 
}; diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 3df34c728844b..c3abdfc3b8f2b 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -17,6 +17,7 @@ #include "ATen/ATen.h" #include "torch/csrc/cuda/cuda_check.h" #ifdef USE_CUDA +#include "ATen/cuda/CUDAContext.h" #include #endif @@ -51,7 +52,7 @@ struct Event { if(record_cuda) { TORCH_CUDA_CHECK(cudaGetDevice(&device_)); TORCH_CUDA_CHECK(cudaEventCreate(&event)); - auto stream = at::globalContext().getCurrentCUDAStream(); + auto stream = at::cuda::getCurrentCUDAStream(); cpu_ns_ = getTime(); TORCH_CUDA_CHECK(cudaEventRecord(event, stream)); } else { diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index c4372059f3ae7..30649f7f04bcc 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -7,6 +7,7 @@ #include #include #include +#include "ATen/cuda/CUDAContext.h" #include #ifdef USE_NCCL #include @@ -309,7 +310,7 @@ static void bindCudaDeviceProperties(PyObject* module) { return stream.str(); }); m.def("_get_device_properties", [](int device) -> cudaDeviceProp * { - return at::globalContext().getDeviceProperties(device); + return at::cuda::getDeviceProperties(device); }, py::return_value_policy::reference); } diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index 52a27ea0a4473..9c3b8a305a5ae 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -8,6 +8,8 @@ #endif #include +#include +#include #include #include @@ -118,7 +120,7 @@ std::vector scatter( at::IntList devices, const at::optional>& chunk_sizes, int64_t dim, - const at::optional>& streams) { + const at::optional>& streams) { std::vector chunks; if (chunk_sizes) { const int64_t chunk_size_sum = @@ -140,7 +142,7 @@ std::vector scatter( } else { chunks = tensor.chunk(/*chunks=*/devices.size(), /*dim=*/dim); } - at::CUDAGuard cuda_guard; + at::cuda::CUDAGuard cuda_guard; for (size_t chunk = 0; chunk < chunks.size(); ++chunk) { const auto device_index = static_cast(devices[chunk]); if (streams) { @@ -150,7 +152,7 @@ std::vector scatter( chunk, " (was ", (*streams)[chunk].device(), ") ", "to match the device supplied at that index ", "(expected ", device_index, ")"); - cuda_guard.set_stream((*streams)[chunk]); + cuda_guard.set_stream(at::cuda::CUDAStream((*streams)[chunk])); } chunks[chunk] = chunks[chunk].contiguous().to( {at::kCUDA, device_index}, /*non_blocking=*/true); diff --git a/torch/csrc/cuda/comm.h b/torch/csrc/cuda/comm.h index c7009a56e9712..caf7811afd1ae 100644 --- a/torch/csrc/cuda/comm.h +++ b/torch/csrc/cuda/comm.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -19,7 +20,7 @@ std::vector scatter( at::IntList devices, const at::optional>& chunk_sizes = at::nullopt, int64_t dim = 0, - const at::optional>& streams = at::nullopt); + const at::optional>& streams = at::nullopt); at::Tensor gather( at::TensorList tensors, diff --git a/torch/csrc/cuda/python_comm.cpp b/torch/csrc/cuda/python_comm.cpp index 0ec849a749854..c9ed0bb3be434 100644 --- a/torch/csrc/cuda/python_comm.cpp +++ b/torch/csrc/cuda/python_comm.cpp @@ -28,14 +28,14 @@ void initCommMethods(PyObject *module) { at::optional> chunk_sizes, int64_t dim, at::optional py_streams) { - at::optional> streams; + at::optional> streams; if (py_streams) { py::handle handle = *py_streams; streams = fmap( THPUtils_PySequence_to_THCStreamList(handle.ptr()), [](THCStream* stream) { - at::detail::CUDAStream_retain(stream); - return at::CUDAStream(stream); + 
at::cuda::detail::CUDAStream_retain(stream); + return at::cuda::CUDAStream(stream); }); } // Note: We're holding the GIL up to here. diff --git a/torch/csrc/jit/fusion_compiler.cpp b/torch/csrc/jit/fusion_compiler.cpp index 889a8c3fcfacc..3e04369987eaf 100644 --- a/torch/csrc/jit/fusion_compiler.cpp +++ b/torch/csrc/jit/fusion_compiler.cpp @@ -9,6 +9,7 @@ #include "ATen/ATen.h" #ifdef USE_CUDA +#include "ATen/cuda/CUDAContext.h" #include "THC/THC.h" #include "torch/csrc/cuda/cuda_check.h" #include @@ -651,7 +652,7 @@ struct CUDAFusionFunction : public CompiledFusionFunction { *(THCCachingAllocator_getCudaFreeMutex())); cudaFree(0); } - CUstream stream = at::globalContext().getCurrentCUDAStream(); + CUstream stream = at::cuda::getCurrentCUDAStream(); TORCH_CU_CHECK(cuLaunchKernel( function, numBlocks, 1, 1, From 01581037dca1176508f4544b377a4c0f2d0dd165 Mon Sep 17 00:00:00 2001 From: Kittipat Virochsiri Date: Fri, 20 Jul 2018 14:49:16 -0700 Subject: [PATCH 11/23] Add workspace.RunPlanInBackground (#9637) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9637 Adding a method to run plan in background. The intended use is to run BlueWhale's data reading & preprocessing net in background while the GPU is training. Reviewed By: MisterTea Differential Revision: D8906439 fbshipit-source-id: b1c73ca7327e2d87a8f873924e05ab3d161a3f1e --- caffe2/python/pybind_state.cc | 46 +++++++++++++++++++++++++++++++++ caffe2/python/workspace.py | 8 ++++++ caffe2/python/workspace_test.py | 9 +++++++ 3 files changed, 63 insertions(+) diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 7c421ff2a870e..dca1cdf0fbf2f 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -1,5 +1,8 @@ #include "pybind_state.h" +#include +#include + #include #include @@ -245,6 +248,33 @@ OPERATOR_SCHEMA(PythonDLPackGradient).AllowInplace([](int, int) { }); REGISTER_GRADIENT(PythonDLPack, GetPythonGradient); +class BackgroundPlan { + public: + BackgroundPlan(Workspace* ws, PlanDef def) : ws_(ws), def_(def) {} + + void run() { + fut_ = + std::async(std::launch::async, [this]() { return ws_->RunPlan(def_); }); + } + + bool isDone() { + CAFFE_ENFORCE(fut_.valid()); + auto status = fut_.wait_for(std::chrono::milliseconds(0)); + return status == std::future_status::ready; + } + + bool isSucceeded() { + CAFFE_ENFORCE(isDone()); + return fut_.get(); + } + + private: + Workspace* ws_; + PlanDef def_; + + std::future fut_; +}; + void addObjectMethods(py::module& m) { py::class_(m, "Net").def("run", [](NetBase* net) { py::gil_scoped_release g; @@ -505,6 +535,11 @@ void addObjectMethods(py::module& m) { return py::cast(ws->second.get(), py::return_value_policy::reference); }); + py::class_>( + m, "BackgroundPlan") + .def("is_done", &BackgroundPlan::isDone) + .def("is_succeeded", &BackgroundPlan::isSucceeded); + // Gradients py::class_(m, "GradientWrapper") .def(py::init<>()) @@ -1183,6 +1218,17 @@ void addGlobalMethods(py::module& m) { CAFFE_ENFORCE(gWorkspace->RunPlan(def)); return true; }); + m.def("run_plan_in_background", [](const py::bytes& plan_def) { + CAFFE_ENFORCE(gWorkspace); + PlanDef def; + CAFFE_ENFORCE( + ParseProtoFromLargeString(plan_def.cast(), &def)); + py::gil_scoped_release g; + + auto background_plan = std::make_shared(gWorkspace, def); + background_plan->run(); + return background_plan; + }); m.def( "apply_transform", [](const string& transform_key, const py::bytes& net_def) { diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index 
1c618ac9efabd..8c3c1729db232 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -228,6 +228,14 @@ def RunPlan(plan_or_step): return C.run_plan(StringifyProto(plan_or_step)) +def RunPlanInBackground(plan_or_step): + # TODO(jiayq): refactor core.py/workspace.py to avoid circular deps + import caffe2.python.core as core + if isinstance(plan_or_step, core.ExecutionStep): + plan_or_step = core.Plan(plan_or_step) + return C.run_plan_in_background(StringifyProto(plan_or_step)) + + def InferShapesAndTypes(nets, blob_dimensions=None, nets_proto=False): """Infers the shapes and types for the specified nets. diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 78468ec8548af..5da37c7f22efc 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -83,6 +83,15 @@ def testRunPlan(self): workspace.RunPlan(plan.Proto().SerializeToString()), True) self.assertEqual(workspace.HasBlob("testblob"), True) + def testRunPlanInBackground(self): + plan = core.Plan("test-plan") + plan.AddStep(core.ExecutionStep("test-step", self.net)) + background_plan = workspace.RunPlanInBackground(plan) + while not background_plan.is_done(): + pass + self.assertEqual(background_plan.is_succeeded(), True) + self.assertEqual(workspace.HasBlob("testblob"), True) + def testConstructPlanFromSteps(self): step = core.ExecutionStep("test-step-as-plan", self.net) self.assertEqual(workspace.RunPlan(step), True) From 45e5c17ecfbe00708e2c61488d688b3d651ab8ca Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 20 Jul 2018 15:08:02 -0700 Subject: [PATCH 12/23] ONNXIFI transform (#9569) Summary: Cut-off runnable subgraph and off-load to ONNXIFI backend Pull Request resolved: https://github.com/pytorch/pytorch/pull/9569 Reviewed By: Maratyszcza Differential Revision: D8930408 Pulled By: yinghai fbshipit-source-id: 2b494f7f8dc10c00e58cf0fed5c4a9434be6155b --- caffe2/operators/onnxifi_op.cc | 4 + caffe2/operators/onnxifi_op.h | 5 - caffe2/opt/onnxifi_transformer.cc | 375 +++++++++++++++++++++++++++++ caffe2/opt/onnxifi_transformer.h | 63 +++++ caffe2/python/onnx/onnxifi.py | 61 +++++ caffe2/python/onnx/test_onnxifi.py | 156 ++++++++++++ caffe2/python/pybind_state.cc | 22 ++ caffe2/python/trt/transform.py | 13 +- 8 files changed, 689 insertions(+), 10 deletions(-) create mode 100644 caffe2/opt/onnxifi_transformer.cc create mode 100644 caffe2/opt/onnxifi_transformer.h create mode 100644 caffe2/python/onnx/onnxifi.py diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc index f4d0c5eafbd69..36a4ae90c3271 100644 --- a/caffe2/operators/onnxifi_op.cc +++ b/caffe2/operators/onnxifi_op.cc @@ -91,6 +91,10 @@ bool OnnxifiOp::RunOnDevice() { tensor_descriptor.dataType = ONNXIFI_DATATYPE_FLOAT32; tensor_descriptor.memoryType = ONNXIFI_MEMORY_TYPE_CPU; tensor_descriptor.dimensions = tensor_dims.size(); + CAFFE_ENFORCE( + tensor_descriptor.dimensions != 0, + tensor_descriptor.name, + " has 0 dim"); output_shapes_.emplace_back(tensor_dims.cbegin(), tensor_dims.cend()); tensor_descriptor.shape = output_shapes_.back().data(); tensor_descriptor.buffer = diff --git a/caffe2/operators/onnxifi_op.h b/caffe2/operators/onnxifi_op.h index 3c5cd2dbc36e6..d2a72d006018a 100644 --- a/caffe2/operators/onnxifi_op.h +++ b/caffe2/operators/onnxifi_op.h @@ -73,11 +73,6 @@ class OnnxifiOp final : public Operator { auto weight_descs = BuildInitializationList( &mapped_ws, &initializer_set, &weight_names, &weight_shapes); - ::ONNX_NAMESPACE::ModelProto onnx_model; - 
ParseProtoFromLargeString(onnx_model_str, &onnx_model); - onnx_model_str.clear(); - onnx_model.SerializeToString(&onnx_model_str); - // Build the Onnxifi engine // TODO: In spec, backends are hot-pluggable, so two calls to // onnxGetBackendIDs may result in different number of backend. And we diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc new file mode 100644 index 0000000000000..c1c6c310786c5 --- /dev/null +++ b/caffe2/opt/onnxifi_transformer.cc @@ -0,0 +1,375 @@ +#include "caffe2/opt/onnxifi_transformer.h" + +#include +#include + +#include + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" +#include "caffe2/onnx/onnx_exporter.h" +#include "caffe2/opt/backend_cutting.h" + +namespace caffe2 { + +namespace { + +// TODO(yinghai): Remove the awkward conversion between unordered_map and map +std::unordered_map InferShapes( + Workspace* ws, + NetDef* pred_net, + CaffeMap* shape_hints_ordered) { + // Populate shapes from workplace + const std::vector& ws_blobs = ws->Blobs(); + for (const auto& s : ws_blobs) { + auto shape = GetTensorShapeOfBlob(ws->GetBlob(s)); + if (!shape.unknown_shape()) { + shape_hints_ordered->emplace(s, std::move(shape)); + } + } + + std::vector nets; + nets.emplace_back(pred_net); + InferBlobShapesAndTypes(*shape_hints_ordered, nets); + std::unordered_map shape_hints; + for (const auto& kv : *shape_hints_ordered) { + shape_hints.emplace(kv.first, kv.second); + } + + return shape_hints; +} + +void DumpModel( + const ::ONNX_NAMESPACE::ModelProto& model, + const std::string& fname) { + std::ofstream ff(fname); + std::string body; + ::google::protobuf::TextFormat::PrintToString(model.graph(), &body); + ff << body << std::endl; + ff.close(); +} + +std::vector<::ONNX_NAMESPACE::ValueInfoProto> ConvertToValueInfo( + const std::vector& names, + const std::unordered_map& shape_hints) { + std::vector<::ONNX_NAMESPACE::ValueInfoProto> r; + for (const auto& s : names) { + r.emplace_back(); + auto& value_info = r.back(); + value_info.set_name(s); + const auto it = shape_hints.find(s); + if (it == shape_hints.end()) { + LOG(WARNING) << "Cannot get shape of " << s; + } else { + auto* tensor_type = value_info.mutable_type()->mutable_tensor_type(); + tensor_type->set_elem_type( + ::ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT); + auto* shape = tensor_type->mutable_shape(); + for (int i = 0; i < it->second.dims().size(); ++i) { + shape->add_dim()->set_dim_value(it->second.dims(i)); + } + } + } + return r; +} + +void FillModelInfo(::ONNX_NAMESPACE::ModelProto* model) { + model->set_ir_version(::ONNX_NAMESPACE::Version::IR_VERSION); + model->set_producer_name("caffe2"); + auto* opset_id = model->add_opset_import(); + opset_id->set_domain(""); + opset_id->set_version(3); +} +} // namespace + +OnnxifiTransformer::OnnxifiTransformer(bool debug) : debug_(debug) { + lib_ = onnx::initOnnxifiLibrary(); + CAFFE_ENFORCE(lib_, "Cannot initialize ONNXIFI library"); + CAFFE_ENFORCE_EQ( + lib_->onnxGetBackendIDs(nullptr, &num_backends_), + ONNXIFI_STATUS_FALLBACK); + CAFFE_ENFORCE_GT( + num_backends_, 0, "At least 1 onnxifi backend should be available"); + backend_ids_.resize(num_backends_); + CAFFE_ENFORCE_EQ( + lib_->onnxGetBackendIDs(backend_ids_.data(), &num_backends_), + ONNXIFI_STATUS_SUCCESS); +} + +OperatorDef OnnxifiTransformer::BuildOnnxifiOp( + const std::string& onnx_model_str, + const std::unordered_map>& output_size_hints, + const std::unordered_set& 
initialization_list, + const caffe2::NetDef& net) { + OperatorDef op; + op.set_type("Onnxifi"); + auto* onnx_model_arg = op.add_arg(); + onnx_model_arg->set_name("onnx_model"); + onnx_model_arg->set_s(onnx_model_str); + + // Add the names of the initializer blobs that we want to fetch from the + // workspace later + auto* initializers_arg = op.add_arg(); + initializers_arg->set_name("initializers"); + for (const auto& s : initialization_list) { + initializers_arg->add_strings(s); + initializers_arg->add_strings(input_mapping_.at(s)); + } + + // Add the input/output + for (const auto& input : net.external_input()) { + if (!initialization_list.count(input)) { + op.add_input(input); + } + } + for (const auto& output : net.external_output()) { + op.add_output(output); + } + + // Add output size hints + for (int i = 0; i < op.output_size(); ++i) { + const auto& o = op.output(i); + const auto it = output_size_hints.find(o); + if (it != output_size_hints.end()) { + const auto& dims = it->second; + auto* output_size_hint_arg = op.add_arg(); + output_size_hint_arg->set_name(MakeString("output_size_hint_", i)); + for (const auto& d : dims) { + output_size_hint_arg->add_ints(d); + } + + VLOG(2) << "Adding output hint: " << o; + } + } + return op; +} + +NetDef OnnxifiTransformer::SubnetToOnnxifiOp( + const caffe2::NetDef& net, + Workspace* ws, + onnx::OnnxExporter* exporter, + std::unordered_map* shape_hints) { + ::ONNX_NAMESPACE::ModelProto onnx_model; + FillModelInfo(&onnx_model); + + // Convert c2 ops to onnx ops, add const weights if there are any + DeviceOption option; + CPUContext context(option); + context.SwitchToDevice(); + for (const auto& op : net.op()) { + const auto results = exporter->Caffe2OpToOnnxNodes(op, *shape_hints); + for (const auto& n : results.first) { + onnx_model.mutable_graph()->add_node()->CopyFrom(n); + } + for (const auto& t : results.second) { + VLOG(2) << "Adding extra init tensor: " << t.name(); + CAFFE_ENFORCE_EQ( + t.data_type(), + ::ONNX_NAMESPACE::TensorProto::FLOAT, + "Only supports conversion of float type for now"); + TensorShape shape; + shape.mutable_dims()->CopyFrom(t.dims()); + shape_hints->emplace(t.name(), std::move(shape)); + + // Feed into workspace as CPU Tensors + auto* blob = ws->CreateBlob(t.name()); + auto* cpu_tensor = blob->GetMutable(); + std::vector dims; + std::copy(t.dims().begin(), t.dims().end(), dims.begin()); + cpu_tensor->Resize(dims); + context.template CopyBytes( + cpu_tensor->size() * sizeof(float), + static_cast(t.raw_data().data()), + cpu_tensor->raw_mutable_data(TypeMeta::Make())); + context.FinishDeviceComputation(); + + // Add mappings + CAFFE_ENFORCE( + input_mapping_.emplace(t.name(), t.name()).second, + MakeString("Tensor ", t.name(), " already exists in the workspace")); + } + } + + // Convert outputs and compute output shape hints + std::vector io_names; + for (const auto& output : net.external_output()) { + io_names.emplace_back(output); + } + auto io_vec = ConvertToValueInfo(io_names, *shape_hints); + std::unordered_map> output_shape_hints; + for (const auto& i : io_vec) { + onnx_model.mutable_graph()->add_output()->CopyFrom(i); + auto ret = output_shape_hints.emplace(i.name(), std::vector()); + auto& vec = ret.first->second; + const auto it = shape_hints->find(i.name()); + CAFFE_ENFORCE( + it != shape_hints->end(), + "Cannot find shape info for output ", + i.name()); + const auto& shape = it->second; + for (int k = 0; k < shape.dims().size(); ++k) { + vec.push_back(shape.dims(k)); + } + } + + // Convert inputs and figure 
out weights + std::unordered_set weights; + const std::vector& ws_blobs = ws->Blobs(); + for (const auto& s : ws_blobs) { + VLOG(2) << "Add weights: " << s; + weights.emplace(s); + } + + std::unordered_set total_inputs; + std::unordered_set initialization_list; + std::vector total_inputs_vec; + + // Extra intermediate weights created during conversion + for (const auto& extra_weight : onnx_model.graph().initializer()) { + if (total_inputs.emplace(extra_weight.name()).second) { + total_inputs_vec.emplace_back(extra_weight.name()); + } + } + // Boundary inputs, should not be weights + std::unordered_set boundary_inputs; + for (const auto& i : net.external_input()) { + boundary_inputs.emplace(i); + } + + for (const auto& op : net.op()) { + for (const auto& input : op.input()) { + if (total_inputs.emplace(input).second && weights.count(input)) { + // We add weights as inputs too + total_inputs_vec.emplace_back(input); + initialization_list.emplace(input); + VLOG(2) << "Add input weights: " << input; + } else if (boundary_inputs.count(input)) { + VLOG(2) << "Adding boundary input: " << input; + total_inputs_vec.emplace_back(input); + } + } + } + io_vec = ConvertToValueInfo(total_inputs_vec, *shape_hints); + for (const auto& i : io_vec) { + onnx_model.mutable_graph()->add_input()->CopyFrom(i); + } + + // Debugging stuff + if (debug_) { + DumpModel(onnx_model, "debug.onnxtxt"); + } + + // Onnx model is ready. Build ONNXIFI Op + std::string model_str; + onnx_model.SerializeToString(&model_str); + NetDef net_opt; + auto* op = net_opt.add_op(); + *op = BuildOnnxifiOp(model_str, output_shape_hints, initialization_list, net); + for (const auto& i : op->input()) { + net_opt.add_external_input(i); + } + for (const auto& i : op->output()) { + net_opt.add_external_output(i); + } + + return net_opt; +} + +CaffeMap OnnxifiTransformer::SsaRewriteAndMapNames( + Workspace* ws, + NetDef* pred_net, + const std::unordered_map& input_shape_hints) { + input_mapping_ = onnx::SsaRewrite(nullptr, pred_net); + std::unordered_map input_reverse_mapping; + std::vector external_inputs; + for (const auto kv : input_mapping_) { + input_reverse_mapping.emplace(kv.second, kv.first); + if (!ws->HasBlob(kv.second)) { + external_inputs.emplace_back(kv.first); + } + } + for (const auto& i : external_inputs) { + input_mapping_.erase(i); + } + CaffeMap shape_hints_ordered; + for (const auto& kv : input_shape_hints) { + const auto it = input_reverse_mapping.find(kv.first); + if (it != input_reverse_mapping.end()) { + LOG(INFO) << "Adding input hint: " << it->second; + shape_hints_ordered.emplace(it->second, kv.second); + } else { + shape_hints_ordered.emplace(kv.first, kv.second); + } + } + return shape_hints_ordered; +} + +// Cutting off the runnable part and replace with ONNXIFI ops. 
Asssume the nets +// were topologically sorted +void OnnxifiTransformer::Transform( + Workspace* ws, + NetDef* pred_net, + const std::unordered_map& input_shape_hints) { + CAFFE_ENFORCE(ws); + auto shape_hints_ordered = + SsaRewriteAndMapNames(ws, pred_net, input_shape_hints); + Workspace mapped_ws(ws, input_mapping_); + auto shape_hints = InferShapes(&mapped_ws, pred_net, &shape_hints_ordered); + + CAFFE_ENFORCE(pred_net, "Predict net cannot be nullptr"); + onnx::OnnxExporter exporter(nullptr, true); + + // function to tell whether the ONNXIFI backend supports a given C2 op or not + // TODO: choose backend id + auto supports = + [&exporter, &shape_hints, backend = lib_, backend_id = backend_ids_[0]]( + const caffe2::OperatorDef& op) { + const OpSchema* schema = OpSchemaRegistry::Schema(op.type()); + // NB: this might not be a hard constraint as we can just export C2 + // domain specific ops to ONNX + if (!schema || schema->onnx_schema().empty()) { + LOG(INFO) << "Cannot export c2 op " << op.type() + << " to onnx as there is no corresponding ONNX schema."; + return false; + } + + ::ONNX_NAMESPACE::ModelProto onnx_model; + FillModelInfo(&onnx_model); + auto results = exporter.Caffe2OpToOnnxNodes(op, shape_hints); + for (const auto& n : results.first) { + onnx_model.mutable_graph()->add_node()->CopyFrom(n); + } + std::string onnx_model_str; + onnx_model.SerializeToString(&onnx_model_str); + auto ret = backend->onnxGetBackendCompatibility( + backend_id, onnx_model_str.size(), onnx_model_str.c_str()); + if (ret != ONNXIFI_STATUS_SUCCESS) { + LOG(INFO) << "Don't support onnx for " << op.type() << " c2 op (" + << ret << ")"; + return false; + } else { + return true; + } + }; + + // function to convert runnbale subgraph into a trt op. Note that to keep the + // interface clean, we do the double conversion from C2 op to Onnx ops here + // but it should be OK as the cost is really small. 
We also need to keep the + // same exporter throughout the process to avoid duplicated dummy name + // generation + onnx::OnnxExporter exporter2(nullptr, true); + auto trt_converter = [this, &mapped_ws, &shape_hints, &exporter2]( + const caffe2::NetDef& net) mutable { + return SubnetToOnnxifiOp(net, &mapped_ws, &exporter2, &shape_hints); + }; + + NetDef net_opt = opt::OptimizeForBackend(*pred_net, supports, trt_converter); + + // Need to figure out a proper place to handle device option + net_opt.mutable_device_option()->CopyFrom(pred_net->device_option()); + pred_net->Swap(&net_opt); +} + +} // namespace caffe2 diff --git a/caffe2/opt/onnxifi_transformer.h b/caffe2/opt/onnxifi_transformer.h new file mode 100644 index 0000000000000..197a026e3a8b9 --- /dev/null +++ b/caffe2/opt/onnxifi_transformer.h @@ -0,0 +1,63 @@ +#pragma once + +#include +#include +#include +#include + +#include "onnx/onnx_pb.h" + +#include "caffe2/core/common.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/workspace.h" +#include "caffe2/onnx/onnxifi_init.h" +#include "caffe2/proto/caffe2.pb.h" + +namespace caffe2 { +namespace onnx { +class OnnxExporter; +} + +class OnnxifiTransformer { + public: + explicit OnnxifiTransformer(bool debug); + + void Transform( + Workspace* ws, + NetDef* pred_net, + const std::unordered_map& shape_hints); + + private: + caffe2::NetDef SubnetToOnnxifiOp( + const caffe2::NetDef& net, + Workspace* ws, + onnx::OnnxExporter* exporter, + std::unordered_map* shape_hints); + + OperatorDef BuildOnnxifiOp( + const std::string& onnx_model_str, + const std::unordered_map>& + output_size_hints, + const std::unordered_set& initialization_list, + const caffe2::NetDef& net); + + CaffeMap SsaRewriteAndMapNames( + Workspace* ws, + NetDef* pred_net, + const std::unordered_map& input_shape_hints); + + // Dump onnx model for debugging + bool debug_{false}; + + // Pointer to loaded onnxifi library + onnxifi_library* lib_{nullptr}; + + // Number of backends + size_t num_backends_{0}; + + // Backned IDs + std::vector backend_ids_; + // Input mapping + std::unordered_map input_mapping_; +}; +} // namespace caffe2 diff --git a/caffe2/python/onnx/onnxifi.py b/caffe2/python/onnx/onnxifi.py new file mode 100644 index 0000000000000..8a132698e76de --- /dev/null +++ b/caffe2/python/onnx/onnxifi.py @@ -0,0 +1,61 @@ +## @package onnx +#Module caffe2.python.onnx.onnxifi + +""" +ONNXIFI a Caffe2 net +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from caffe2.proto import caffe2_pb2 +from caffe2.python import core, workspace +import caffe2.python._import_c_extension as C +import numpy as np + + +def _infer_shapes(pred_net, inputs): + workspace.RunNetOnce(pred_net) + hints = {} + for op in pred_net.op: + for o in op.output: + if o not in hints: + blob = workspace.FetchBlob(o) + if hasattr(blob, 'shape'): + hints[o] = blob.shape + for i in op.input: + if i not in hints: + blob = workspace.FetchBlob(i) + if hasattr(blob, 'shape'): + hints[i] = blob.shape + + return hints + + +def onnxifi_caffe2_net( + pred_net, + input_shapes, + populate_shapes=False, + debug=False): + """ + Transfrom the caffe2_net by collapsing ONNXIFI-runnable nodes into Onnxifi c2 ops + """ + # Hacky way to infer shapes as not all our operators have shape inference function. 
+ # Normally this is not needed + shape_hints = {} + if populate_shapes: + input_data = {} + for k, v in input_shapes.items(): + input_data[k] = np.random.randn(*v).astype(np.float32) + shape_hints = _infer_shapes(pred_net, input_data) + + for k, v in input_shapes.items(): + shape_hints[k] = v + pred_net_str = C.onnxifi(pred_net.SerializeToString(), + shape_hints, + debug) + pred_net_cut = caffe2_pb2.NetDef() + pred_net_cut.ParseFromString(pred_net_str) + return pred_net_cut diff --git a/caffe2/python/onnx/test_onnxifi.py b/caffe2/python/onnx/test_onnxifi.py index 002287cf3b839..596dcde044d5f 100644 --- a/caffe2/python/onnx/test_onnxifi.py +++ b/caffe2/python/onnx/test_onnxifi.py @@ -3,16 +3,36 @@ from __future__ import print_function from __future__ import unicode_literals +import json import numpy as np +import os +import time import unittest import onnx import onnx.defs +from onnx.backend.base import namedtupledict from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace +from caffe2.python.models.download import downloadFromURLToFile, getURLFromName, deleteDirectory +from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net from caffe2.python.onnx.tests.test_utils import TestCase + +def _print_net(net): + for i in net.external_input: + print("Input: {}".format(i)) + for i in net.external_output: + print("Output: {}".format(i)) + for op in net.op: + print("Op {}".format(op.type)) + for x in op.input: + print(" input: {}".format(x)) + for y in op.output: + print(" output: {}".format(y)) + + class OnnxifiTest(TestCase): @unittest.skip("Need ONNXIFI backend support") def test_relu_graph(self): @@ -80,3 +100,139 @@ def test_conv_graph(self): np.testing.assert_almost_equal(Y, Y_without_padding) +class OnnxifiTransformTest(TestCase): + def _model_dir(self, model): + caffe2_home = os.path.expanduser(os.getenv('CAFFE2_HOME', '~/.caffe2')) + models_dir = os.getenv('CAFFE2_MODELS', os.path.join(caffe2_home, 'models')) + return os.path.join(models_dir, model) + + def _download(self, model): + model_dir = self._model_dir(model) + assert not os.path.exists(model_dir) + os.makedirs(model_dir) + for f in ['predict_net.pb', 'init_net.pb', 'value_info.json']: + url = getURLFromName(model, f) + dest = os.path.join(model_dir, f) + try: + try: + downloadFromURLToFile(url, dest, + show_progress=False) + except TypeError: + # show_progress not supported prior to + # Caffe2 78c014e752a374d905ecfb465d44fa16e02a28f1 + # (Sep 17, 2017) + downloadFromURLToFile(url, dest) + except Exception as e: + print("Abort: {reason}".format(reason=e)) + print("Cleaning up...") + deleteDirectory(model_dir) + exit(1) + + # TODO: we need to modulize this function + def _get_c2_model(self, model_name): + model_dir = self._model_dir(model_name) + if not os.path.exists(model_dir): + self._download(model_name) + c2_predict_pb = os.path.join(model_dir, 'predict_net.pb') + c2_predict_net = caffe2_pb2.NetDef() + with open(c2_predict_pb, 'rb') as f: + c2_predict_net.ParseFromString(f.read()) + c2_predict_net.name = model_name + + c2_init_pb = os.path.join(model_dir, 'init_net.pb') + c2_init_net = caffe2_pb2.NetDef() + with open(c2_init_pb, 'rb') as f: + c2_init_net.ParseFromString(f.read()) + c2_init_net.name = model_name + '_init' + + value_info = json.load(open(os.path.join(model_dir, 'value_info.json'))) + return c2_init_net, c2_predict_net, value_info + + def _add_head_tail(self, pred_net, new_head, new_tail): + 
orig_head = pred_net.external_input[0] + orig_tail = pred_net.external_output[0] + + # Add head + head = caffe2_pb2.OperatorDef() + head.type = "Copy" + head.input.append(new_head) + head.output.append(orig_head) + dummy = caffe2_pb2.NetDef() + dummy.op.extend(pred_net.op) + del pred_net.op[:] + pred_net.op.extend([head]) + pred_net.op.extend(dummy.op) + pred_net.external_input[0] = new_head + + # Add tail + tail = caffe2_pb2.OperatorDef() + tail.type = "Copy" + tail.input.append(orig_tail) + tail.output.append(new_tail) + pred_net.op.extend([tail]) + pred_net.external_output[0] = new_tail + + @unittest.skip("Need ONNXIFI backend support") + def test_resnet50_core(self): + N = 1 + repeat = 1 + print("Batch size: {}, repeat inference {} times".format(N, repeat)) + init_net, pred_net, _ = self._get_c2_model('resnet50') + self._add_head_tail(pred_net, 'real_data', 'real_softmax') + input_blob_dims = (N, 3, 224, 224) + input_name = "real_data" + + device_option = core.DeviceOption(caffe2_pb2.CPU, 0) + init_net.device_option.CopyFrom(device_option) + pred_net.device_option.CopyFrom(device_option) + for op in pred_net.op: + op.device_option.CopyFrom(device_option) + net_outputs = pred_net.external_output + Y_c2 = None + data = np.random.randn(*input_blob_dims).astype(np.float32) + c2_time = 1 + workspace.SwitchWorkspace("onnxifi_test", True) + with core.DeviceScope(device_option): + workspace.FeedBlob(input_name, data) + workspace.RunNetOnce(init_net) + workspace.CreateNet(pred_net) + start = time.time() + for _ in range(repeat): + workspace.RunNet(pred_net.name) + end = time.time() + c2_time = end - start + output_values = [workspace.FetchBlob(name) for name in net_outputs] + Y_c2 = namedtupledict('Outputs', net_outputs)(*output_values) + workspace.ResetWorkspace() + + # Fill the workspace with the weights + with core.DeviceScope(device_option): + workspace.RunNetOnce(init_net) + + # Cut the graph + start = time.time() + pred_net_cut = onnxifi_caffe2_net(pred_net, + {input_name: input_blob_dims}) + del init_net, pred_net + #_print_net(pred_net_cut) + + Y_trt = None + input_name = pred_net_cut.external_input[0] + print("C2 runtime: {}s".format(c2_time)) + with core.DeviceScope(device_option): + workspace.FeedBlob(input_name, data) + workspace.CreateNet(pred_net_cut) + end = time.time() + print("Conversion time: {:.2f}s".format(end - start)) + + start = time.time() + for _ in range(repeat): + workspace.RunNet(pred_net_cut.name) + end = time.time() + trt_time = end - start + print("Onnxifi runtime: {}s, improvement: {}%".format(trt_time, (c2_time - trt_time) / c2_time * 100)) + output_values = [workspace.FetchBlob(name) for name in net_outputs] + Y_trt = namedtupledict('Outputs', net_outputs)(*output_values) + np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3) + + diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index dca1cdf0fbf2f..48b316c771d85 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -24,6 +24,7 @@ #include "caffe2/opt/converter.h" #include "caffe2/opt/fusion.h" #include "caffe2/opt/mobile.h" +#include "caffe2/opt/onnxifi_transformer.h" #include "caffe2/opt/optimize_ideep.h" #include "caffe2/opt/passes.h" #include "caffe2/opt/sink.h" @@ -1555,6 +1556,27 @@ void addGlobalMethods(py::module& m) { new_proto.SerializeToString(&out); return py::bytes(out); }); + m.def( + "onnxifi", + [](const py::bytes& pred_net_str, + const std::unordered_map>& shapes, + bool debug_builder) -> py::bytes { + caffe2::NetDef pred_net; + CAFFE_ENFORCE( 
+ ParseProtoFromLargeString( + pred_net_str.cast(), &pred_net), + "broken pred_net protobuf"); + std::unordered_map tensor_shapes; + for (const auto& it : shapes) { + tensor_shapes.emplace( + it.first, CreateTensorShape(it.second, TensorProto::FLOAT)); + } + OnnxifiTransformer ts(debug_builder); + ts.Transform(GetCurrentWorkspace(), &pred_net, tensor_shapes); + std::string pred_net_str2; + pred_net.SerializeToString(&pred_net_str2); + return py::bytes(pred_net_str2); + }); m.def( "run_workspace_transform", [](const std::string& transform_name, py::bytes def) { diff --git a/caffe2/python/trt/transform.py b/caffe2/python/trt/transform.py index 128d303a8f44a..fbc6c11026fba 100644 --- a/caffe2/python/trt/transform.py +++ b/caffe2/python/trt/transform.py @@ -53,23 +53,26 @@ def convert_onnx_model_to_trt_op(onnx_model, op.ParseFromString(trt_str) return op -def _infer_shapes(init_net, pred_net, inputs): - ws, outputs = c2_native_run_net(init_net, pred_net, inputs) + +# Assume the workspace is already filled with init weights +def _infer_shapes(pred_net, inputs): + workspace.RunNetOnce(pred_net) hints = {} for op in pred_net.op: for o in op.output: if o not in hints: - blob = ws.FetchBlob(o) + blob = workspace.FetchBlob(o) if hasattr(blob, 'shape'): hints[o] = blob.shape for i in op.input: if i not in hints: - blob = ws.FetchBlob(i) + blob = workspace.FetchBlob(i) if hasattr(blob, 'shape'): hints[i] = blob.shape return hints + def transform_caffe2_net( pred_net, input_shapes, @@ -91,7 +94,7 @@ def transform_caffe2_net( input_data = {} for k,v in input_shapes.items(): input_data[k] = np.random.randn(*v).astype(np.float32) - shape_hints = _infer_shapes(init_net, pred_net, input_data) + shape_hints = _infer_shapes(pred_net, input_data) for k,v in input_shapes.items(): shape_hints[k] = v From 3efdece9daade24630c72ebb7b17502134995196 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Fri, 20 Jul 2018 15:37:22 -0700 Subject: [PATCH 13/23] Support n-dimensional empty tensors in take/put. 
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9635 Differential Revision: D8935119 Pulled By: gchanan fbshipit-source-id: 5035583e7322b1a1720d961945dd0eefb4cb28ef --- aten/src/THC/THCTensorTypeUtils.cuh | 2 +- aten/src/THC/generic/THCTensorIndex.cu | 20 ++++++++------------ test/test_torch.py | 22 ++++++++++++++++++++++ 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/aten/src/THC/THCTensorTypeUtils.cuh b/aten/src/THC/THCTensorTypeUtils.cuh index 6ff6d680edbde..99266b3b4b931 100644 --- a/aten/src/THC/THCTensorTypeUtils.cuh +++ b/aten/src/THC/THCTensorTypeUtils.cuh @@ -60,7 +60,7 @@ getTensorInfo(THCState* state, TensorType* t) { IndexType sz[MAX_CUTORCH_DIMS]; IndexType st[MAX_CUTORCH_DIMS]; - int dims = THCTensor__nDimension(state, t); + int dims = THCTensor_nDimension(state, t); for (int i = 0; i < dims; ++i) { sz[i] = THCTensor_size(state, t, i); st[i] = THCTensor_stride(state, t, i); diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu index f93ad4dfe14dd..1d934595aabd2 100644 --- a/aten/src/THC/generic/THCTensorIndex.cu +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -218,18 +218,14 @@ void THCTensor_(take)(THCState *state, THCTensor *dst, THCTensor *src, THCudaLon THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src)); THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); - THArgCheck(THCTensor_(_nDimension)(state, src) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(THCTensor_(_nDimension)(state, dst) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(THCudaLongTensor__nDimension(state, index) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(!(THCTensor_(_nDimension)(state, src) == 0 && THCudaLongTensor__nDimension(state, index) != 0), 2, + THArgCheck(THCTensor_(nDimension)(state, src) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCTensor_(nDimension)(state, dst) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCudaLongTensor_nDimension(state, index) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(!(THCTensor_(numel)(state, src) == 0 && THCudaLongTensor_numel(state, index) != 0), 2, "tried to take from an empty tensor"); THCTensor_(resizeNd)(state, dst, index->dim(), THTensor_getSizePtr(index), NULL); - - // dispatchTakePut only handles non-empty tensors; - if (index->_dim() > 0) { - dispatchTakePut(state, src, dst, index); - } + dispatchTakePut(state, src, dst, index); } static void THCTensor_(sort_indices)(THCState *state, THCudaLongTensor *index, THCTensor *src) { @@ -255,9 +251,9 @@ void THCTensor_(put)(THCState *state, THCTensor *dst, THCudaLongTensor *index, T THArgCheck(THCTensor_(nElement)(state, src) == numIndices, 3, "src should have the same number of elements as index"); - THArgCheck(THCTensor_(_nDimension)(state, dst) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(THCTensor_(_nDimension)(state, src) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(THCudaLongTensor__nDimension(state, index) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCTensor_(nDimension)(state, dst) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCTensor_(nDimension)(state, src) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCudaLongTensor_nDimension(state, index) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); if (numIndices == 0) { return; diff --git a/test/test_torch.py b/test/test_torch.py index d1370e30cb826..7148ab69f82ec 100644 --- a/test/test_torch.py +++ b/test/test_torch.py 
@@ -5661,6 +5661,16 @@ def check(src, idx): check(src, idx) check(src.transpose(1, 2), idx) + @skipIfNoZeroSize + def test_take_empty(self): + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] + for device in devices: + for input_shape in [(0,), (0, 1, 2, 0), (1, 2, 3)]: + for indices_shape in [(0,), (0, 1, 2, 0)]: + input = torch.empty(input_shape, device=device) + indices = torch.empty(indices_shape, dtype=torch.int64, device=device) + self.assertEqual(indices, torch.take(input, indices)) + def test_put_(self): def check(dst, idx, value): expected = dst.clone().view(-1).index_copy_( @@ -5682,6 +5692,18 @@ def test_put_accumulate(self): dst.put_(idx, src, accumulate=True) self.assertEqual(dst.tolist(), [[5, 7], [1, 1]]) + @skipIfNoZeroSize + def test_put_empty(self): + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] + for device in devices: + for dst_shape in [(0,), (0, 1, 2, 0), (1, 2, 3)]: + for indices_shape in [(0,), (0, 1, 2, 0)]: + for accumulate in [False, True]: + dst = torch.randn(dst_shape, device=device) + indices = torch.empty(indices_shape, dtype=torch.int64, device=device) + src = torch.randn(indices_shape, device=device) + self.assertEqual(dst, dst.put_(indices, src, accumulate=accumulate)) + # Fill idx with valid indices. @staticmethod def _fill_indices(self, idx, dim, dim_size, elems_per_row, m, n, o): From 5e84403d5fc92ad2ed594b86a436c571dac55d30 Mon Sep 17 00:00:00 2001 From: Ashish Date: Fri, 20 Jul 2018 17:10:20 -0700 Subject: [PATCH 14/23] Fix for half conversion for ROCm 1.8.2 (#9663) Summary: This PR contains the change for explicit conversion between ushort and __half required for ROCm 1.8.2 support bddppq Pull Request resolved: https://github.com/pytorch/pytorch/pull/9663 Differential Revision: D8943937 Pulled By: bddppq fbshipit-source-id: 16102f9dbc68ed4ece2e8fc244825c3992c24901 --- caffe2/utils/conversions.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/caffe2/utils/conversions.h b/caffe2/utils/conversions.h index fbe30fbadac90..0b63b47ade393 100644 --- a/caffe2/utils/conversions.h +++ b/caffe2/utils/conversions.h @@ -187,7 +187,7 @@ CONVERSIONS_DECL float16 To(const float in) { #endif // CUDA_VERSION >= 9000 #elif __HIP_DEVICE_COMPILE__ float16 ret; - ret.x = __float2half(in); + ret.x = __half_as_ushort(__float2half(in)); return ret; #else return cpu_float2half_rn(in); @@ -206,7 +206,7 @@ CONVERSIONS_DECL float To(const float16 in) { return __half2float(tmp); #elif __HIP_DEVICE_COMPILE__ __half tmp; - tmp = in.x; + tmp = __ushort_as_half(in.x); return __half2float(tmp); #else return cpu_half2float(in); From 23ed26a0c399c121fc1bcb5824dff33d8eb463b4 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 20 Jul 2018 19:26:17 -0700 Subject: [PATCH 15/23] Guard include of cuda-only header comm.h (#9656) Summary: Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/9656 Reviewed By: colesbury Differential Revision: D8941361 Pulled By: ezyang fbshipit-source-id: c18cb0e606ae0608e5892040192b8792ae542b74 --- test/cpp/api/parallel.cpp | 4 ++++ torch/csrc/api/include/torch/nn/parallel/data_parallel.h | 4 ++++ torch/csrc/autograd/functions/comm.cpp | 4 ++++ torch/csrc/autograd/functions/comm.h | 4 ++++ torch/csrc/cuda/comm.cpp | 4 ++++ torch/csrc/cuda/comm.h | 4 ++++ 6 files changed, 24 insertions(+) diff --git a/test/cpp/api/parallel.cpp b/test/cpp/api/parallel.cpp index 2d8f413c05300..1c11e87a533ef 100644 --- a/test/cpp/api/parallel.cpp +++ b/test/cpp/api/parallel.cpp @@ -17,6 +17,8 @@ using Catch::StartsWith; using namespace torch::autograd; using namespace torch::nn; +#ifdef USE_CUDA + TEST_CASE("Parallel/DifferentiableScatter", "[multi-cuda]") { Scatter scatter( {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)}); @@ -228,3 +230,5 @@ TEST_CASE("Parallel/DataParallelUsesAllAvailableCUDADevices", "[cuda]") { REQUIRE(output[i].toCInt() == i); } } + +#endif diff --git a/torch/csrc/api/include/torch/nn/parallel/data_parallel.h b/torch/csrc/api/include/torch/nn/parallel/data_parallel.h index 82150658dcffe..09df4b2b1e441 100644 --- a/torch/csrc/api/include/torch/nn/parallel/data_parallel.h +++ b/torch/csrc/api/include/torch/nn/parallel/data_parallel.h @@ -161,6 +161,7 @@ Tensor data_parallel( return module->forward(std::move(input)).to(*output_device); } +#ifdef USE_CUDA autograd::Scatter scatter(*devices, /*chunk_sizes=*/at::nullopt, dim); auto scattered_inputs = scatter.apply({std::move(input)}); @@ -169,6 +170,9 @@ Tensor data_parallel( return autograd::Gather(*output_device, dim) .apply(std::move(outputs)) .front(); +#else + AT_ERROR("data_parallel not supported without CUDA"); +#endif } } // namespace parallel diff --git a/torch/csrc/autograd/functions/comm.cpp b/torch/csrc/autograd/functions/comm.cpp index 052aba90dcc4f..291a4205df16f 100644 --- a/torch/csrc/autograd/functions/comm.cpp +++ b/torch/csrc/autograd/functions/comm.cpp @@ -1,3 +1,5 @@ +#ifdef USE_CUDA + #include #include @@ -129,3 +131,5 @@ variable_list Gather::apply(variable_list&& inputs) { } // namespace autograd } // namespace torch + +#endif diff --git a/torch/csrc/autograd/functions/comm.h b/torch/csrc/autograd/functions/comm.h index a7c24748ca01f..b1a4b2bc3d031 100644 --- a/torch/csrc/autograd/functions/comm.h +++ b/torch/csrc/autograd/functions/comm.h @@ -1,5 +1,7 @@ #pragma once +#ifdef USE_CUDA + #include #include @@ -39,3 +41,5 @@ struct Gather : public Function { } // namespace autograd } // namespace torch + +#endif diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index 9c3b8a305a5ae..0e869876e8e1f 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -1,5 +1,7 @@ #include +#ifdef USE_CUDA + #include #include @@ -199,3 +201,5 @@ at::Tensor gather( return result; } }} // namespace torch::cuda + +#endif diff --git a/torch/csrc/cuda/comm.h b/torch/csrc/cuda/comm.h index caf7811afd1ae..b6d5c9b66fd75 100644 --- a/torch/csrc/cuda/comm.h +++ b/torch/csrc/cuda/comm.h @@ -1,5 +1,7 @@ #pragma once +#ifdef USE_CUDA + #include #include #include @@ -27,3 +29,5 @@ at::Tensor gather( int64_t dim, at::optional destination_index); }} + +#endif From b5c8d59451be8ae2e892afb28d0dc39661c666f1 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 20 Jul 2018 20:35:20 -0700 Subject: [PATCH 16/23] Add a CUDAContext header include Summary: Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/9662 Differential Revision: D8945581 Pulled By: ezyang fbshipit-source-id: 2fe0adc96456788579f7d6f1c4513fe45360c030 --- torch/csrc/autograd/functions/comm.cpp | 1 + torch/csrc/autograd/functions/comm.h | 1 + 2 files changed, 2 insertions(+) diff --git a/torch/csrc/autograd/functions/comm.cpp b/torch/csrc/autograd/functions/comm.cpp index 291a4205df16f..0af03cbfa39f5 100644 --- a/torch/csrc/autograd/functions/comm.cpp +++ b/torch/csrc/autograd/functions/comm.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include diff --git a/torch/csrc/autograd/functions/comm.h b/torch/csrc/autograd/functions/comm.h index b1a4b2bc3d031..df75799a5ef84 100644 --- a/torch/csrc/autograd/functions/comm.h +++ b/torch/csrc/autograd/functions/comm.h @@ -6,6 +6,7 @@ #include #include +#include #include #include From 3bb8c5eab1029a7de971b7368e43be06ef97182d Mon Sep 17 00:00:00 2001 From: Owen Anderson Date: Fri, 20 Jul 2018 22:22:08 -0700 Subject: [PATCH 17/23] Allow MKLDNN on macOS, and any other OS where CMake is able to detect it. Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9638 Reviewed By: soumith Differential Revision: D8946130 Pulled By: resistor fbshipit-source-id: 87bd9cb12608467b05bd4998fdb00bfdbd038ca2 --- tools/setup_helpers/mkldnn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/setup_helpers/mkldnn.py b/tools/setup_helpers/mkldnn.py index cc230e5550b45..47a10d04785c1 100644 --- a/tools/setup_helpers/mkldnn.py +++ b/tools/setup_helpers/mkldnn.py @@ -16,7 +16,7 @@ def gather_paths(env_vars): MKLDNN_LIB_DIR = None MKLDNN_INCLUDE_DIR = None MKLDNN_LIBRARY = None -if (IS_LINUX or IS_WINDOWS) and not check_env_flag('NO_MKLDNN'): +if not check_env_flag('NO_MKLDNN'): lib_paths = list(filter(bool, [ os.getenv('MKLDNN_LIB_DIR'), os.path.join(MKLDNN_HOME, 'lib'), From a01d6f01b5547a396d6e28f95538cd061e14d29a Mon Sep 17 00:00:00 2001 From: Xiaomeng Yang Date: Sat, 21 Jul 2018 12:40:40 -0700 Subject: [PATCH 18/23] Update channel_shuffle_op and transpose 2d to speed up ShuffleNet (#9525) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9525 Update channel_shuffle_op and transpose 2d to speed up ShuffleNet Reviewed By: houseroad Differential Revision: D8889361 fbshipit-source-id: 60196e819b6842becc53b4859b62d4419a0e2c6e --- .../operators/operator_fallback_ideep.cc | 2 +- caffe2/mkl/operators/operator_fallback_mkl.cc | 2 +- caffe2/operators/channel_shuffle_op.cc | 194 ++++++++++++++++-- caffe2/operators/channel_shuffle_op.h | 156 +++++--------- caffe2/operators/channel_shuffle_op_gpu.cu | 13 +- .../operator_test/channel_shuffle_test.py | 71 ++++--- .../operator_test/group_norm_op_test.py | 2 +- caffe2/utils/math_cpu.cc | 112 ++++++++-- 8 files changed, 379 insertions(+), 173 deletions(-) diff --git a/caffe2/ideep/operators/operator_fallback_ideep.cc b/caffe2/ideep/operators/operator_fallback_ideep.cc index d222c6958067c..0d8b6fd55b205 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.cc +++ b/caffe2/ideep/operators/operator_fallback_ideep.cc @@ -43,7 +43,7 @@ struct SigmoidCPUFunctor { REGISTER_IDEEP_OPERATOR(Softmax, IDEEPFallbackOp>); REGISTER_IDEEP_OPERATOR( ChannelShuffle, - IDEEPFallbackOp>); + IDEEPFallbackOp>); REGISTER_IDEEP_OPERATOR( LabelCrossEntropy, IDEEPFallbackOp>); diff --git a/caffe2/mkl/operators/operator_fallback_mkl.cc b/caffe2/mkl/operators/operator_fallback_mkl.cc index 106fa05dec70a..56e2bc687bb68 100644 --- a/caffe2/mkl/operators/operator_fallback_mkl.cc +++ 
b/caffe2/mkl/operators/operator_fallback_mkl.cc @@ -76,7 +76,7 @@ REGISTER_MKL_OPERATOR( mkl::MKLFallbackOp>); REGISTER_MKL_OPERATOR( ChannelShuffle, - mkl::MKLFallbackOp>); + mkl::MKLFallbackOp>); REGISTER_MKL_OPERATOR( NCHW2NHWC, mkl::MKLFallbackOp>); diff --git a/caffe2/operators/channel_shuffle_op.cc b/caffe2/operators/channel_shuffle_op.cc index 67bdbfb3f888b..31d34b01fe2e7 100644 --- a/caffe2/operators/channel_shuffle_op.cc +++ b/caffe2/operators/channel_shuffle_op.cc @@ -1,23 +1,175 @@ #include "channel_shuffle_op.h" +#include +#include +#include + +#ifdef CAFFE2_USE_MKL +#include +#endif // CAFFE2_USE_MKL + +#include "caffe2/utils/eigen_utils.h" +#include "caffe2/utils/math.h" + namespace caffe2 { -class GetChannelShuffleGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - def_.type() + "Gradient", - "", - vector{GO(0)}, - vector{GI(0)}); +namespace { + +template +void RunChannelShuffleNCHW( + const int N, + const int G, + const int K, + const int HxW, + const T* X, + T* Y) { + const int stride = G * K * HxW; + for (int i = 0; i < N; ++i) { + if (G < K) { + EigenMatrixMap Y_mat(Y, G * HxW, K); + for (int j = 0; j < G; ++j) { + Y_mat.block(j * HxW, 0, HxW, K) = + ConstEigenMatrixMap(X + j * K * HxW, HxW, K); + } + } else { + ConstEigenMatrixMap X_mat(X, K * HxW, G); + for (int j = 0; j < K; ++j) { + EigenMatrixMap(Y + j * G * HxW, HxW, G) = + X_mat.block(j * HxW, 0, HxW, G); + } + } + X += stride; + Y += stride; } -}; +} + +template +void RunChannelShuffleNHWC( + const int N, + const int G, + const int K, + const int HxW, + const T* X, + T* Y, + CPUContext* context) { + const std::array dims = {G, K}; + const std::array axes = {1, 0}; + const int M = N * HxW; + const int C = G * K; + for (int i = 0; i < M; ++i) { + math::Transpose(2, dims.data(), axes.data(), X, Y, context); + X += C; + Y += C; + } +} + +#ifdef CAFFE2_USE_MKL + +template <> +void RunChannelShuffleNCHW( + const int N, + const int G, + const int K, + const int HxW, + const float* X, + float* Y) { + const int stride = G * K * HxW; + for (int i = 0; i < N; ++i) { + if (G < K) { + for (int j = 0; j < G; ++j) { + mkl_somatcopy( + 'R', 'N', K, HxW, 1.0f, X + j * K * HxW, HxW, Y + j * HxW, G * HxW); + } + } else { + for (int j = 0; j < K; ++j) { + mkl_somatcopy( + 'R', 'N', G, HxW, 1.0f, X + j * HxW, K * HxW, Y + j * G * HxW, HxW); + } + } + X += stride; + Y += stride; + } +} + +#endif // CAFFE2_USE_MKL + +} // namespace + +template <> +bool ChannelShuffleOp::RunOnDeviceWithOrderNCHW() { + const auto& X = Input(0); + auto* Y = Output(0); + Y->ResizeLike(X); + const int N = X.dim32(0); + const int C = X.dim32(1); + const int G = group_; + CAFFE_ENFORCE_EQ(C % G, 0); + const int K = C / G; + const int HxW = X.size() / (N * C); + const float* X_data = X.data(); + float* Y_data = Y->mutable_data(); + RunChannelShuffleNCHW(N, G, K, HxW, X_data, Y_data); + return true; +} // namespace caffe2 -REGISTER_CPU_OPERATOR(ChannelShuffle, ChannelShuffleOp); +template <> +bool ChannelShuffleOp::RunOnDeviceWithOrderNHWC() { + const auto& X = Input(0); + auto* Y = Output(0); + Y->ResizeLike(X); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = X.dim32(ndim - 1); + const int G = group_; + CAFFE_ENFORCE_EQ(C % G, 0); + const int K = C / G; + const int HxW = X.size() / (N * C); + const float* X_data = X.data(); + float* Y_data = Y->mutable_data(); + RunChannelShuffleNHWC(N, G, K, HxW, X_data, Y_data, &context_); + return 
true; +} + +template <> +bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNCHW() { + const auto& dY = Input(0); + auto* dX = Output(0); + dX->ResizeLike(dY); + const int N = dY.dim32(0); + const int C = dY.dim32(1); + const int G = group_; + CAFFE_ENFORCE_EQ(C % G, 0); + const int K = C / G; + const int HxW = dY.size() / (N * C); + const float* dY_data = dY.data(); + float* dX_data = dX->mutable_data(); + RunChannelShuffleNCHW(N, K, G, HxW, dY_data, dX_data); + return true; +} + +template <> +bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNHWC() { + const auto& dY = Input(0); + auto* dX = Output(0); + dX->ResizeLike(dY); + const int ndim = dY.ndim(); + const int N = dY.dim32(0); + const int C = dY.dim32(ndim - 1); + const int G = group_; + CAFFE_ENFORCE_EQ(C % G, 0); + const int K = C / G; + const int HxW = dY.size() / (N * C); + const float* dY_data = dY.data(); + float* dX_data = dX->mutable_data(); + RunChannelShuffleNHWC(N, K, G, HxW, dY_data, dX_data, &context_); + return true; +} + +REGISTER_CPU_OPERATOR(ChannelShuffle, ChannelShuffleOp); REGISTER_CPU_OPERATOR( ChannelShuffleGradient, - ChannelShuffleGradientOp); -REGISTER_GRADIENT(ChannelShuffle, GetChannelShuffleGradient); + ChannelShuffleGradientOp); + OPERATOR_SCHEMA(ChannelShuffle) .IdenticalTypeAndShape() .NumInputs(1) @@ -27,4 +179,22 @@ OPERATOR_SCHEMA(ChannelShuffleGradient) .IdenticalTypeAndShape() .NumInputs(1) .NumOutputs(1); + +namespace { + +class GetChannelShuffleGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + std::vector GetGradientDefs() override { + return SingleGradientDef( + "ChannelShuffleGradient", + "", + std::vector{GO(0)}, + std::vector{GI(0)}); + } +}; + +} // namespace + +REGISTER_GRADIENT(ChannelShuffle, GetChannelShuffleGradient); + } // namespace caffe2 diff --git a/caffe2/operators/channel_shuffle_op.h b/caffe2/operators/channel_shuffle_op.h index 05e66bf5f78c6..87b4e49530345 100644 --- a/caffe2/operators/channel_shuffle_op.h +++ b/caffe2/operators/channel_shuffle_op.h @@ -1,124 +1,66 @@ -#pragma once -#include "caffe2/utils/math.h" -#include "caffe2/operators/conv_pool_op_base.h" +#ifndef CAFFE2_OPERATORS_CHANNEL_SHUFFLE_OP_H_ +#define CAFFE2_OPERATORS_CHANNEL_SHUFFLE_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" namespace caffe2 { -template -class ChannelShuffleOp final : public ConvPoolOpBase { +template +class ChannelShuffleOp final : public Operator { public: - USE_OPERATOR_FUNCTIONS(Context); + USE_OPERATOR_CONTEXT_FUNCTIONS; + ChannelShuffleOp(const OperatorDef& operator_def, Workspace* ws) - : ConvPoolOpBase(operator_def, ws) {} - - bool RunOnDeviceWithOrderNCHW() override { - const auto& X = Input(0); - auto* Y = Output(0); - Y->ResizeLike(X); - const auto C = X.dim32(1); - const auto G = this->group_; - CAFFE_ENFORCE(C % G == 0, ""); - const auto K = C / G; - const auto S = X.dim32(2) * X.dim32(3); - for (auto n = 0; n < X.dim32(0); ++n) { - for (auto g = 0; g < G; ++g) { - // Scatter the group g block (of size KxS) to output channels - // g + 0 * G, g + 1 * G, g + 2 * G, g + G * (K - 1) etc. 
- math::CopyMatrix( - X.itemsize(), - K, - S, - X.template data() + g * K * S + n * C * S, - S, - Y->template mutable_data() + g * S + n * C * S, - G * S, - &context_, - X.meta().copy()); - } - } - return true; + : Operator(operator_def, ws), + order_(StringToStorageOrder( + OperatorBase::GetSingleArgument("order", "NCHW"))), + OP_SINGLE_ARG(int, "group", group_, 1) { + CAFFE_ENFORCE_NE(order_, StorageOrder::UNKNOWN); } - bool RunOnDeviceWithOrderNHWC() override { - const auto& X = Input(0); - auto* Y = Output(0); - Y->ResizeLike(X); - const auto C = X.dim32(3); - const auto G = this->group_; - CAFFE_ENFORCE(C % G == 0, ""); - const auto K = C / G; - std::array dims = {G, K}; - std::array axes = {1, 0}; - for (auto i = 0; i < X.size(); i += C) { - // Transpose each C = GxK matrix - math::Transpose( - 2, - dims.data(), - axes.data(), - X.template data() + i, - Y->template mutable_data() + i, - &context_); - } - return true; + bool RunOnDevice() override { + return order_ == StorageOrder::NCHW ? RunOnDeviceWithOrderNCHW() + : RunOnDeviceWithOrderNHWC(); } + + bool RunOnDeviceWithOrderNCHW(); + + bool RunOnDeviceWithOrderNHWC(); + + private: + const StorageOrder order_; + const int group_; }; -template -class ChannelShuffleGradientOp final : public ConvPoolOpBase { +template +class ChannelShuffleGradientOp final : public Operator { public: - USE_OPERATOR_FUNCTIONS(Context); + USE_OPERATOR_CONTEXT_FUNCTIONS; + ChannelShuffleGradientOp(const OperatorDef& operator_def, Workspace* ws) - : ConvPoolOpBase(operator_def, ws) {} - - bool RunOnDeviceWithOrderNCHW() override { - const auto& dY = Input(0); - auto* dX = Output(0); - dX->ResizeLike(dY); - const auto C = dY.dim32(1); - const auto G = this->group_; - CAFFE_ENFORCE(C % G == 0, ""); - const auto K = C / G; - const auto S = dY.dim32(2) * dY.dim32(3); - for (auto n = 0; n < dY.dim32(0); ++n) { - for (auto g = 0; g < G; ++g) { - // Gather the group g block (of size KxS) from output channels - // g + 0 * G, g + 1 * G, g + 2 * G, g + G * (K - 1) etc. - math::CopyMatrix( - dY.itemsize(), - K, - S, - dY.template data() + g * S + n * C * S, - G * S, - dX->template mutable_data() + g * K * S + n * C * S, - S, - &context_, - dY.meta().copy()); - } - } - return true; + : Operator(operator_def, ws), + order_(StringToStorageOrder( + OperatorBase::GetSingleArgument("order", "NCHW"))), + OP_SINGLE_ARG(int, "group", group_, 1) { + CAFFE_ENFORCE_NE(order_, StorageOrder::UNKNOWN); } - bool RunOnDeviceWithOrderNHWC() override { - const auto& dY = Input(0); - auto* dX = Output(0); - dX->ResizeLike(dY); - const auto C = dY.dim32(3); - const auto G = this->group_; - CAFFE_ENFORCE(C % G == 0, ""); - const auto K = C / G; - std::array dims = {K, G}; - std::array axes = {1, 0}; - for (auto i = 0; i < dY.size(); i += C) { - // Transpose each C = KxG matrix - math::Transpose( - 2, - dims.data(), - axes.data(), - dY.template data() + i, - dX->template mutable_data() + i, - &context_); - } - return true; + bool RunOnDevice() override { + return order_ == StorageOrder::NCHW ? 
RunOnDeviceWithOrderNCHW() + : RunOnDeviceWithOrderNHWC(); } + + bool RunOnDeviceWithOrderNCHW(); + + bool RunOnDeviceWithOrderNHWC(); + + private: + const StorageOrder order_; + const int group_; }; + } // namespace caffe2 + +#endif // CAFFE2_OPERATORS_CHANNEL_SHUFFLE_OP_H_ diff --git a/caffe2/operators/channel_shuffle_op_gpu.cu b/caffe2/operators/channel_shuffle_op_gpu.cu index 447ba55d3436f..ad9a0ab4e79d7 100644 --- a/caffe2/operators/channel_shuffle_op_gpu.cu +++ b/caffe2/operators/channel_shuffle_op_gpu.cu @@ -42,7 +42,7 @@ __global__ void ChannelShuffleNHWCKernel( } template <> -bool ChannelShuffleOp::RunOnDeviceWithOrderNCHW() { +bool ChannelShuffleOp::RunOnDeviceWithOrderNCHW() { const auto& X = Input(0); auto* Y = Output(0); Y->ResizeLike(X); @@ -61,7 +61,7 @@ bool ChannelShuffleOp::RunOnDeviceWithOrderNCHW() { } template <> -bool ChannelShuffleOp::RunOnDeviceWithOrderNHWC() { +bool ChannelShuffleOp::RunOnDeviceWithOrderNHWC() { const auto& X = Input(0); auto* Y = Output(0); Y->ResizeLike(X); @@ -79,7 +79,7 @@ bool ChannelShuffleOp::RunOnDeviceWithOrderNHWC() { } template <> -bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNCHW() { +bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNCHW() { const auto& dY = Input(0); auto* dX = Output(0); dX->ResizeLike(dY); @@ -98,7 +98,7 @@ bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNCHW() { } template <> -bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNHWC() { +bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNHWC() { const auto& dY = Input(0); auto* dX = Output(0); dX->ResizeLike(dY); @@ -115,8 +115,9 @@ bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNHWC() { return true; } -REGISTER_CUDA_OPERATOR(ChannelShuffle, ChannelShuffleOp); +REGISTER_CUDA_OPERATOR(ChannelShuffle, ChannelShuffleOp); REGISTER_CUDA_OPERATOR( ChannelShuffleGradient, - ChannelShuffleGradientOp); + ChannelShuffleGradientOp); + } // namespace caffe2 diff --git a/caffe2/python/operator_test/channel_shuffle_test.py b/caffe2/python/operator_test/channel_shuffle_test.py index 93b9328950f34..e17d7a736063c 100644 --- a/caffe2/python/operator_test/channel_shuffle_test.py +++ b/caffe2/python/operator_test/channel_shuffle_test.py @@ -11,38 +11,49 @@ class ChannelShuffleOpsTest(hu.HypothesisTestCase): - @given( - channels_per_group=st.integers(min_value=1, max_value=5), - groups=st.integers(min_value=1, max_value=5), - n=st.integers(min_value=1, max_value=2), - order=st.sampled_from(["NCHW", "NHWC"]), - **hu.gcs) - def test_channel_shuffle(self, channels_per_group, groups, n, order, gc, dc): - X = np.random.randn( - n, channels_per_group * groups, 5, 6).astype(np.float32) - if order == "NHWC": - # NCHW -> NHWC - X = X.transpose((0, 2, 3, 1)) - - op = core.CreateOperator("ChannelShuffle", ["X"], ["Y"], - group=groups, kernel=1, order=order, - device_option=gc) + def _channel_shuffle_nchw_ref(self, X, group): + dims = X.shape + N = dims[0] + C = dims[1] + G = group + K = int(C / G) + X = X.reshape(N, G, K, -1) + Y = np.transpose(X, axes=(0, 2, 1, 3)) + return [Y.reshape(dims)] + + def _channel_shuffle_nhwc_ref(self, X, group): + dims = X.shape + N = dims[0] + C = dims[-1] + G = group + K = int(C / G) + X = X.reshape(N, -1, G, K) + Y = np.transpose(X, axes=(0, 1, 3, 2)) + return [Y.reshape(dims)] + + @given(N=st.integers(1, 5), G=st.integers(1, 5), K=st.integers(1, 5), + H=st.integers(1, 5), W=st.integers(1, 5), + order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) + def test_channel_shuffle(self, N, G, K, H, W, order, gc, dc): + C = G * K + if order == "NCHW": + X = 
np.random.randn(N, C, H, W).astype(np.float32) + else: + X = np.random.randn(N, H, W, C).astype(np.float32) + + op = core.CreateOperator( + "ChannelShuffle", + ["X"], + ["Y"], + group=G, + order=order, + ) def channel_shuffle_ref(X): - if order == "NHWC": - # NHWC -> NCHW - X = X.transpose((0, 3, 1, 2)) - Y_r = X.reshape(X.shape[0], - groups, - X.shape[1] // groups, - X.shape[2], - X.shape[3]) - Y_trns = Y_r.transpose((0, 2, 1, 3, 4)) - Y_reshaped = Y_trns.reshape(X.shape) - if order == "NHWC": - # NCHW -> NHWC - Y_reshaped = Y_reshaped.transpose((0, 2, 3, 1)) - return (Y_reshaped,) + if order == "NCHW": + return self._channel_shuffle_nchw_ref(X, G) + else: + return self._channel_shuffle_nhwc_ref(X, G) self.assertReferenceChecks(gc, op, [X], channel_shuffle_ref) self.assertGradientChecks(gc, op, [X], 0, [0]) diff --git a/caffe2/python/operator_test/group_norm_op_test.py b/caffe2/python/operator_test/group_norm_op_test.py index 3461261c2221c..caa9121e924a0 100644 --- a/caffe2/python/operator_test/group_norm_op_test.py +++ b/caffe2/python/operator_test/group_norm_op_test.py @@ -40,7 +40,7 @@ def group_norm_nhwc_ref(self, X, gamma, beta, group, epsilon): Y = gamma * (X - mu) / std + beta return [Y.reshape(dims), mu.reshape(N, G), (1.0 / std).reshape(N, G)] - @given(N=st.integers(1, 5), G=st.integers(1, 5), D=st.integers(2, 2), + @given(N=st.integers(1, 5), G=st.integers(1, 5), D=st.integers(1, 5), H=st.integers(2, 5), W=st.integers(2, 5), epsilon=st.floats(min_value=1e-5, max_value=1e-4), order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index 6ebf41ab7cba1..6aae82ea6554d 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -11,7 +11,6 @@ // platforms, it allows one to quickly port Caffe2 to different platforms // where BLAS may not be present. 
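The math_cpu.cc hunks below split CPU transpose into an identity-permutation fast path, a dedicated 2D kernel (MKL/HPTT/Eigen where available), and a generic N-D fallback. A minimal standalone sketch of that dispatch, with naive loops standing in for the MKL/HPTT/Eigen kernels (illustrative only, not the Caffe2 implementation):

```cpp
#include <cstdint>
#include <cstring>
#include <functional>
#include <numeric>
#include <vector>

// Naive row-major 2D transpose: Y[j][i] = X[i][j].
template <typename T>
void Transpose2D(int rows, int cols, const T* X, T* Y) {
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j)
      Y[j * rows + i] = X[i * cols + j];
}

// Dispatch in the spirit of TransposeCPUImpl: plain copy when the permutation
// is the identity, 2D kernel when ndim == 2, otherwise a general N-D routine
// (omitted here).
template <typename T>
void TransposeSketch(const std::vector<int>& dims, const std::vector<int>& axes,
                     const T* X, T* Y) {
  bool identity = true;
  for (size_t i = 0; i < axes.size(); ++i)
    identity &= (axes[i] == static_cast<int>(i));
  if (identity) {
    const int64_t size = std::accumulate(
        dims.begin(), dims.end(), int64_t{1}, std::multiplies<int64_t>());
    std::memcpy(Y, X, size * sizeof(T));
  } else if (dims.size() == 2) {
    Transpose2D(dims[0], dims[1], X, Y);
  }
  // else: general strided N-D transpose, not shown.
}
```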
-#include "caffe2/utils/eigen_utils.h" #include "caffe2/utils/math.h" #include @@ -30,6 +29,7 @@ #include "caffe2/core/context.h" #include "caffe2/utils/cpu_neon.h" +#include "caffe2/utils/eigen_utils.h" #include "Eigen/Core" #include "Eigen/Dense" @@ -2476,7 +2476,7 @@ namespace { #ifdef CAFFE2_USE_HPTT -bool TryTransposeWithHPTT( +bool TransposeWithHPTT( const int ndim, const int* dims, const int* axes, @@ -2511,6 +2511,62 @@ bool TryTransposeWithHPTT( #endif // CAFFE2_USE_HPTT +template +void Tranpose2D(const int rows, const int cols, const T* X, T* Y); + +#ifdef CAFFE2_USE_MKL + +#define DELEGATE_TRANSPOSE_2D_FUNCTION(T, Func) \ + template <> \ + void Tranpose2D(const int rows, const int cols, const T* X, T* Y) { \ + Func('R', 'T', rows, cols, T(1), X, cols, Y, rows); \ + } +DELEGATE_TRANSPOSE_2D_FUNCTION(float, mkl_somatcopy); +DELEGATE_TRANSPOSE_2D_FUNCTION(double, mkl_domatcopy); +#undef DELEGATE_TRANSPOSE_2D_FUNCTION + +#endif // CAFFE2_USE_MKL + +#define CAFFE2_SPECIALIZED_TRANSPOSE_2D(T) \ + template <> \ + void Tranpose2D(const int rows, const int cols, const T* X, T* Y) { \ + EigenMatrixMap(Y, rows, cols) = \ + ConstEigenMatrixMap(X, cols, rows).transpose(); \ + } + +#ifndef CAFFE2_USE_MKL + +template <> +void Tranpose2D( + const int rows, + const int cols, + const float* X, + float* Y) { +#ifdef CAFFE2_USE_HPTT + const std::array dims = {rows, cols}; + const std::array axes = {1, 0}; + if (TransposeWithHPTT(2, dims.data(), axes.data(), X, Y)) { + return; + } +#endif // CAFFE2_USE_HPTT + EigenMatrixMap(Y, rows, cols) = + ConstEigenMatrixMap(X, cols, rows).transpose(); +} + +CAFFE2_SPECIALIZED_TRANSPOSE_2D(double) + +#endif // CAFFE2_USE_MKL + +CAFFE2_SPECIALIZED_TRANSPOSE_2D(int) +CAFFE2_SPECIALIZED_TRANSPOSE_2D(TIndex) +#ifdef CAFFE2_UNIQUE_LONG_TYPEMETA +CAFFE2_SPECIALIZED_TRANSPOSE_2D(long) +#endif +CAFFE2_SPECIALIZED_TRANSPOSE_2D(std::uint8_t) +CAFFE2_SPECIALIZED_TRANSPOSE_2D(std::uint16_t) + +#undef CAFFE2_SPECIALIZED_TRANSPOSE_2D + std::vector ComputeXStrides(const int ndim, const int* dims, const int* axes) { std::vector x_strides(ndim); @@ -2527,7 +2583,7 @@ ComputeXStrides(const int ndim, const int* dims, const int* axes) { } template -void TransposeCPUImpl( +void TransposeND( const int ndim, const int* dims, const int* axes, @@ -2547,10 +2603,6 @@ void TransposeCPUImpl( const int itr_axes = ndim - num_shared_idx; const int num_blocks = std::accumulate( Y_dims.cbegin(), Y_dims.cbegin() + itr_axes, 1, std::multiplies()); - if (ndim < 2 || itr_axes == 0) { - std::memcpy(Y, X, num_blocks * block_size * sizeof(T)); - return; - } const std::vector X_strides = ComputeXStrides(itr_axes, dims, axes); std::vector index(itr_axes, 0); for (int Y_index = 0; Y_index < num_blocks; ++Y_index) { @@ -2568,24 +2620,53 @@ void TransposeCPUImpl( } } -} // namespace +template +void TransposeCPUImpl( + const int ndim, + const int* dims, + const int* axes, + const T* X, + T* Y) { + if (utils::IsIdentityPermutation(ndim, axes)) { + const int size = + std::accumulate(dims, dims + ndim, 1, std::multiplies()); + std::memcpy(Y, X, size * sizeof(T)); + return; + } + if (ndim == 2) { + Tranpose2D(dims[0], dims[1], X, Y); + } else { + TransposeND(ndim, dims, axes, X, Y); + } +} template <> -void Transpose( +void TransposeCPUImpl( const int ndim, const int* dims, const int* axes, const float* X, - float* Y, - CPUContext* /* context */) { -#ifdef CAFFE2_USE_HPTT - if (TryTransposeWithHPTT(ndim, dims, axes, X, Y)) { + float* Y) { + if (utils::IsIdentityPermutation(ndim, axes)) { + const int size = + 
std::accumulate(dims, dims + ndim, 1, std::multiplies()); + std::memcpy(Y, X, size * sizeof(float)); return; } -#endif // CAFFE2_USE_HPTT - TransposeCPUImpl(ndim, dims, axes, X, Y); + if (ndim == 2) { + Tranpose2D(dims[0], dims[1], X, Y); + } else { +#ifdef CAFFE2_USE_HPTT + if (TransposeWithHPTT(ndim, dims, axes, X, Y)) { + return; + } +#endif + TransposeND(ndim, dims, axes, X, Y); + } } +} // namespace + #define CAFFE2_SPECIALIZED_TRANSPOSE(T) \ template <> \ void Transpose( \ @@ -2597,6 +2678,7 @@ void Transpose( CPUContext* /* context */) { \ TransposeCPUImpl(ndim, dims, axes, X, Y); \ } +CAFFE2_SPECIALIZED_TRANSPOSE(float) CAFFE2_SPECIALIZED_TRANSPOSE(double) CAFFE2_SPECIALIZED_TRANSPOSE(int) CAFFE2_SPECIALIZED_TRANSPOSE(TIndex) From 769cb5a6405b39a0678e6bc4f2d6fea62e0d3f12 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Sat, 21 Jul 2018 21:37:30 -0700 Subject: [PATCH 19/23] Add new ways of matching nodes with schemas in the JIT (#9567) Summary: **REVIEW LAST COMMIT ONLY** As discussed in our yesterday's meeting. Nodes can be now matched to particular overloads using the `matches(...)` function: ```cpp n->matches("aten::type_as(Tensor self, Tensor other) -> Tensor") ``` This also changes the shape prop and peephole passes to use those functions for matching. This fixes a few bugs, makes them much more robust, and prepares us for removal of attributes. zdevito Pull Request resolved: https://github.com/pytorch/pytorch/pull/9567 Reviewed By: zdevito Differential Revision: D8938482 Pulled By: apaszke fbshipit-source-id: eb2382eeeae99692aada2d78d5d0c87c8ef1545e --- aten/src/ATen/ExpandUtils.cpp | 12 +- aten/src/ATen/ExpandUtils.h | 3 +- aten/src/ATen/native/TensorShape.cpp | 2 +- torch/csrc/jit/autodiff.cpp | 8 +- torch/csrc/jit/function_schema.h | 33 +- torch/csrc/jit/ir.cpp | 54 ++- torch/csrc/jit/ir.h | 15 +- torch/csrc/jit/operator.cpp | 86 +++- torch/csrc/jit/operator.h | 9 +- torch/csrc/jit/passes/peephole.cpp | 94 ++--- torch/csrc/jit/passes/shape_analysis.cpp | 505 ++++++++++++----------- 11 files changed, 476 insertions(+), 345 deletions(-) diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp index f4a0ce4e53bad..f4114c6a3715c 100644 --- a/aten/src/ATen/ExpandUtils.cpp +++ b/aten/src/ATen/ExpandUtils.cpp @@ -29,11 +29,13 @@ std::vector infer_size(IntList a, IntList b) { } std::tuple, std::vector> inferExpandGeometry( - const Tensor& tensor, + IntList tensor_sizes, + IntList tensor_strides, IntList sizes) { int64_t ndim = sizes.size(); + int64_t tensor_dim = tensor_sizes.size(); - if (tensor.dim() == 0) { + if (tensor_dim == 0) { std::vector expandedStrides(ndim, 0); return std::tuple, std::vector>( sizes.vec(), expandedStrides); @@ -44,9 +46,9 @@ std::tuple, std::vector> inferExpandGeometry( // create a new geometry for the tensors for (int64_t i = ndim - 1; i >= 0; --i) { int64_t offset = ndim - 1 - i; - int64_t dim = tensor.dim() - 1 - offset; - int64_t size = (dim >= 0) ? tensor.sizes()[dim] : 1; - int64_t stride = (dim >= 0) ? tensor.strides()[dim] + int64_t dim = tensor_dim - 1 - offset; + int64_t size = (dim >= 0) ? tensor_sizes[dim] : 1; + int64_t stride = (dim >= 0) ? 
tensor_strides[dim] : expandedSizes[i + 1] * expandedStrides[i + 1]; int64_t targetSize = sizes[i]; if (targetSize == -1) { diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 2080e56a4e78b..4ed5d6254320f 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -10,7 +10,8 @@ namespace at { AT_API std::vector infer_size(IntList a, IntList b); -std::tuple, std::vector > inferExpandGeometry(const Tensor &tensor, IntList sizes); +std::tuple, std::vector > inferExpandGeometry( + IntList tensor_sizes, IntList tensor_strides, IntList sizes); // avoid copy-construction of Tensor by using a reference_wrapper. inline void check_defined(std::initializer_list> tensors, const char *api_name) { diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 13887a52633bc..db6a78cbc1652 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -126,7 +126,7 @@ Tensor expand(const Tensor& self, IntList size, bool implicit) { std::vector expandedSizes; std::vector expandedStrides; - std::tie(expandedSizes, expandedStrides) = inferExpandGeometry(self, size); + std::tie(expandedSizes, expandedStrides) = inferExpandGeometry(self.sizes(), self.strides(), size); return self.as_strided(expandedSizes, expandedStrides); } diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index 1b7784916910e..a91532f5af15d 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -90,7 +90,7 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val } else if (node->hasAttribute(attr::alpha)) { return {grads.at(0), grads.at(0) * at::Scalar(node->t(attr::alpha))}; } else { - return {grads.at(0), nullptr, grads.at(0) * node->getValue(attr::alpha)}; + return {grads.at(0), nullptr, grads.at(0) * node->input(attr::alpha)}; } case aten::sub: // o = self - alpha*other @@ -99,7 +99,7 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val } else if (node->hasAttribute(attr::alpha)) { return {grads.at(0), -grads.at(0) * at::Scalar(node->t(attr::alpha))}; } else { - return {grads.at(0), nullptr, grads.at(0) * node->getValue(attr::alpha)}; + return {grads.at(0), nullptr, grads.at(0) * node->input(attr::alpha)}; } case aten::mul: // o = self * other @@ -119,7 +119,7 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val return {grads.at(0) * (outputs.at(0))}; case aten::chunk: case aten::split: - return {SymbolicVariable::cat(grads, node->getValue(attr::dim))}; + return {SymbolicVariable::cat(grads, node->input(attr::dim))}; case aten::t: return {grads.at(0).t()}; case aten::neg: @@ -130,7 +130,7 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val case aten::type_as: return {grads.at(0).type_as(inputs.at(0))}; case aten::unsqueeze: - return {grads.at(0).squeeze(node->getValue(attr::dim))}; + return {grads.at(0).squeeze(node->input(attr::dim))}; case aten::mm: { SymbolicVariable dmat1, dmat2; if (auto type = inputs.at(0).value()->type()->cast()) { diff --git a/torch/csrc/jit/function_schema.h b/torch/csrc/jit/function_schema.h index 99222e88af25f..ec56f6144bfca 100644 --- a/torch/csrc/jit/function_schema.h +++ b/torch/csrc/jit/function_schema.h @@ -85,21 +85,30 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) { inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { // eventually this should look almost identical to python arg parser, but // it is simpler for now to work directly on this 
schema - auto emitList = [&](const std::vector& args) { + + out << schema.name; + out << "("; + + bool seen_kwarg_only = false; + for(size_t i = 0; i < schema.arguments.size(); ++i) { + if (i > 0) out << ", "; + if (schema.arguments[i].kwarg_only && !seen_kwarg_only) { + out << "*, "; + seen_kwarg_only = true; + } + out << schema.arguments[i]; + } + + out << ") -> "; + if (schema.returns.size() == 1) { + out << schema.returns.at(0).type->str(); + } else if (schema.returns.size() > 1) { out << "("; - for(size_t i = 0; i < args.size(); ++i) { - if(i > 0) - out << ", "; - out << args[i]; + for (size_t i = 0; i < schema.returns.size(); ++i) { + if (i > 0) out << ", "; + out << schema.returns[i].type->str(); } out << ")"; - }; - - out << schema.name; - emitList(schema.arguments); - if(schema.returns.size() > 1) { - out << " -> "; - emitList(schema.returns); } return out; } diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 39de66680b9ca..b68cec65cbf20 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -642,6 +642,10 @@ at::optional Node::get(Symbol name) { return getattr()(this, name); } auto inp = findInput(name); + const Argument & arg = inp.second; + if (!inp.first) { + return tensor_as(arg.default_value.value()); + } Node *producer = inp.first->node(); if (producer->kind() != prim::Constant) return at::nullopt; auto value = producer->t(attr::value); @@ -655,25 +659,30 @@ template at::optional> Node::get(Symbol name); at::optional Node::get(Symbol name) { // TODO (apaszke): remove once tracer and compiler stop emitting attributes - if (hasAttributes()) { - throw std::runtime_error("IValue Node::get() not implemented for the attribute case"); + if (hasAttribute(name)) { + switch (kindOf(name)) { + case AttributeKind::i: + return IValue{as_tensor(i(name))}; + case AttributeKind::t: + return IValue{as_tensor(t(name))}; + case AttributeKind::is: + return IValue{as_tensor(is(name))}; + default: + throw std::runtime_error("get() NYI"); + } } auto inp = findInput(name); + const Argument & arg = inp.second; + if (!inp.first) { + return IValue{arg.default_value.value()}; + } Node * producer = inp.first->node(); if (producer->kind() != prim::Constant) return at::nullopt; auto value = producer->t(attr::value); - const Argument & arg = inp.second; - if (arg.type->isSubtypeOf(*DynamicType::get())) { - return IValue{std::move(value)}; - } else if (arg.type->isSubtypeOf(*IntType::get())) { - return IValue{tensor_as(std::move(value))}; - } else if (arg.type->isSubtypeOf(*FloatType::get())) { - return IValue{tensor_as(std::move(value))}; - } - throw std::runtime_error("Unsupported case in Node::get! 
File a bug report."); + return IValue{std::move(value)}; } -Value* Node::getValue(Symbol name) { +Value* Node::input(Symbol name) { // TODO (apaszke): remove once tracer and compiler stop emitting attributes if (hasAttribute(name)) { switch (kindOf(name)) { @@ -687,9 +696,13 @@ Value* Node::getValue(Symbol name) { throw std::runtime_error("getValue() NYI"); } } - return findInput(name).first; + auto inp = findInput(name); + if (inp.first) return inp.first; + return owningGraph()->insertConstant(inp.second.default_value.value()); } +// XXX: the first coordinate can be a nullptr, which means that you should use +// the default value for this arg, because it's optional and missing std::pair Node::findInput(Symbol name) { if (!schema_) { findSchema(); @@ -700,13 +713,26 @@ std::pair Node::findInput(Symbol name) { const auto & arg = schema_->arguments[i]; if (hasAttributeS(arg.name)) continue; if (arg.name == name_str) { - return std::pair(input(input_i), arg); + if (input_i < inputs().size()) { + return std::pair(input(input_i), arg); + } else { + JIT_ASSERT(arg.default_value); + return std::pair(nullptr, arg); + } } input_i++; } throw std::runtime_error(std::string("Couldn't find an argument called ") + name.toQualString()); } +bool Node::matches(const char *signature_literal, at::ArrayRef const_inputs) { + if (!sig(signature_literal).matches(this)) return false; + for (Symbol s : const_inputs) { + if (!is_constant(s)) return false; + } + return true; +} + void Node::findSchema() { schema_ = &getOperatorFor(this).schema; } diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 3595b83aefda6..153828497d1bf 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -398,7 +398,12 @@ struct Node : public Attributes { template at::optional get(Symbol name); at::optional get(Symbol name); - Value* getValue(Symbol name); + Value* input(Symbol name); + + // Returns true if the value of input name is statically known + bool is_constant(Symbol name) { + return static_cast(get(name)); + } // Graphs @@ -657,6 +662,14 @@ struct Node : public Attributes { return static_cast(this); } + // XXX: this function is meant to be used with string literals only! 
+ bool matches(const char *signature_literal, at::ArrayRef const_inputs={}); + + const FunctionSchema& schema() { + if (!schema_) findSchema(); + return *schema_; + } + virtual ~Node() {} private: std::pair findInput(Symbol name); diff --git a/torch/csrc/jit/operator.cpp b/torch/csrc/jit/operator.cpp index 90c43fe7145dd..652eb90bf3797 100644 --- a/torch/csrc/jit/operator.cpp +++ b/torch/csrc/jit/operator.cpp @@ -223,15 +223,80 @@ struct SchemaParser { namespace { +std::string canonicalSchemaString(const FunctionSchema& schema) { + std::ostringstream out; + + out << schema.name; + out << "("; + + bool seen_kwarg_only = false; + for(size_t i = 0; i < schema.arguments.size(); ++i) { + if (i > 0) out << ", "; + if (schema.arguments[i].kwarg_only && !seen_kwarg_only) { + out << "*, "; + seen_kwarg_only = true; + } + const auto & arg = schema.arguments[i]; + out << arg.type->str() << " " << arg.name; + } + + out << ") -> "; + if (schema.returns.size() == 1) { + out << schema.returns.at(0).type->str(); + } else if (schema.returns.size() > 1) { + out << "("; + for (size_t i = 0; i < schema.returns.size(); ++i) { + if (i > 0) out << ", "; + out << schema.returns[i].type->str(); + } + out << ")"; + } + return out.str(); +} + using OperatorMap = std::unordered_map>>; struct OperatorRegistry { OperatorMap operators; std::mutex lock; + // Those two maps are used to implement lookupByLiteral, which is needed for the n->match(...) calls. + // Basically, every function schema is assigned a unique string you can use to match it. However, + // parsing those strings or comparing and hashing them character by character would be very slow, so + // we use a trick here! Every string literal in your program is guaranteed to have static storage + // duration and so its address won't change at runtime. This allows us to memoize answerts for every + // pointer, which is done by the operators_by_sig_literal map. Still, this map is initially + // empty, and so we still need to do the complete string matching at the first time, which is implemented + // by performing a lookup in the operators_by_sig map. 
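A self-contained sketch of the literal-address memoization described in that comment, with toy types in place of the real Operator registry (assumed names, not the PyTorch API):

```cpp
#include <cassert>
#include <string>
#include <unordered_map>

// Toy record standing in for std::shared_ptr<Operator>.
struct Op { std::string canonical; };

std::unordered_map<std::string, Op*> by_signature;         // slow, string-keyed
std::unordered_map<const char*, Op*> by_signature_literal; // fast, pointer-keyed

// Meant to be called with string literals only: a literal's address is stable
// for the lifetime of the program, so the pointer itself can be the cache key.
Op* lookupByLiteral(const char* sig) {
  auto it = by_signature_literal.find(sig);
  if (it == by_signature_literal.end()) {
    // First time we see this pointer: do the full string comparison once.
    auto slow = by_signature.find(sig);
    assert(slow != by_signature.end());
    it = by_signature_literal.emplace_hint(it, sig, slow->second);
  }
  return it->second;
}
```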
+ std::unordered_map> operators_by_sig; + std::unordered_map> operators_by_sig_literal; void registerOperator(Operator&& op){ std::lock_guard guard(lock); + Symbol sym = Symbol::fromQualString(op.schema.name); - operators[sym].push_back(std::make_shared(std::move(op))); + auto op_ptr = std::make_shared(std::move(op)); + + operators[sym].push_back(op_ptr); + + operators_by_sig[canonicalSchemaString(op.schema)] = op_ptr; } + + Operator& lookupByLiteral(const char * name) { + auto it = operators_by_sig_literal.find(name); + if (it == operators_by_sig_literal.end()) { + auto op_ptr_it = operators_by_sig.find(name); + // Handy debugging code that dumps all operators we know about on mismatch +#if 0 + if (op_ptr_it == operators_by_sig.end()) { + for (auto & entry : operators_by_sig) { + std::cout << entry.first << std::endl; + } + } +#endif + JIT_ASSERTM(op_ptr_it != operators_by_sig.end(), "Couldn't find an operator for %s", name); + it = operators_by_sig_literal.emplace_hint(it, name, op_ptr_it->second); + } + return *it->second; + } + const std::vector>& getOperators(Symbol name) { std::lock_guard guard(lock); static std::vector> empty; @@ -242,19 +307,23 @@ struct OperatorRegistry { } }; -OperatorRegistry& getRegsitry() { +OperatorRegistry& getRegistry() { static OperatorRegistry r; return r; } -} +} // anonymous namespace void registerOperator(Operator&& op) { - getRegsitry().registerOperator(std::move(op)); + getRegistry().registerOperator(std::move(op)); } const std::vector>& getAllOperatorsFor(Symbol name) { - return getRegsitry().getOperators(name); + return getRegistry().getOperators(name); +} + +Operator& sig(const char *signature) { + return getRegistry().lookupByLiteral(signature); } FunctionSchema parseSchema(const std::string& schema) { @@ -293,7 +362,10 @@ bool typeMatches(TypePtr actual, TypePtr formal) { return false; } -bool Operator::matchesNode(Node* node) const { +bool Operator::matches(Node* node) const { + if (node->kind().toQualString() != schema.name) { + return false; + } size_t attributes_size = node->numAttributes(); size_t attributes_seen = 0; auto inputs_size = node->inputs().size(); @@ -351,7 +423,7 @@ bool Operator::matchesNode(Node* node) const { std::shared_ptr findOperatorFor(Node* node) { const auto& candidates = getAllOperatorsFor(node->kind()); for(const auto& candidate : candidates) { - if(candidate->matchesNode(node)) { + if(candidate->matches(node)) { return candidate; } } diff --git a/torch/csrc/jit/operator.h b/torch/csrc/jit/operator.h index 9db66cd4c1f7d..50ec147090be3 100644 --- a/torch/csrc/jit/operator.h +++ b/torch/csrc/jit/operator.h @@ -32,11 +32,11 @@ struct Operator { FunctionSchema schema; - bool matchesNode(Node* n) const; + bool matches(Node* n) const; // Operators have different versions depending on if some inputs are encoded // as attributes or inputs. This function returns the right Operation function, // given a node encoded for one variant. - // Behavior is undefined if matchesNode(n) == false + // Behavior is undefined if matches(n) == false Operation selectVariant(Node* n) const { if(n->hasAttributes()) { JIT_ASSERT(op_const_attributes != nullptr); @@ -55,13 +55,16 @@ std::shared_ptr findOperatorFor(Node* node); const Operator& getOperatorFor(Node* node); inline Operation getOperation(Node* node) { - // note: getOperatorFor ensures that getOperatorFor(node).matchesNode(node) == true + // note: getOperatorFor ensures that getOperatorFor(node).matches(node) == true // so the call to selectVariant is always valid. 
return getOperatorFor(node).selectVariant(node); } void registerOperator(Operator&& op); +// XXX: this function is meant to be used with string literals only! +Operator& sig(const char *signature_literal); + struct RegisterOperators { RegisterOperators(std::vector operators) { for(Operator& o : operators) { diff --git a/torch/csrc/jit/passes/peephole.cpp b/torch/csrc/jit/passes/peephole.cpp index 5fd438280a35e..feebbcf2fd505 100644 --- a/torch/csrc/jit/passes/peephole.cpp +++ b/torch/csrc/jit/passes/peephole.cpp @@ -16,67 +16,59 @@ namespace torch { namespace jit { // TODO: Decide what kind of fixed point strategy we will have void PeepholeOptimize(Block * block) { for (auto it = block->nodes().begin(); it != block->nodes().end(); ++it) { - auto* n = *it; + auto* node = *it; - for (Block * sub_block : n->blocks()) { + for (Block * sub_block : node->blocks()) { PeepholeOptimize(sub_block); } // XXX: remember that if you want to simplify an expression by combining multiple nodes // into a different one, then you need to check that they all belong to the given block - switch (n->kind()) { - case aten::expand: { - // Eliminate redundant expand - if (!n->input()->isTensor()) break; - // the sizes are dynamic - if(n->inputs().size() != 1) break; - if (n->get>(attr::size) == n->input()->type()->expect()->sizes()) { - n->output()->replaceAllUsesWith(n->input()); - // Let DCE clean up any unused nodes at this point + if (node->matches("aten::expand(Tensor self, int[] size, *, int implicit) -> Tensor", + /*with_const=*/attr::size)) { + // x.expand(x.size()) == x + if (auto input_type = node->input(attr::self)->type()->cast()) { + auto expanded_sizes = node->get>(attr::size); + if (expanded_sizes == input_type->sizes()) { + node->output()->replaceAllUsesWith(node->input()); } - } break; - case aten::t: { - // x.t().t() == x - auto input_node = n->input()->node(); - if (input_node->kind() == aten::t) { - n->output()->replaceAllUsesWith(input_node->input()); - // Let DCE clean up any unused nodes at this point - } - } break; - case aten::type_as: { - JIT_ASSERT(n->inputs().size() == 2); - Value *lhs = n->input(0); - Value *rhs = n->input(1); - // If LHS and RHS have the same static type, remove the type_as operator. - if (lhs->type()->kind() == TypeKind::TensorType && - rhs->type()->kind() == TypeKind::TensorType) { - auto ltype = (*lhs->type()).cast(); - auto rtype = (*rhs->type()).cast(); - if(ltype->device() == rtype->device() && - ltype->scalarType() == rtype->scalarType()) { - n->output()->replaceAllUsesWith(lhs); - } - } - } break; - case aten::add: { - // mm + add == addmm - if (n->inputs().size() == 2 && - n->get(attr::alpha) && - tensor_as(*n->get(attr::alpha)) == 1. 
&& - n->input(1)->node()->kind() == aten::mm) { - WithInsertPoint guard(n); + } + } else if (node->matches("aten::t(Tensor self) -> Tensor")) { + // x.t().t() == x + Node *input_node = node->input()->node(); + if (input_node->matches("aten::t(Tensor self) -> Tensor")) { + node->output()->replaceAllUsesWith(input_node->input()); + } + } else if (node->matches("aten::type_as(Tensor self, Tensor other) -> Tensor")) { + // x.type_as(y) == x iff x.type() == y.type() + auto self_type = node->input(0)->type()->cast(); + auto other_type = node->input(1)->type()->cast(); + if (self_type && other_type && + self_type->scalarType() == other_type->scalarType() && + self_type->device() == other_type->device()) { + node->output()->replaceAllUsesWith(node->input(0)); + } + } else if (node->matches("aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", + /*with_const=*/attr::alpha)) { + // z + x.mm(y) == z.addmm(x, y) == x.mm(y) + z + if (tensor_as(node->get(attr::alpha).value()) == 1.) { + // Look for mm from both sides of the add + for (size_t mm_side = 0; mm_side < 2; mm_side++) { + if (node->input(mm_side)->node()->matches("aten::mm(Tensor self, Tensor mat2) -> Tensor")) { + WithInsertPoint guard(node); - auto input_node = n->input(1)->node(); - SymbolicVariable mat(n->input(0)); - SymbolicVariable mat1(input_node->input(0)); - SymbolicVariable mat2(input_node->input(1)); - SymbolicVariable addmm_value = mat.addmm(mat1, mat2); + auto mm_node = node->input(mm_side)->node(); + SymbolicVariable add_mat(node->input(1 - mm_side)); + SymbolicVariable mat1(mm_node->input(0)); + SymbolicVariable mat2(mm_node->input(1)); + SymbolicVariable addmm_value = add_mat.addmm(mat1, mat2); - // Copy shape information from output node - ((Value*)addmm_value)->copyMetadata(n->output()); - n->output()->replaceAllUsesWith(addmm_value); + // Copy shape information from output node + ((Value*)addmm_value)->copyMetadata(node->output()); + node->output()->replaceAllUsesWith(addmm_value); + } } - } break; + } } } } diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 5fe1358a0010b..1775a57326fc5 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -20,12 +20,20 @@ struct propagation_error : std::exception {}; #define SHAPE_ASSERT(cond) if (!(cond)) throw propagation_error() namespace { + void setDynamicType(Node * node) { for(auto o : node->outputs()) { o->setType(DynamicType::get()); } } +int64_t wrapDim(int64_t dim, at::IntList sizes) { + if (dim < 0) { + dim += sizes.size(); + } + return dim; +} + at::Tensor representativeTensor(const TensorType * type) { auto backend = type->device() == -1 ? at::kCPU : at::kCUDA; at::DeviceGuard device_guard(type->device()); @@ -35,16 +43,39 @@ at::Tensor representativeTensor(const TensorType * type) { void PropagateShapeOnBlock(Block * block, bool insert_expands=true); -std::pair, bool> gatherTypes(at::ArrayRef values) { - std::vector types; - bool present = true; - for(auto v : values) { - TensorType* type = v->type()->cast(); - if(!type) - present = false; - types.push_back(type); +at::optional> gatherTensorTypes(Node *node) { + std::vector tensor_types; + tensor_types.reserve(node->inputs().size()); + // TODO (apaszke): Remove once we stop using attributes + // XXX: we also make the exception for cat, because we need shape prop to work for it + // (we have tests). We'll have to remove the special case once we stop flattening lists into inputs. 
+ if (node->hasAttributes() || node->kind() == aten::cat) { + std::vector inputs = node->inputs(); + if (node->kind() == aten::cat && inputs.back()->type()->isSubtypeOf(*IntType::get())) { + inputs.pop_back(); + } + for (Value *v : inputs) { + TensorType* type = v->type()->cast(); + if(!type) return at::nullopt; + tensor_types.push_back(type); + } + } else { + auto & schema = node->schema(); + auto & args = schema.arguments; + // XXX: This gets triggered for nodes that have Tensor[] as arguments. + // Those are currently very annoying to handle, because the lists are simply + // inlined into the node inputs, so we bail out from shape propagation for now. + if (schema.is_vararg || args.size() != node->inputs().size()) { + return at::nullopt; + } + for (size_t i = 0; i < node->inputs().size(); ++i) { + if (!args[i].type->isSubtypeOf(*DynamicType::get())) continue; + TensorType *type = node->input(i)->type()->cast(); + if (!type) return at::nullopt; + tensor_types.push_back(type); + } } - return std::make_pair(std::move(types), present); + return tensor_types; } bool mergeTypes(ArrayRef lhs, ArrayRef rhs, ArrayRef outputs) { @@ -65,9 +96,8 @@ bool mergeTypes(ArrayRef lhs, ArrayRef rhs, ArrayRef out void PropagateShapeOnNode(Node * node, bool insert_expands=true); -void broadcastPointwise(Node *node, std::vector& types) { - JIT_ASSERT(types.size() == 2); - auto expected_size = at::infer_size(types[0]->sizes(), types[1]->sizes()); +void broadcastBinary(Node *node, std::vector& types, size_t idx1, size_t idx2) { + auto expected_size = at::infer_size(types[idx1]->sizes(), types[idx2]->sizes()); auto broadcast = [&](size_t input_idx) { TensorType* input_type = types.at(input_idx); if (input_type->sizes() == expected_size) @@ -80,18 +110,42 @@ void broadcastPointwise(Node *node, std::vector& types) { PropagateShapeOnNode(expand); node->replaceInput(input_idx, expand->output()); }; - broadcast(0); - broadcast(1); - types[0] = node->inputs().at(0)->type()->expect(); - types[1] = node->inputs().at(1)->type()->expect(); + broadcast(idx1); + broadcast(idx2); + types[0] = node->inputs().at(idx1)->type()->expect(); + types[1] = node->inputs().at(idx2)->type()->expect(); } void PropagateShapeOnNodeByRunningIt(Node* node, const std::vector& types) { auto op = getOperation(node); Stack stack; - for(auto & type : types) { - stack.push_back(representativeTensor(type)); + size_t types_i = 0; + // TODO (apaszke): remove once we stop using attributes + if (node->hasAttributes()) { + for (auto & type : types) { + stack.push_back(representativeTensor(type)); + } + // TODO (apaszke): remove once aten::cat is saner (see first XXX in gatherTensorTypes) + } else if (node->kind() == aten::cat) { + for (auto & type : types) { + stack.push_back(representativeTensor(type)); + } + stack.push_back(node->get(attr::dim).value()); + } else { + JIT_ASSERT(node->schema().arguments.size() == node->inputs().size()); + for (const auto & arg : node->schema().arguments) { + if (arg.type->isSubtypeOf(*DynamicType::get())) { + stack.emplace_back(representativeTensor(types[types_i++])); + } else { + auto maybe_val = node->get(Symbol::attr(arg.name)); + if (!maybe_val) { + setDynamicType(node); + return; + } + stack.push_back(std::move(*maybe_val)); + } + } } // XXX: we're not catching any exceptions from the op for now. 
This @@ -101,15 +155,13 @@ void PropagateShapeOnNodeByRunningIt(Node* node, const std::vector& op(stack); JIT_ASSERT(stack.size() == node->outputs().size()); - for(size_t i = 0; i < stack.size(); ++i) { + for (size_t i = 0; i < stack.size(); ++i) { node->outputs()[i]->inferTypeFrom(stack[i].toTensor()); } } void PropagateShapeOnNode(Node * node, bool insert_expands) { - using AKind = AttributeKind; - // These don't require the types and present flag. Return early after we - // process them + // These don't require the types, and have complicated schema. Return early after we process them. switch(node->kind()) { case prim::If: { auto then_block = node->blocks().at(0); @@ -146,241 +198,202 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { } return; } - default: ; // fall-through + case prim::NumToTensor: + case prim::TensorToNum: { + node->output()->setType(node->inputs()[0]->type()); + return; + } + case prim::Constant: { + node->output()->inferTypeFrom(node->t(attr::value)); + return; + } + case prim::PythonOp: + case prim::Print: + case prim::Undefined: { + setDynamicType(node); + return; + } + default: + break; // fall-through } - std::vector types; - bool present; - // TODO (apaszke): This currently assumes that inputs are pretty much tensor-only. Need to fix that. - std::tie(types, present) = gatherTypes(node->inputs()); - if(!present) { + + auto maybe_tensor_types = gatherTensorTypes(node); + if (!maybe_tensor_types) { return setDynamicType(node); } + auto & tensor_types = *maybe_tensor_types; - bool handled = false; - // XXX: real attributes of node can be a superset of attrs - // XXX: if this returns true then you are obliged to set the types - auto check_overload = [&](size_t num_inputs, size_t num_outputs, - std::vector> attrs) { - JIT_ASSERT(!handled); - if (node->inputs().size() != num_inputs) return false; - if (node->outputs().size() != num_outputs) return false; - for (auto & attr : attrs) { - if (!node->hasAttribute(attr.second)) return false; - if (node->kindOf(attr.second) != attr.first) return false; - } - handled = true; - return true; - }; - - // TODO (apaszke): Those currently assume that a lot of stuff is constant, which might - // not be the case if they are produced by script. - switch(node->kind()) { - // For expensive ops we can directly encode their shape propagation - // here, otherwise we fallback to running a fake version of the op - // to get a quick and dirty propagation. - case aten::add: - case aten::sub: - case aten::mul: - case aten::div: - case aten::pow: - case aten::min: - case aten::max: - case aten::lt: - case aten::le: - case aten::gt: - case aten::ge: - case aten::eq: - case aten::ne: { - if (node->inputs().size() == 2 && insert_expands) { - broadcastPointwise(node, types); - } - // NB: we don't handle the nodes in any other way, because the type casting - // logic in scalar cases is non-trivial. It's better to just run them. 
- } break; - case aten::neg: { - if (!check_overload(/*num_inputs=*/1, /*num_outputs=*/1, {})) break; - node->output()->setType(types.at(0)->contiguous()); - } break; - case aten::mm: { - if (!check_overload(/*num_inputs=*/2, /*num_outputs=*/1, {})) break; - auto lhs_type = types.at(0); - auto rhs_type = types.at(1); - SHAPE_ASSERT(lhs_type->sizes().size() == 2 && rhs_type->sizes().size() == 2); - node->output()->setType(std::make_shared( - lhs_type->scalarType(), lhs_type->device(), - at::IntList{lhs_type->sizes().at(0), rhs_type->sizes().at(1)})); - } break; - case aten::t: { - if (!check_overload(/*num_inputs=*/1, /*num_outputs=*/1, {})) break; - auto tp = types.at(0); - auto sizes = tp->sizes(); - auto strides = tp->strides(); - SHAPE_ASSERT(sizes.size() == 2); - std::swap(sizes.at(0), sizes.at(1)); - std::swap(strides.at(0), strides.at(1)); - node->output()->setType(tp->withSizesStrides(sizes, strides)); - } break; - case aten::narrow: { - if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, - {{AKind::i, attr::dim}, - {AKind::i, attr::length}})) { - auto tp = types.at(0); - auto sizes = tp->sizes(); - int64_t dim = node->i(attr::dim); - int64_t length = node->i(attr::length); - SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); - sizes.at(dim) = length; - node->output()->setType(tp->withSizesStrides(sizes, tp->strides())); - } - } break; - case aten::sum: { - if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, - {{AKind::is, attr::dim}, - {AKind::i, attr::keepdim}})) { - auto tp = types.at(0); - auto sizes = tp->sizes(); - auto dims = node->get>(attr::dim).value(); - bool keepdim = node->get(attr::keepdim).value(); - std::reverse(dims.begin(), dims.end()); - for (int64_t dim : dims) { - SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); - if (keepdim) { - sizes.at(dim) = 1; - } else { - sizes.erase(sizes.begin() + dim); - } - } - node->output()->setType(tp->withSizes(sizes)); - } else if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, {})) { - node->output()->setType(types.at(0)->withSizes({})); - } - } break; - case aten::squeeze: { - if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, - {{AKind::i, attr::dim}})) { - auto tp = types.at(0); - auto sizes = tp->sizes(); - auto strides = tp->strides(); - int64_t dim = node->get(attr::dim).value(); - SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); - if (sizes.at(dim) == 1) { - sizes.erase(sizes.begin() + dim); - strides.erase(strides.begin() + dim); - } - node->output()->setType(tp->withSizesStrides(sizes, strides)); + // For expensive ops we can directly encode their shape propagation + // here, otherwise we fallback to running a fake version of the op + // to get a quick and dirty propagation. 
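Among the analytic rules that follow, aten::view infers a single -1 entry in the requested size from the input's element count. That inference in isolation, as a hypothetical helper rather than the actual JIT code:

```cpp
#include <cstdint>
#include <stdexcept>
#include <vector>

// Fill in at most one -1 entry so that the product of sizes equals numel.
std::vector<int64_t> inferViewSizes(int64_t numel, std::vector<int64_t> sizes) {
  int64_t known_product = 1;
  int64_t inferred_idx = -1;
  for (size_t i = 0; i < sizes.size(); ++i) {
    if (sizes[i] == -1) {
      if (inferred_idx != -1) throw std::runtime_error("only one -1 allowed");
      inferred_idx = static_cast<int64_t>(i);
    } else {
      known_product *= sizes[i];
    }
  }
  if (inferred_idx != -1) {
    if (known_product == 0 || numel % known_product != 0)
      throw std::runtime_error("shape is incompatible with input");
    sizes[inferred_idx] = numel / known_product;
  } else if (known_product != numel) {
    throw std::runtime_error("shape is incompatible with input");
  }
  return sizes;
}
```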
+ if (insert_expands && ( + node->matches("aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor") || + node->matches("aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor") || + node->matches("aten::mul(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::div(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::pow(Tensor self, Tensor exponent) -> Tensor") || + node->matches("aten::min(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::max(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::lt(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::gt(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::ge(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::eq(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::ne(Tensor self, Tensor other) -> Tensor"))) { + // Binary broadcasting ops + // NB: we don't handle the nodes in any other way (note the lack of return!), + // because the type casting logic in scalar cases is non-trivial. + // It's better to just run them. + broadcastBinary(node, tensor_types, 0, 1); + } else if (node->matches("aten::neg(Tensor self) -> Tensor")) { + node->output()->setType(tensor_types.at(0)->contiguous()); + return; + } else if (node->matches("aten::mm(Tensor self, Tensor mat2) -> Tensor")) { + auto lhs_type = tensor_types.at(0); + auto rhs_type = tensor_types.at(1); + SHAPE_ASSERT(lhs_type->sizes().size() == 2 && rhs_type->sizes().size() == 2); + node->output()->setType(std::make_shared( + lhs_type->scalarType(), lhs_type->device(), + at::IntList{lhs_type->sizes().at(0), rhs_type->sizes().at(1)})); + return; + } else if (node->matches("aten::t(Tensor self) -> Tensor")) { + auto tp = tensor_types.at(0); + auto sizes = tp->sizes(); + auto strides = tp->strides(); + SHAPE_ASSERT(sizes.size() == 2); + std::swap(sizes.at(0), sizes.at(1)); + std::swap(strides.at(0), strides.at(1)); + node->output()->setType(tp->withSizesStrides(sizes, strides)); + return; + } else if (node->matches("aten::narrow(Tensor self, int dim, int start, int length) -> Tensor", + /*with_const=*/{attr::dim, attr::length})) { + auto tp = tensor_types.at(0); + auto sizes = tp->sizes(); + int64_t dim = node->get(attr::dim).value(); + int64_t length = node->get(attr::length).value(); + SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); + sizes.at(dim) = length; + node->output()->setType(tp->withSizesStrides(sizes, tp->strides())); + return; + } else if (node->matches("aten::sum(Tensor self) -> Tensor")) { + node->output()->setType(tensor_types.at(0)->withSizes({})); + return; + } else if (node->matches("aten::sum(Tensor self, int[] dim, int keepdim) -> Tensor", + /*with_const=*/{attr::dim, attr::keepdim})) { + auto & tp = tensor_types.at(0); + auto sizes = tp->sizes(); + auto dims = node->get>(attr::dim).value(); + bool keepdim = node->get(attr::keepdim).value(); + std::reverse(dims.begin(), dims.end()); + for (int64_t dim : dims) { + SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); + if (keepdim) { + sizes.at(dim) = 1; + } else { + sizes.erase(sizes.begin() + dim); } - } break; - case aten::unsqueeze: { - if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, - {{AKind::i, attr::dim}})) { - auto tp = types.at(0); - auto sizes = tp->sizes(); - auto strides = tp->strides(); - int64_t dim = node->get(attr::dim).value(); - SHAPE_ASSERT(dim >= 0 && static_cast(dim) <= sizes.size()); - sizes.insert(sizes.begin() + dim, 1); - strides.insert(strides.begin() + dim, 1); - 
node->output()->setType(tp->withSizesStrides(sizes, strides)); + } + node->output()->setType(tp->withSizes(sizes)); + return; + } else if (node->matches("aten::squeeze(Tensor self, int dim) -> Tensor", /*with_const=*/attr::dim)) { + auto & tp = tensor_types.at(0); + auto sizes = tp->sizes(); + auto strides = tp->strides(); + int64_t dim = wrapDim(node->get(attr::dim).value(), sizes); + SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); + if (sizes.at(dim) == 1) { + sizes.erase(sizes.begin() + dim); + strides.erase(strides.begin() + dim); + } + node->output()->setType(tp->withSizesStrides(sizes, strides)); + return; + } else if (node->matches("aten::unsqueeze(Tensor self, int dim) -> Tensor", /*with_const=*/attr::dim)) { + auto & tp = tensor_types.at(0); + auto sizes = tp->sizes(); + auto strides = tp->strides(); + int64_t dim = wrapDim(node->get(attr::dim).value(), sizes); + SHAPE_ASSERT(dim >= 0 && static_cast(dim) <= sizes.size()); + int64_t new_stride = dim >= static_cast(sizes.size()) ? 1 : sizes.at(dim) * strides.at(dim); + sizes.insert(sizes.begin() + dim, 1); + strides.insert(strides.begin() + dim, new_stride); + node->output()->setType(tp->withSizesStrides(sizes, strides)); + return; + } else if (node->matches("aten::view(Tensor self, int[] size) -> Tensor", /*with_const=*/attr::size)) { + auto sizes = node->get>(attr::size).value(); + bool inferred = false; + size_t inferred_idx; + int64_t size_product = 1; + for (size_t i = 0; i < sizes.size(); ++i) { + if (sizes[i] == -1) { + if (inferred) throw propagation_error(); + inferred = true; + inferred_idx = i; + } else { + size_product *= sizes[i]; } - } break; - case aten::view: { - if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, - {{AKind::is, attr::size}})) { - auto sizes = node->get>(attr::size).value(); - bool inferred = false; - size_t inferred_idx; - int64_t size_product = 1; - for (size_t i=0; ioutput()->setType(types.at(0)->withSizes(sizes)); - } - } break; - case aten::expand: { - if(check_overload(/*num_inputs=*/1, /*num_outputs=*/1, - {{AKind::is, attr::size}})) { - // it is safe to run this, even if we have an integer input tensor - PropagateShapeOnNodeByRunningIt(node, types); - } - } break; - case aten::index_select: { - if(check_overload(/*num_inputs=*/2, /*num_outputs=*/1, - {{AKind::i, attr::dim}})) { - auto ten = types.at(0); - auto index = types.at(1); - int64_t dim = node->get(attr::dim).value(); - SHAPE_ASSERT(index->sizes().size() == 1); - SHAPE_ASSERT(dim >= 0 && static_cast(dim) < ten->sizes().size()); - std::vector sizes = ten->sizes(); - sizes[dim] = index->sizes()[0]; - node->output()->setType(ten->withSizes(sizes)); - } - } break; - case prim::Constant: { - node->output()->inferTypeFrom(node->t(attr::value)); - handled = true; - } break; - case prim::TensorToNum: - case prim::NumToTensor: { - node->output()->setType(node->inputs()[0]->type()); - handled = true; - } break; - case prim::Undefined: { - node->output()->setType(DynamicType::get()); - handled = true; - } break; - case prim::PythonOp: { - setDynamicType(node); - handled = true; - } break; - case prim::Print: { - setDynamicType(node); - handled = true; - } break; - case onnx::Shape: { - if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, {})) { - std::vector dim_vec = {(int64_t)types.at(0)->sizes().size()}; - at::IntList dims(dim_vec); - node->output()->setType( - std::make_shared(at::kLong, -1, dims)); - } - } break; - case onnx::Reshape: { - setDynamicType(node); - handled = true; + if (inferred) { + SHAPE_ASSERT(size_product 
!= 0); + size_t numel = 1; + for (int64_t s : tensor_types.at(0)->sizes()) + numel *= s; + int64_t inferred_size = numel / size_product; + sizes[inferred_idx] = inferred_size; + } + node->output()->setType(tensor_types.at(0)->withSizes(sizes)); + return; + } else if (node->matches("aten::type_as(Tensor self, Tensor other) -> Tensor")) { + if (tensor_types.at(0)->scalarType() == tensor_types.at(1)->scalarType()) { + node->output()->setType(node->input(attr::self)->type()); + } else { + // This will be a copy, so the result will be contiguous + node->output()->setType(tensor_types.at(1)->withSizes(tensor_types.at(0)->sizes())); } - default: { - } break; + return; + } else if (node->matches("aten::expand(Tensor self, int[] size, *, int implicit) -> Tensor", + /*with_const=*/attr::size)) { + auto tp = tensor_types.at(0); + std::vector sizes, strides; + std::tie(sizes, strides) = at::inferExpandGeometry( + tp->sizes(), tp->strides(), node->get>(attr::size).value()); + node->output()->setType(tp->withSizesStrides(sizes, strides)); + return; + } else if (node->matches("aten::index_select(Tensor self, int dim, Tensor index) -> Tensor", + /*with_const=*/attr::dim)) { + auto ten = tensor_types.at(0); + auto index = tensor_types.at(1); + int64_t dim = node->get(attr::dim).value(); + SHAPE_ASSERT(index->sizes().size() == 1); + SHAPE_ASSERT(dim >= 0 && static_cast(dim) < ten->sizes().size()); + std::vector sizes = ten->sizes(); + sizes[dim] = index->sizes()[0]; + node->output()->setType(ten->withSizes(sizes)); + return; + } else if (node->kind() == onnx::Shape) { + SHAPE_ASSERT(node->inputs().size() == 1 && node->outputs().size() == 1); + std::vector dim_vec = {(int64_t)tensor_types.at(0)->sizes().size()}; + at::IntList dims(dim_vec); + node->output()->setType( + std::make_shared(at::kLong, -1, dims)); + return; + } else if (node->kind() == onnx::Reshape) { + setDynamicType(node); + return; } - // If we haven't manage to handle the op so far, we fall back to inferring the + // If we haven't managed to handle the op so far, we fall back to inferring the // shapes by doing an example run of the op (if we can). - if (!handled) { - // Integral typed inputs are often an indicator that we're indexing into - // a tensor, so we should special-case these ops in the shape propagation. - // Additionally, passing in a zero representative tensor into an integer - // division op causes divide-by-zero errors - bool shape_inferenceable = !std::any_of(types.begin(), types.end(), [](TensorType* t){ - return at::isIntegralType(t->scalarType()); - }); - if (node->kind() == aten::type_as || shape_inferenceable ) { - PropagateShapeOnNodeByRunningIt(node, types); - } else { - setDynamicType(node); - } + // Integral typed inputs are often an indicator that we're indexing into + // a tensor, so we should special-case these ops in the shape propagation. 
+ // Additionally, passing in a zero representative tensor into an integer + // division op causes divide-by-zero errors + bool shape_inferenceable = !std::any_of(tensor_types.begin(), tensor_types.end(), [](TensorType* t){ + return at::isIntegralType(t->scalarType()); + }); + if (shape_inferenceable) { + PropagateShapeOnNodeByRunningIt(node, tensor_types); + } else { + setDynamicType(node); } } From f3d72b2101d89fe9a9ab06e67e4c4af12d2e730f Mon Sep 17 00:00:00 2001 From: Di Yu Date: Sun, 22 Jul 2018 00:08:34 -0700 Subject: [PATCH 20/23] Modify barrier net to allow better control over its initialization and execution in DPM (#9665) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9665 In data_parallel_model, we isolate synchronizing barrier init net into its own from the param_init_net, so that we could have finer granularity of control over the barrier net. Reviewed By: andrewwdye Differential Revision: D8375389 fbshipit-source-id: ce0c8c1c8e4bd82b7078a1b07abaced3f149d578 --- caffe2/python/data_parallel_model.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index b49a7952710f3..60e5c39bed131 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -687,15 +687,18 @@ def _InitializeParamUpdate(gpu_id): ] _AddBarrierToModelNets(model_helper_obj, barrier_net_timeout_sec) +def CreateNet(model, overwrite=False): + for net_iters in model._data_parallel_model_nets: + if isinstance(net_iters, tuple): + workspace.CreateNet(net_iters[0], overwrite=overwrite) + else: + workspace.CreateNet(net_iters, overwrite=overwrite) + def RunInitNet(model): for init_net in model._data_parallel_model_init_nets: workspace.RunNetOnce(init_net) - for net_iters in model._data_parallel_model_nets: - if isinstance(net_iters, tuple): - workspace.CreateNet(net_iters[0]) - else: - workspace.CreateNet(net_iters) + CreateNet(model) def RunWarmup(model): @@ -720,8 +723,14 @@ def _AddBarrierToModelNets(model, barrier_net_timeout_sec): # (_DEFAULT_TIMEOUT_SEC). # We pass in model.param_init_net so that the barrier net can be run as # part of the param_init_net. - model._barrier_net = _CreateBarrierNet(model, model.param_init_net, - "pre_training", barrier_net_timeout_sec) + + model._barrier_init_net = core.Net("barrier_init_net") + + model._barrier_net = _CreateBarrierNet(model, model._barrier_init_net, + "pre_training", barrier_net_timeout_sec) + + model._data_parallel_model_init_nets.insert(0, model._barrier_init_net) + model._data_parallel_model_nets.insert(0, model._barrier_net) From 1afdc57ed89984b704d0dc5357851cd9f8c50553 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Sun, 22 Jul 2018 09:02:58 -0700 Subject: [PATCH 21/23] Hide all other fields in THTensor (#9683) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9683 This pops off `refcount_`, `storage_`, `storage_offset_`; there are now no more direct accesses to these fields and we can make them private (with appropriate friending). 
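A rough sketch of the accessor pattern involved, with simplified stand-ins rather than the real THTensor/THStorage types:

```cpp
#include <atomic>
#include <cstddef>

struct Storage;  // opaque stand-in

// Fields become private; call sites use member helpers or friended free
// functions, so the representation can change without touching callers.
struct Tensor {
  void retain() { ++refcount_; }
  void release() { if (--refcount_ == 0) delete this; }
  std::ptrdiff_t storage_offset() const { return storage_offset_; }

 private:
  friend Storage* Tensor_getStoragePtr(const Tensor* t);
  std::atomic<int> refcount_{1};
  Storage* storage_ = nullptr;
  std::ptrdiff_t storage_offset_ = 0;
};

// Non-retaining accessor in the spirit of THTensor_getStoragePtr.
inline Storage* Tensor_getStoragePtr(const Tensor* t) {
  return t->storage_;
}
```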
Stacked on #9561 Pull Request resolved: https://github.com/pytorch/pytorch/pull/9591 Reviewed By: SsnL Differential Revision: D8922246 Pulled By: ezyang fbshipit-source-id: dfae023d790e29ce652e2eab9a1628bbe97b318d --- aten/src/ATen/templates/TensorDense.cpp | 2 +- aten/src/ATen/templates/TensorDerived.cpp | 10 ++- aten/src/TH/THTensor.cpp | 9 +-- aten/src/TH/THTensor.h | 5 ++ aten/src/TH/THTensor.hpp | 48 +++++++++--- aten/src/TH/THTensorApply.h | 14 ++-- aten/src/TH/THTensorDimApply.h | 12 +-- aten/src/TH/generic/THTensor.cpp | 75 +++++++++---------- aten/src/TH/generic/THTensorFastGetSet.hpp | 20 ++--- aten/src/TH/generic/THTensorLapack.cpp | 8 +- aten/src/TH/generic/THTensorMath.cpp | 4 +- aten/src/TH/generic/THTensorRandom.cpp | 34 ++++----- aten/src/THC/THCTensor.cpp | 43 ++++++----- aten/src/THC/generic/THCTensor.cpp | 39 +++++----- aten/src/THC/generic/THCTensorCopy.cpp | 4 +- aten/src/THC/generic/THCTensorCopy.cu | 2 +- aten/src/THC/generic/THCTensorMath.cu | 4 +- aten/src/THC/generic/THCTensorMathMagma.cu | 10 +-- .../THCUNN/generic/SpatialConvolutionLocal.cu | 16 ++-- .../THCUNN/generic/SpatialConvolutionMM.cu | 6 +- .../src/THCUNN/generic/TemporalConvolution.cu | 48 ++++++------ .../THCUNN/generic/TemporalRowConvolution.cu | 6 +- aten/src/THNN/generic/SparseLinear.c | 6 +- .../THNN/generic/SpatialConvolutionLocal.c | 16 ++-- aten/src/THNN/generic/SpatialConvolutionMM.c | 14 ++-- aten/src/THNN/generic/TemporalConvolution.c | 48 ++++++------ .../src/THNN/generic/TemporalRowConvolution.c | 14 ++-- .../THNN/generic/VolumetricConvolutionMM.c | 14 ++-- 28 files changed, 279 insertions(+), 252 deletions(-) diff --git a/aten/src/ATen/templates/TensorDense.cpp b/aten/src/ATen/templates/TensorDense.cpp index 9f977d50ead2b..1ca2cda09fa7d 100644 --- a/aten/src/ATen/templates/TensorDense.cpp +++ b/aten/src/ATen/templates/TensorDense.cpp @@ -8,7 +8,7 @@ IntList ${Tensor}::strides() const { Scalar ${Tensor}::localScalar() { int64_t numel = ${THTensor}_nElement(${state,}tensor); AT_CHECK(numel == 1,"a Tensor with ", numel, " elements cannot be converted to Scalar"); - return Scalar(${to_at_type}(${THStorage}_get(${state,}tensor->storage, tensor->storageOffset))); + return Scalar(${to_at_type}(${THStorage}_get(${state,} THTensor_getStoragePtr(tensor), tensor->storage_offset()))); } std::unique_ptr ${Tensor}::storage() { auto storage = ${THTensor}_storage(${state,}tensor); diff --git a/aten/src/ATen/templates/TensorDerived.cpp b/aten/src/ATen/templates/TensorDerived.cpp index d89e84ee5e702..70f2cc260b62f 100644 --- a/aten/src/ATen/templates/TensorDerived.cpp +++ b/aten/src/ATen/templates/TensorDerived.cpp @@ -22,8 +22,9 @@ namespace at { : TensorImpl(&context->getType(Backend::${Backend},ScalarType::${ScalarName})), tensor(tensor), context(context) {} + ${Tensor}::~${Tensor}() { - ${THTensor}_free(${state,} tensor); + if (tensor) tensor->release(); } const char * ${Tensor}::toString() const { @@ -46,13 +47,14 @@ const char * ${Tensor}::typeString() { return "${Type}"; } void * ${Tensor}::unsafeGetTH(bool retain) { - if (retain) - ${THTensor}_retain(${state,} tensor); + if (retain) { + tensor->retain(); + } return tensor; } void ${Tensor}::release_resources() { - ${THTensor}_free(${state,} tensor); + tensor->release(); tensor = nullptr; } diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 48ddcd2c57ba1..5c6bdb48bd936 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -40,13 +40,8 @@ void THTensor_free(THTensor *self) { - if(!self) - return; - - 
if(--self->refcount == 0) - { - delete self; - } + if (!self) return; + self->release(); } // On a high level, diff --git a/aten/src/TH/THTensor.h b/aten/src/TH/THTensor.h index 3335a6f5d8cc5..217d1fdce1253 100644 --- a/aten/src/TH/THTensor.h +++ b/aten/src/TH/THTensor.h @@ -6,6 +6,11 @@ #define THTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME) +#ifdef __cplusplus +struct THTensor; +THStorage* THTensor_getStoragePtr(const THTensor* tensor); +#endif + /* basics */ #include "generic/THTensor.h" #include "THGenerateAllTypes.h" diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index bc9f23ee5e3ce..0962803e3821a 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -12,26 +12,26 @@ struct THTensor { THTensor(THStorage* storage) - : refcount(1) - , storage(storage) - , storageOffset(0) + : refcount_(1) + , storage_(storage) + , storage_offset_(0) , sizes_{0} , strides_{1} , dim_(1) {} ~THTensor() { - if (storage) { - THStorage_free(storage); + if (storage_) { + THStorage_free(storage_); } } - std::atomic refcount; + std::atomic refcount_; // Note: storage->size may be greater than the recorded size // of a tensor - THStorage *storage; - ptrdiff_t storageOffset; + THStorage *storage_; + ptrdiff_t storage_offset_; std::vector sizes_; std::vector strides_; @@ -39,12 +39,12 @@ struct THTensor template inline T * data() const { - return storage->data() + storageOffset; + return storage_->data() + storage_offset_; } template inline T * unsafe_data() const { - return storage->unsafe_data() + storageOffset; + return storage_->unsafe_data() + storage_offset_; } // [NOTE: _dim() vs dim()] @@ -58,6 +58,10 @@ struct THTensor return dim_; } + ptrdiff_t storage_offset() const { + return storage_offset_; + } + // represents that numel() == 0. 
inline bool is_empty() const { for (int64_t i = 0; i < dim_; ++i) { @@ -85,6 +89,16 @@ struct THTensor inline at::IntList strides() { return strides_; } + + void retain() { + ++refcount_; + } + + void release() { + if(--refcount_ == 0) { + delete this; + } + } }; #include "generic/THTensorFastGetSet.hpp" @@ -120,6 +134,20 @@ inline void THTensor_setStrideAtDim(THTensor* tensor, int dim, int64_t new_strid tensor->strides_[dim] = new_stride; } +inline void THTensor_setStorageOffset(THTensor* tensor, ptrdiff_t storage_offset) { + tensor->storage_offset_ = storage_offset; +} + +// NB: Non-retaining +inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) { + return tensor->storage_; +} + +// NB: Steals ownership of storage +inline void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) { + tensor->storage_ = storage; +} + TH_API void THTensor_free(THTensor *self); at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList newshape); diff --git a/aten/src/TH/THTensorApply.h b/aten/src/TH/THTensorApply.h index 514a4969df83e..7cf4f41c924ee 100644 --- a/aten/src/TH/THTensorApply.h +++ b/aten/src/TH/THTensorApply.h @@ -43,7 +43,7 @@ TH_TENSOR_APPLY_hasFinished = 1; \ else \ { \ - TENSOR##_data = TENSOR->storage->data()+TENSOR->storageOffset; \ + TENSOR##_data = THTensor_getStoragePtr(TENSOR)->data()+TENSOR->storage_offset(); \ TENSOR##_size = 1; \ TENSOR##_stride = 1; \ for(TENSOR##_i = TENSOR->_dim()-1; TENSOR##_i >= 0; TENSOR##_i--) { \ @@ -321,7 +321,7 @@ ptrdiff_t TENSOR##Size = THTensor_(nElement)(TENSOR); \ if(TENSOR##Contg){ \ ptrdiff_t iter = 0; \ - TYPE *rp = TENSOR->storage->data()+TENSOR->storageOffset; \ + TYPE *rp = THTensor_getStoragePtr(TENSOR)->data()+TENSOR->storage_offset(); \ PRAGMA( omp parallel for if (TENSOR##Size > OMP_THRESHOLD * 10) firstprivate(rp) reduction(OPERATION) ) \ for (iter = 0; iter < TENSOR##Size; iter++) { \ TYPE *TENSOR##_data = rp+iter; \ @@ -365,8 +365,8 @@ { \ /* for advanced searching index*/ \ if( CONTIG1 && CONTIG2 ){ \ - TYPE1 *rp = TENSOR1->storage->data()+TENSOR1->storageOffset; \ - TYPE2 *tp = TENSOR2->storage->data()+TENSOR2->storageOffset; \ + TYPE1 *rp = THTensor_getStoragePtr(TENSOR1)->data()+TENSOR1->storage_offset(); \ + TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data()+TENSOR2->storage_offset(); \ ptrdiff_t iter = 0; \ if(tp != (TYPE2*)rp) { \ PRAGMA(ivdep) \ @@ -444,9 +444,9 @@ { \ /* for adveanced searching index*/ \ if(CONTIG1 && CONTIG2 && CONTIG3){ \ - TYPE1 *rp = TENSOR1->storage->data()+TENSOR1->storageOffset; \ - TYPE2 *tp = TENSOR2->storage->data()+TENSOR2->storageOffset; \ - TYPE3 *srcp = TENSOR3->storage->data()+TENSOR3->storageOffset; \ + TYPE1 *rp = THTensor_getStoragePtr(TENSOR1)->data()+TENSOR1->storage_offset(); \ + TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data()+TENSOR2->storage_offset(); \ + TYPE3 *srcp = THTensor_getStoragePtr(TENSOR3)->data()+TENSOR3->storage_offset(); \ ptrdiff_t iter = 0;\ if(tp != (TYPE2*)rp) { \ PRAGMA(ivdep) \ diff --git a/aten/src/TH/THTensorDimApply.h b/aten/src/TH/THTensorDimApply.h index e85bd0e9137e8..e45db37acf07a 100644 --- a/aten/src/TH/THTensorDimApply.h +++ b/aten/src/TH/THTensorDimApply.h @@ -60,15 +60,15 @@ for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ - TENSOR1##_data = (TENSOR1)->storage->data()+(TENSOR1)->storageOffset; \ + TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data()+(TENSOR1)->storage_offset(); \ 
TENSOR1##_stride = (TENSOR1)->stride(DIMENSION); \ TENSOR1##_size = TENSOR1->size(DIMENSION); \ \ - TENSOR2##_data = (TENSOR2)->storage->data()+(TENSOR2)->storageOffset; \ + TENSOR2##_data = THTensor_getStoragePtr(TENSOR2)->data()+(TENSOR2)->storage_offset(); \ TENSOR2##_stride = (TENSOR2)->stride(DIMENSION); \ TENSOR2##_size = TENSOR2->size(DIMENSION); \ \ - TENSOR3##_data = (TENSOR3)->storage->data()+(TENSOR3)->storageOffset; \ + TENSOR3##_data = THTensor_getStoragePtr(TENSOR3)->data()+(TENSOR3)->storage_offset(); \ TENSOR3##_stride = (TENSOR3)->stride(DIMENSION); \ TENSOR3##_size = TENSOR3->size(DIMENSION); \ \ @@ -167,11 +167,11 @@ for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ - TENSOR1##_data = (TENSOR1)->storage->data()+(TENSOR1)->storageOffset; \ + TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data()+(TENSOR1)->storage_offset(); \ TENSOR1##_stride = (TENSOR1)->stride(DIMENSION); \ TENSOR1##_size = TENSOR1->size(DIMENSION); \ \ - TENSOR2##_data = (TENSOR2)->storage->data()+(TENSOR2)->storageOffset; \ + TENSOR2##_data = THTensor_getStoragePtr(TENSOR2)->data()+(TENSOR2)->storage_offset(); \ TENSOR2##_stride = (TENSOR2)->stride(DIMENSION); \ TENSOR2##_size = TENSOR2->size(DIMENSION); \ \ @@ -269,7 +269,7 @@ if( (DIMENSION < 0) || (DIMENSION >= TENSOR->_dim()) ) \ THError("invalid dimension"); \ \ - TENSOR##_data = (TENSOR)->storage->data()+(TENSOR)->storageOffset; \ + TENSOR##_data = THTensor_getStoragePtr(TENSOR)->data()+(TENSOR)->storage_offset(); \ TENSOR##_stride = (TENSOR)->stride(DIMENSION); \ TENSOR##_size = TENSOR->size(DIMENSION); \ /* Counter stores the indices into the Tensor at any time */ \ diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index 92314de69bae2..c281e916c58f0 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -7,12 +7,12 @@ /**** access methods ****/ THStorage *THTensor_(storage)(const THTensor *self) { - return self->storage; + return THTensor_getStoragePtr(self); } ptrdiff_t THTensor_(storageOffset)(const THTensor *self) { - return self->storageOffset; + return self->storage_offset(); } int THTensor_(nDimension)(const THTensor *self) @@ -53,12 +53,8 @@ THLongStorage *THTensor_(newStrideOf)(THTensor *self) return stride; } -real *THTensor_(data)(const THTensor *self) -{ - if(self->storage) - return (THStorage_(data)(self->storage)+self->storageOffset); - else - return NULL; +real *THTensor_(data)(const THTensor *self) { + return self->data(); } /**** creation methods ****/ @@ -74,8 +70,8 @@ THTensor *THTensor_(newWithTensor)(THTensor *tensor) { THTensor *self = new THTensor(THStorage_(new)()); THTensor_(setStorageNd)(self, - tensor->storage, - tensor->storageOffset, + THTensor_getStoragePtr(tensor), + tensor->storage_offset(), tensor->dim(), THTensor_getSizePtr(tensor), THTensor_getStridePtr(tensor)); @@ -238,7 +234,7 @@ THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size) auto stride_value = *stride; THLongStorage *new_stride = THLongStorage_newWithSize(stride_value.size()); THLongStorage_rawCopy(new_stride, stride_value.data()); - THTensor_(setStorage)(self, tensor->storage, tensor->storageOffset, inferred_size, new_stride); + THTensor_(setStorage)(self, THTensor_getStoragePtr(tensor), tensor->storage_offset(), inferred_size, new_stride); THLongStorage_free(inferred_size); THLongStorage_free(new_stride); return self; @@ -297,8 +293,8 @@ void 
THTensor_(set)(THTensor *self, THTensor *src) { if(self != src) THTensor_(setStorageNd)(self, - src->storage, - src->storageOffset, + THTensor_getStoragePtr(src), + src->storage_offset(), src->dim(), THTensor_getSizePtr(src), THTensor_getStridePtr(src)); @@ -386,8 +382,9 @@ void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, int64_t fir THTensor_(set)(self, src); - if(firstIndex > 0) - self->storageOffset += firstIndex*self->stride(dimension); + if (firstIndex > 0) { + THTensor_setStorageOffset(self, self->storage_offset() + firstIndex*self->stride(dimension)); + } THTensor_setSizeAtDim(self, dimension, size); } @@ -640,10 +637,10 @@ int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor* src) int THTensor_(isSetTo)(const THTensor *self, const THTensor* src) { - if (!self->storage) + if (!THTensor_getStoragePtr(self)) return 0; - if (self->storage == src->storage && - self->storageOffset == src->storageOffset && + if (THTensor_getStoragePtr(self) == THTensor_getStoragePtr(src) && + self->storage_offset() == src->storage_offset() && self->_dim() == src->_dim()) { int d; @@ -673,7 +670,7 @@ ptrdiff_t THTensor_(nElement)(const THTensor *self) void THTensor_(retain)(THTensor *self) { - ++self->refcount; + self->retain(); } void THTensor_(free)(THTensor *self) @@ -694,24 +691,24 @@ void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst) void THTensor_(setStorageNd)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) { /* storage */ - if(self->storage != storage) + if(THTensor_getStoragePtr(self) != storage) { - if(self->storage) - THStorage_(free)(self->storage); + if(THTensor_getStoragePtr(self)) + THStorage_(free)(THTensor_getStoragePtr(self)); if(storage) { - self->storage = storage; - THStorage_(retain)(self->storage); + THTensor_stealAndSetStoragePtr(self, storage); + THStorage_(retain)(THTensor_getStoragePtr(self)); } else - self->storage = THStorage_(new)(); + THTensor_stealAndSetStoragePtr(self, THStorage_(new)()); } /* storageOffset */ if(storageOffset < 0) THError("Tensor: invalid storage offset"); - self->storageOffset = storageOffset; + THTensor_setStorageOffset(self, storageOffset); /* size and stride */ THTensor_(resizeNd)(self, nDimension, size, stride); @@ -780,13 +777,13 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t totalSize += (self->size(d)-1)*self->stride(d); } - if(totalSize+self->storageOffset > 0) + if(totalSize+self->storage_offset() > 0) { - if(!self->storage) { - self->storage = THStorage_(new)(); + if(!THTensor_getStoragePtr(self)) { + THTensor_stealAndSetStoragePtr(self, THStorage_(new)()); } - if(totalSize+self->storageOffset > self->storage->size) { - THStorage_(resize)(self->storage, totalSize+self->storageOffset); + if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->size) { + THStorage_(resize)(THTensor_getStoragePtr(self), totalSize+self->storage_offset()); } } } @@ -795,56 +792,56 @@ void THTensor_(set1d)(THTensor *tensor, int64_t x0, real value) { THArgCheck(tensor->_dim() == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0), value); + THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0), value); } real THTensor_(get1d)(const THTensor *tensor, int64_t x0) { THArgCheck(tensor->_dim() == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 
0) && (x0 < tensor->size(0)), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)); + return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)); } void THTensor_(set2d)(THTensor *tensor, int64_t x0, int64_t x1, real value) { THArgCheck(tensor->_dim() == 2, 1, "tensor must have two dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1), value); + THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1), value); } real THTensor_(get2d)(const THTensor *tensor, int64_t x0, int64_t x1) { THArgCheck(tensor->_dim() == 2, 1, "tensor must have two dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)); + return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)); } void THTensor_(set3d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value) { THArgCheck(tensor->_dim() == 3, 1, "tensor must have three dimensions"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2), value); + THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2), value); } real THTensor_(get3d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2) { THArgCheck(tensor->_dim() == 3, 1, "tensor must have three dimensions"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)); + return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)); } void THTensor_(set4d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) { THArgCheck(tensor->_dim() == 4, 1, "tensor must have four dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3), value); + THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3), value); } real THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { THArgCheck(tensor->_dim() == 4, 1, "tensor must have four dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); - return THStorage_(get)(tensor->storage, 
tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3)); + return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3)); } THDescBuff THTensor_(desc)(const THTensor *tensor) { diff --git a/aten/src/TH/generic/THTensorFastGetSet.hpp b/aten/src/TH/generic/THTensorFastGetSet.hpp index fa989ddafaf40..5ad3e13b237b8 100644 --- a/aten/src/TH/generic/THTensorFastGetSet.hpp +++ b/aten/src/TH/generic/THTensorFastGetSet.hpp @@ -3,43 +3,43 @@ #else static inline real THTensor_(fastGet1d)(THTensor *self, int64_t x0) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)]; + return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)]; } static inline real THTensor_(fastGet2d)(THTensor *self, int64_t x0, int64_t x1) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)]; + return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)]; } static inline real THTensor_(fastGet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)]; + return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)]; } static inline real THTensor_(fastGet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)]; + return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)]; } static inline real THTensor_(fastGet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)]; + return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)]; } static inline void THTensor_(fastSet1d)(THTensor *self, int64_t x0, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)] = value; + (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)] = value; } static inline void THTensor_(fastSet2d)(THTensor *self, int64_t x0, int64_t x1, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)] = value; + (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)] = value; } static inline void THTensor_(fastSet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)] = value; + (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)] = value; } static inline void THTensor_(fastSet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) { 
- (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)] = value; + (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)] = value; } static inline void THTensor_(fastSet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)] = value; + (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)] = value; } #endif diff --git a/aten/src/TH/generic/THTensorLapack.cpp b/aten/src/TH/generic/THTensorLapack.cpp index 4793dec43de2a..1556296262303 100644 --- a/aten/src/TH/generic/THTensorLapack.cpp +++ b/aten/src/TH/generic/THTensorLapack.cpp @@ -118,7 +118,7 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) "rows, B has %ld", a->size(0), b->size(0)); if (b->dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(b), b->storage_offset(), b->size(0), b->stride(0), 1, 0); free_b = 1; } @@ -171,7 +171,7 @@ void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a, "rows, B has %ld", a->size(0), b->size(0)); if (b->_dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(b), b->storage_offset(), b->size(0), b->stride(0), 1, 0); free_b = 1; } @@ -221,7 +221,7 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) "rows, B has %ld", a->size(0), b->size(0)); if (b->_dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(b), b->storage_offset(), b->size(0), b->stride(0), 1, 0); free_b = 1; } @@ -644,7 +644,7 @@ void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo) "rows, B has %ld", a->size(0), b->size(0)); if (b->_dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(b), b->storage_offset(), b->size(0), b->stride(0), 1, 0); free_b = 1; } diff --git a/aten/src/TH/generic/THTensorMath.cpp b/aten/src/TH/generic/THTensorMath.cpp index e4152432a3068..1ea91f7089949 100644 --- a/aten/src/TH/generic/THTensorMath.cpp +++ b/aten/src/TH/generic/THTensorMath.cpp @@ -3674,12 +3674,12 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int // Second path for non-contiguous int64_t offset; if (dimension == 0 && allContiguous) { - real* result_data = THStorage_(data)(result->storage) + result->storageOffset; + real* result_data = THStorage_(data)(THTensor_getStoragePtr(result)) + result->storage_offset(); offset = 0; for (int j = 0; j < numInputs; j++) { if (!should_skip(inputs[j])) { THTensor* input0 = inputs[j]; - real* input0_data = THStorage_(data)(input0->storage) + input0->storageOffset; + real* input0_data = THStorage_(data)(THTensor_getStoragePtr(input0)) + input0->storage_offset(); int64_t input0_size = THTensor_(nElement)(input0); // C standard says you can't pass nullptrs to memcpy, even if the size is 0; ubsan checks this. 
if (input0_size != 0) { diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp index ceb927429573f..d4dc8fb42b344 100644 --- a/aten/src/TH/generic/THTensorRandom.cpp +++ b/aten/src/TH/generic/THTensorRandom.cpp @@ -194,7 +194,7 @@ void THTensor_(normal)(THTensor *self, THGenerator *_generator, double mean, dou std::lock_guard lock(_generator->mutex); const int64_t size = THTensor_(numel)(self); if (size >= 16 && THTensor_(isContiguous)(self)) { - THVector_(normal_fill)(THStorage_(data)(self->storage), size, _generator, mean, stddev); + THVector_(normal_fill)(THStorage_(data)(THTensor_getStoragePtr(self)), size, _generator, mean, stddev); } else { TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_normal(_generator, mean, stddev);); } @@ -398,8 +398,8 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso for (j=0; jstorage, \ - prob_dist->storageOffset+i*prob_dist->stride(0)+j*prob_dist->stride(1) \ + THTensor_getStoragePtr(prob_dist), \ + prob_dist->storage_offset()+i*prob_dist->stride(0)+j*prob_dist->stride(1) \ ); THArgCheckWithCleanup((val >= 0), THCleanup(THDoubleTensor_free(cum_dist); if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), @@ -411,8 +411,8 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); sum += val; THDoubleStorage_set( - cum_dist->storage, \ - cum_dist->storageOffset+j*cum_dist->stride(0), \ + THTensor_getStoragePtr(cum_dist), \ + cum_dist->storage_offset()+j*cum_dist->stride(0), \ sum \ ); } @@ -448,8 +448,8 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso { mid_pointer = left_pointer + (right_pointer - left_pointer) / 2; cum_prob = THDoubleStorage_get( \ - cum_dist->storage, \ - cum_dist->storageOffset+mid_pointer*cum_dist->stride(0) \ + THTensor_getStoragePtr(cum_dist), \ + cum_dist->storage_offset()+mid_pointer*cum_dist->stride(0) \ ); if (cum_prob < uniform_sample) { @@ -464,8 +464,8 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso /* store in result tensor (will be incremented for lua compat by wrapper) */ THLongStorage_set( \ - self->storage, \ - self->storageOffset+i*self->stride(0)+j*self->stride(1), \ + THTensor_getStoragePtr(self), \ + self->storage_offset()+i*self->stride(0)+j*self->stride(1), \ sample_idx \ ); @@ -480,22 +480,22 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso if (sample_idx != 0) { new_val = THDoubleStorage_get( \ - cum_dist->storage, \ - cum_dist->storageOffset+(sample_idx-1)*cum_dist->stride(0) \ + THTensor_getStoragePtr(cum_dist), \ + cum_dist->storage_offset()+(sample_idx-1)*cum_dist->stride(0) \ ); } /* marginal cumulative mass (i.e. original probability) of sample */ diff = THDoubleStorage_get( \ - cum_dist->storage, \ - cum_dist->storageOffset+sample_idx*cum_dist->stride(0) \ + THTensor_getStoragePtr(cum_dist), \ + cum_dist->storage_offset()+sample_idx*cum_dist->stride(0) \ ) - new_val; /* new sum of marginals is not one anymore... 
*/ sum = 1.0 - diff; for (k=0; kstorage, \ - cum_dist->storageOffset+k*cum_dist->stride(0) \ + THTensor_getStoragePtr(cum_dist), \ + cum_dist->storage_offset()+k*cum_dist->stride(0) \ ); if (k >= sample_idx) { @@ -505,8 +505,8 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso /* make total marginals sum to one */ new_val /= sum; THDoubleStorage_set( \ - cum_dist->storage, \ - cum_dist->storageOffset+k*cum_dist->stride(0), \ + THTensor_getStoragePtr(cum_dist), \ + cum_dist->storage_offset()+k*cum_dist->stride(0), \ new_val \ ); } diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index 13fdff6b3b566..6a599b3b655fd 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -148,13 +148,13 @@ void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, int64_ totalSize += (self->size(d)-1)*self->stride(d); } - if(totalSize+self->storageOffset > 0) + if(totalSize+self->storage_offset() > 0) { - if(!self->storage) { + if(!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - if(totalSize+self->storageOffset > self->storage->size) { - THCStorage_resize(state, self->storage, totalSize+self->storageOffset); + if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->size) { + THCStorage_resize(state, THTensor_getStoragePtr(self), totalSize+self->storage_offset()); } } } @@ -164,8 +164,8 @@ void THCTensor_set(THCState *state, THCTensor *self, THCTensor *src) if(self != src) THCTensor_setStorageNd(state, self, - src->storage, - src->storageOffset, + THTensor_getStoragePtr(src), + src->storage_offset(), src->dim(), THTensor_getSizePtr(src), THTensor_getStridePtr(src)); @@ -174,27 +174,27 @@ void THCTensor_set(THCState *state, THCTensor *self, THCTensor *src) void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) { /* storage */ - if(self->storage != storage) + if(THTensor_getStoragePtr(self) != storage) { - if (!self->storage) { + if (!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - auto scalar_type = self->storage->scalar_type; - THStorage_free(self->storage); + auto scalar_type = THTensor_getStoragePtr(self)->scalar_type; + THStorage_free(THTensor_getStoragePtr(self)); - if(storage) - { - self->storage = storage; - THStorage_retain(self->storage); + if (storage) { + THTensor_stealAndSetStoragePtr(self, storage); + THStorage_retain(THTensor_getStoragePtr(self)); + } else { + THTensor_stealAndSetStoragePtr(self, THCStorage_new(state, scalar_type)); } - else - self->storage = THCStorage_new(state, scalar_type); } /* storageOffset */ - if(storageOffset < 0) + if (storageOffset < 0) { THError("Tensor: invalid storage offset"); - self->storageOffset = storageOffset; + } + THTensor_setStorageOffset(self, storageOffset); /* size and stride */ THCTensor_resizeNd(state, self, nDimension, size, stride); @@ -294,17 +294,16 @@ ptrdiff_t THCTensor_nElement(THCState *state, const THCTensor *self) { } void THCTensor_retain(THCState *state, THCTensor *self) { - self->refcount++; + self->retain(); } - void THCTensor_free(THCState *state, THCTensor *self) { THTensor_free(self); } int THCTensor_getDevice(THCState* state, const THCTensor* tensor) { - if (!tensor->storage) return -1; - return THCStorage_getDevice(state, tensor->storage); + if (!THTensor_getStoragePtr(tensor)) return -1; + return THCStorage_getDevice(state, THTensor_getStoragePtr(tensor)); } bool 
THCTensor_allSameDevice(THCState* state, THCTensor ** inputs, int numInputs) { diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index 3b03e37232ef0..023a55d176be8 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -5,12 +5,12 @@ /**** access methods ****/ THCStorage *THCTensor_(storage)(THCState *state, const THCTensor *self) { - return self->storage; + return THTensor_getStoragePtr(self); } ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self) { - return self->storageOffset; + return self->storage_offset(); } int THCTensor_(nDimension)(THCState *state, const THCTensor *self) @@ -47,8 +47,8 @@ THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self) real *THCTensor_(data)(THCState *state, const THCTensor *self) { - if(self->storage) - return (THCStorage_(data)(state, self->storage)+self->storageOffset); + if(THTensor_getStoragePtr(self)) + return (THCStorage_(data)(state, THTensor_getStoragePtr(self))+self->storage_offset()); else return NULL; } @@ -67,8 +67,8 @@ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor) THCTensor *self = new THCTensor(THCStorage_(new)(state)); THCTensor_(setStorageNd)(state, self, - tensor->storage, - tensor->storageOffset, + THTensor_getStoragePtr(tensor), + tensor->storage_offset(), tensor->dim(), THTensor_getSizePtr(tensor), THTensor_getStridePtr(tensor)); @@ -227,7 +227,7 @@ THCTensor *THCTensor_(newView)(THCState *state, THCTensor *tensor, THLongStorage auto stride_value = *stride; THLongStorage *new_stride = THLongStorage_newWithSize(stride_value.size()); THLongStorage_rawCopy(new_stride, stride_value.data()); - THCTensor_(setStorage)(state, self, tensor->storage, tensor->storageOffset, inferred_size, new_stride); + THCTensor_(setStorage)(state, self, THTensor_getStoragePtr(tensor), tensor->storage_offset(), inferred_size, new_stride); THLongStorage_free(inferred_size); THLongStorage_free(new_stride); return self; @@ -376,8 +376,9 @@ void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int di THCTensor_(set)(state, self, src); - if(firstIndex > 0) - self->storageOffset += firstIndex*self->stride(dimension); + if (firstIndex > 0) { + THTensor_setStorageOffset(self, self->storage_offset() + firstIndex*self->stride(dimension)); + } THTensor_setSizeAtDim(self, dimension, size); } @@ -537,8 +538,8 @@ int THCTensor_(isSize)(THCState *state, const THCTensor *self, const THLongStora int THCTensor_(isSetTo)(THCState *state, const THCTensor *self, const THCTensor *src) { - if (self->storage == src->storage && - self->storageOffset == src->storageOffset && + if (THTensor_getStoragePtr(self) == THTensor_getStoragePtr(src) && + self->storage_offset() == src->storage_offset() && self->dim() == src->dim()) { int d; @@ -604,56 +605,56 @@ void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, real valu { THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); - THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0), value); + THCStorage_(set)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0), value); } real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, int64_t x0) { THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); - return THCStorage_(get)(state, 
tensor->storage, tensor->storageOffset+x0*tensor->stride(0)); + return THCStorage_(get)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)); } void THCTensor_(set2d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, real value) { THArgCheck(tensor->dim() == 2, 1, "tensor must have two dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); - THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1), value); + THCStorage_(set)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1), value); } real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1) { THArgCheck(tensor->dim() == 2, 1, "tensor must have two dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); - return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)); + return THCStorage_(get)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)); } void THCTensor_(set3d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value) { THArgCheck(tensor->dim() == 3, 1, "tensor must have three dimensions"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); - THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2), value); + THCStorage_(set)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2), value); } real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2) { THArgCheck(tensor->dim() == 3, 1, "tensor must have three dimensions"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); - return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)); + return THCStorage_(get)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)); } void THCTensor_(set4d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) { THArgCheck(tensor->dim() == 4, 1, "tensor must have four dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); - THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3), value); + THCStorage_(set)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3), value); } real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { THArgCheck(tensor->dim() == 4, 1, "tensor must have four dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); - return 
THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3)); + return THCStorage_(get)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3)); } int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...) diff --git a/aten/src/THC/generic/THCTensorCopy.cpp b/aten/src/THC/generic/THCTensorCopy.cpp index 5715133a44f6c..f108ca10015c2 100644 --- a/aten/src/THC/generic/THCTensorCopy.cpp +++ b/aten/src/THC/generic/THCTensorCopy.cpp @@ -131,7 +131,7 @@ void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, struct THTensor cudaMemcpyHostToDevice, THCStream_stream(stream))); - THCudaCheck(THCCachingHostAllocator_recordEvent(THStorage_(data)(src->storage), stream)); + THCudaCheck(THCCachingHostAllocator_recordEvent(THStorage_(data)(THTensor_getStoragePtr(src)), stream)); if (currentDevice != tensorDevice) { THCudaCheck(cudaSetDevice(currentDevice)); @@ -162,7 +162,7 @@ void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, struct THCTensor cudaMemcpyDeviceToHost, THCStream_stream(stream))); - THCudaCheck(THCCachingHostAllocator_recordEvent(THCStorage_(data)(state, src->storage), stream)); + THCudaCheck(THCCachingHostAllocator_recordEvent(THCStorage_(data)(state, THTensor_getStoragePtr(src)), stream)); if (currentDevice != tensorDevice) { THCudaCheck(cudaSetDevice(currentDevice)); diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu index 0e2630c1edf3b..71bc17ee2f739 100644 --- a/aten/src/THC/generic/THCTensorCopy.cu +++ b/aten/src/THC/generic/THCTensorCopy.cu @@ -10,7 +10,7 @@ THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { template <> THCTensor *THCTensor_newClone(THCState *state, THCTensor *self) { - THCTensor *tensor = THCTensor_new(state, self->storage->scalar_type); + THCTensor *tensor = THCTensor_new(state, THTensor_getStoragePtr(self)->scalar_type); THCTensor_resizeAs(state, tensor, self); THC_copyTensor(state, tensor, self); return tensor; diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu index 07033fa0e8f1d..d5077655bfcdf 100644 --- a/aten/src/THC/generic/THCTensorMath.cu +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -383,8 +383,8 @@ void THCTensor_(eye)(THCState *state, THCTensor *self_, int64_t n, int64_t m) int64_t stride = THCTensor_(stride)(state, self_, 0) + THCTensor_(stride)(state, self_, 1); - THCTensor *diag = THCTensor_(newWithStorage1d)(state, self_->storage, - self_->storageOffset, sz, stride); + THCTensor *diag = THCTensor_(newWithStorage1d)(state, THTensor_getStoragePtr(self_), + self_->storage_offset(), sz, stride); THCTensor_(fill)(state, diag, ScalarConvert::to(1)); THCTensor_(free)(state, diag); diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index c942a17c4c0df..dd2b2777d3552 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -12,7 +12,7 @@ static void THCTensor_(copyArray1d)(THCState *state, THCTensor *self, real *src, int64_t stride[1] = { 1 }; THCTensor_(resizeNd)(state, self, 1, size, stride); size_t len = k * sizeof(real); - THCudaCheck(cudaMemcpy(THCStorage_(data)(state, self->storage) + self->storageOffset, src, len, cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, THTensor_getStoragePtr(self)) + self->storage_offset(), src, 
len, cudaMemcpyHostToDevice)); } static void THCTensor_(copyArray2d)(THCState *state, THCTensor *self, real *src, int m, int n) @@ -21,7 +21,7 @@ static void THCTensor_(copyArray2d)(THCState *state, THCTensor *self, real *src, int64_t stride[2] = { 1, m }; THCTensor_(resizeNd)(state, self, 2, size, stride); size_t len = m * n * sizeof(real); - THCudaCheck(cudaMemcpy(THCStorage_(data)(state, self->storage) + self->storageOffset, src, len, cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, THTensor_getStoragePtr(self)) + self->storage_offset(), src, len, cudaMemcpyHostToDevice)); } static void THCTensor_(copyTensor2d)(THCState *state, real *dst, THCTensor *self) @@ -30,7 +30,7 @@ static void THCTensor_(copyTensor2d)(THCState *state, real *dst, THCTensor *self size_t len = THCTensor_(nElement)(state, self)*sizeof(real); THCTensor *temp = THCTensor_(newTranspose)(state, self, 0, 1); THCTensor *selfc = THCTensor_(newContiguous)(state, temp); - THCudaCheck(cudaMemcpy(dst, THCStorage_(data)(state, selfc->storage) + selfc->storageOffset, len, cudaMemcpyDeviceToHost)); + THCudaCheck(cudaMemcpy(dst, THCStorage_(data)(state, THTensor_getStoragePtr(selfc)) + selfc->storage_offset(), len, cudaMemcpyDeviceToHost)); THCTensor_(free)(state, temp); THCTensor_(free)(state, selfc); } @@ -294,8 +294,8 @@ THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, T THCTensor_(resize2d)(state, re_, 2, n); THCTensor *re = THCTensor_(newContiguous)(state, re_); if (n > 0) { - THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset, wr, n*sizeof(real), cudaMemcpyHostToDevice)); - THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset + n, wi, n*sizeof(real), cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, THTensor_getStoragePtr(re)) + re->storage_offset(), wr, n*sizeof(real), cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, THTensor_getStoragePtr(re)) + re->storage_offset() + n, wi, n*sizeof(real), cudaMemcpyHostToDevice)); } THCTensor_(freeCopyTo)(state, re, re_); THCTensor_(transpose)(state, re_, NULL, 0, 1); diff --git a/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu index f22aba639a2d6..8c1c15ddc0abf 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu @@ -61,8 +61,8 @@ static THCTensor* THNN_(view_weight_local)( int64_t s3 = weight->size(3) * weight->size(4) * weight->size(5); THCTensor *old_weight = weight; weight = THCTensor_(newWithStorage3d)(state, - weight->storage, - weight->storageOffset, + THTensor_getStoragePtr(weight), + weight->storage_offset(), s1, -1, s2, -1, s3, -1); THCTensor_(free)(state, old_weight); } @@ -140,12 +140,12 @@ void THNN_(SpatialConvolutionLocal_updateOutput)( 1, 1, THCTensor_(data)(state, finput_n) ); - output3d = THCTensor_(newWithStorage3d)(state, output_n->storage, output_n->storageOffset, + output3d = THCTensor_(newWithStorage3d)(state, THTensor_getStoragePtr(output_n), output_n->storage_offset(), outputHeight*outputWidth, 1, nOutputPlane, outputHeight*outputWidth, 1, nOutputPlane*outputHeight*outputWidth); - finput3d = THCTensor_(newWithStorage3d)(state, finput_n->storage, finput_n->storageOffset, + finput3d = THCTensor_(newWithStorage3d)(state, THTensor_getStoragePtr(finput_n), finput_n->storage_offset(), outputHeight*outputWidth, 1, kW*kH*nInputPlane, outputHeight*outputWidth, 1, 
kW*kH*nInputPlane*outputHeight*outputWidth); @@ -247,11 +247,11 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)( THCTensor_(select)(state, fgradInput_n, fgradInput, 0, elt); THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); - gradOutput3d = THCTensor_(newWithStorage3d)(state, gradOutput_n->storage, gradOutput_n->storageOffset, + gradOutput3d = THCTensor_(newWithStorage3d)(state, THTensor_getStoragePtr(gradOutput_n), gradOutput_n->storage_offset(), outputHeight*outputWidth, 1, nOutputPlane, outputHeight*outputWidth, 1, nOutputPlane*outputHeight*outputWidth); - fgradInput3d = THCTensor_(newWithStorage3d)(state, fgradInput_n->storage, fgradInput_n->storageOffset, + fgradInput3d = THCTensor_(newWithStorage3d)(state, THTensor_getStoragePtr(fgradInput_n), fgradInput_n->storage_offset(), outputHeight*outputWidth, 1, kW*kH*nInputPlane, outputHeight*outputWidth, 1, kW*kH*nInputPlane*outputHeight*outputWidth); @@ -358,11 +358,11 @@ void THNN_(SpatialConvolutionLocal_accGradParameters)( THCTensor_(select)(state, finput_n, finput, 0, elt); THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); - gradOutput3d = THCTensor_(newWithStorage3d)(state, gradOutput_n->storage, gradOutput_n->storageOffset, + gradOutput3d = THCTensor_(newWithStorage3d)(state, THTensor_getStoragePtr(gradOutput_n), gradOutput_n->storage_offset(), outputHeight*outputWidth, 1, nOutputPlane, outputHeight*outputWidth, 1, nOutputPlane*outputHeight*outputWidth); - finput3d = THCTensor_(newWithStorage3d)(state, finput_n->storage, finput_n->storageOffset, + finput3d = THCTensor_(newWithStorage3d)(state, THTensor_getStoragePtr(finput_n), finput_n->storage_offset(), outputHeight*outputWidth, 1, 1, kW*kH*nInputPlane*outputHeight*outputWidth, kW*kH*nInputPlane, outputHeight*outputWidth); diff --git a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu index e276d349d648c..fb695eb002020 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu @@ -109,7 +109,7 @@ void THNN_(SpatialConvolutionMM_updateOutput)( if (weight->dim() == 4) { int64_t s1 = weight->size(0); int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3); - weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1); + weight = THCTensor_(newWithStorage2d)(state, THTensor_getStoragePtr(weight), weight->storage_offset(), s1, -1, s2, -1); freeWeight = 1; } @@ -264,7 +264,7 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( if (weight->dim() == 4) { int64_t s1 = weight->size(0); int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3); - weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1); + weight = THCTensor_(newWithStorage2d)(state, THTensor_getStoragePtr(weight), weight->storage_offset(), s1, -1, s2, -1); freeWeight = 1; } @@ -398,7 +398,7 @@ void THNN_(SpatialConvolutionMM_accGradParameters)( if (gradWeight && gradWeight->dim() == 4) { int64_t s1 = gradWeight->size(0); int64_t s2 = gradWeight->size(1) * gradWeight->size(2) * gradWeight->size(3); - gradWeight = THCTensor_(newWithStorage2d)(state, gradWeight->storage, gradWeight->storageOffset, s1, -1, s2, -1); + gradWeight = THCTensor_(newWithStorage2d)(state, THTensor_getStoragePtr(gradWeight), gradWeight->storage_offset(), s1, -1, s2, -1); freeWeight = 1; } diff --git a/aten/src/THCUNN/generic/TemporalConvolution.cu b/aten/src/THCUNN/generic/TemporalConvolution.cu index 
25baf933b57a9..886cf07b4e75e 100644 --- a/aten/src/THCUNN/generic/TemporalConvolution.cu +++ b/aten/src/THCUNN/generic/TemporalConvolution.cu @@ -90,13 +90,13 @@ void THNN_(TemporalConvolution_updateOutput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputFrame -= nFrame; - THCTensor_(setStorage2d)(state, inputWindow, input->storage, - input->storageOffset+k*dW*input->size(1), + THCTensor_(setStorage2d)(state, inputWindow, THTensor_getStoragePtr(input), + input->storage_offset()+k*dW*input->size(1), nFrame, inputFrameStride*input->size(1), kW*input->size(1), 1); - THCTensor_(setStorage2d)(state, outputWindow, output->storage, - output->storageOffset + k*output->size(1), + THCTensor_(setStorage2d)(state, outputWindow, THTensor_getStoragePtr(output), + output->storage_offset() + k*output->size(1), nFrame, outputFrameStride*output->size(1), output->size(1), 1); @@ -138,13 +138,13 @@ void THNN_(TemporalConvolution_updateOutput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputSampleFrame -= nFrame; - THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size(1), + THCTensor_(setStorage2d)(state, inputWindow, THTensor_getStoragePtr(inputSample), + inputSample->storage_offset()+k*dW*inputSample->size(1), nFrame, inputFrameStride*inputSample->size(1), kW*inputSample->size(1), 1); - THCTensor_(setStorage2d)(state, outputWindow, outputSample->storage, - outputSample->storageOffset + k*outputSample->size(1), + THCTensor_(setStorage2d)(state, outputWindow, THTensor_getStoragePtr(outputSample), + outputSample->storage_offset() + k*outputSample->size(1), nFrame, outputFrameStride*outputSample->size(1), outputSample->size(1), 1); @@ -215,13 +215,13 @@ void THNN_(TemporalConvolution_updateGradInput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputFrame -= nFrame; - THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size(1), + THCTensor_(setStorage2d)(state, gradOutputWindow, THTensor_getStoragePtr(gradOutput), + gradOutput->storage_offset() + k*gradOutput->size(1), nFrame, outputFrameStride*gradOutput->size(1), gradOutput->size(1), 1); - THCTensor_(setStorage2d)(state, gradInputWindow, gradInput->storage, - gradInput->storageOffset+k*dW*gradInput->size(1), + THCTensor_(setStorage2d)(state, gradInputWindow, THTensor_getStoragePtr(gradInput), + gradInput->storage_offset()+k*dW*gradInput->size(1), nFrame, inputFrameStride*gradInput->size(1), kW*gradInput->size(1), 1); @@ -247,13 +247,13 @@ void THNN_(TemporalConvolution_updateGradInput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputSampleFrame -= nFrame; - THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size(1), + THCTensor_(setStorage2d)(state, gradOutputWindow, THTensor_getStoragePtr(gradOutputSample), + gradOutputSample->storage_offset() + k*gradOutputSample->size(1), nFrame, outputFrameStride*gradOutputSample->size(1), gradOutputSample->size(1), 1); - THCTensor_(setStorage2d)(state, gradInputWindow, gradInputSample->storage, - gradInputSample->storageOffset+k*dW*gradInputSample->size(1), + THCTensor_(setStorage2d)(state, gradInputWindow, THTensor_getStoragePtr(gradInputSample), + gradInputSample->storage_offset()+k*dW*gradInputSample->size(1), nFrame, inputFrameStride*gradInputSample->size(1), kW*gradInputSample->size(1), 1); @@ -324,13 +324,13 @@ void 
THNN_(TemporalConvolution_accGradParameters)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputFrame -= nFrame; - THCTensor_(setStorage2d)(state, inputWindow, input->storage, - input->storageOffset+k*dW*input->size(1), + THCTensor_(setStorage2d)(state, inputWindow, THTensor_getStoragePtr(input), + input->storage_offset()+k*dW*input->size(1), nFrame, inputFrameStride*input->size(1), kW*input->size(1), 1); - THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size(1), + THCTensor_(setStorage2d)(state, gradOutputWindow, THTensor_getStoragePtr(gradOutput), + gradOutput->storage_offset() + k*gradOutput->size(1), nFrame, outputFrameStride*gradOutput->size(1), gradOutput->size(1), 1); @@ -367,13 +367,13 @@ void THNN_(TemporalConvolution_accGradParameters)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputSampleFrame -= nFrame; - THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size(1), + THCTensor_(setStorage2d)(state, inputWindow, THTensor_getStoragePtr(inputSample), + inputSample->storage_offset()+k*dW*inputSample->size(1), nFrame, inputFrameStride*inputSample->size(1), kW*inputSample->size(1), 1); - THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size(1), + THCTensor_(setStorage2d)(state, gradOutputWindow, THTensor_getStoragePtr(gradOutputSample), + gradOutputSample->storage_offset() + k*gradOutputSample->size(1), nFrame, outputFrameStride*gradOutputSample->size(1), gradOutputSample->size(1), 1); diff --git a/aten/src/THCUNN/generic/TemporalRowConvolution.cu b/aten/src/THCUNN/generic/TemporalRowConvolution.cu index 83a32ca2a063d..208016e93fe96 100644 --- a/aten/src/THCUNN/generic/TemporalRowConvolution.cu +++ b/aten/src/THCUNN/generic/TemporalRowConvolution.cu @@ -151,7 +151,7 @@ void THNN_(TemporalRowConvolution_updateOutput)( THCTensor_(data)(state, columns)); THCTensor *output3d = THCTensor_(newWithStorage3d)( - state, output_n->storage, output_n->storageOffset, inputFrameSize, -1, + state, THTensor_getStoragePtr(output_n), output_n->storage_offset(), inputFrameSize, -1, 1, -1, nOutputFrame, -1); // weight: inputFrameSize x 1 x kW @@ -251,7 +251,7 @@ void THNN_(TemporalRowConvolution_updateGradInput)( THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); THCTensor *gradOutput3d = THCTensor_(newWithStorage3d)( - state, gradOutput_n->storage, gradOutput_n->storageOffset, + state, THTensor_getStoragePtr(gradOutput_n), gradOutput_n->storage_offset(), inputFrameSize, -1, 1, -1, nOutputFrame, -1); // weight: inputFrameSize x kW x 1 @@ -365,7 +365,7 @@ void THNN_(TemporalRowConvolution_accGradParameters)( THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); THCTensor *gradOutput3d = THCTensor_(newWithStorage3d)( - state, gradOutput_n->storage, gradOutput_n->storageOffset, + state, THTensor_getStoragePtr(gradOutput_n), gradOutput_n->storage_offset(), inputFrameSize, -1, 1, -1, nOutputFrame, -1); // Extract columns diff --git a/aten/src/THNN/generic/SparseLinear.c b/aten/src/THNN/generic/SparseLinear.c index c7a00e335dbc6..a28d4e78477ce 100644 --- a/aten/src/THNN/generic/SparseLinear.c +++ b/aten/src/THNN/generic/SparseLinear.c @@ -30,14 +30,14 @@ static bool THNN_(checkSize1D)(THTensor* t, int64_t size0) } static void THNN_(set1d)(THTensor *t, int64_t x0, real value) { - THStorage_(set)(t->storage, t->storageOffset + x0*t->stride(0), 
value); + THStorage_(set)(THTensor_getStoragePtr(t), t->storage_offset() + x0*t->stride(0), value); } static real THNN_(get3d)(const THTensor *t, int64_t x0, int64_t x1, int64_t x2) { - return THStorage_(get)(t->storage, t->storageOffset + + return THStorage_(get)(THTensor_getStoragePtr(t), t->storage_offset() + x0*t->stride(0) + x1*t->stride(1) + x2*t->stride(2)); } static real THNN_(get2d)(const THTensor *t, int64_t x0, int64_t x1) { - return THStorage_(get)(t->storage, t->storageOffset + + return THStorage_(get)(THTensor_getStoragePtr(t), t->storage_offset() + x0*t->stride(0) + x1*t->stride(1)); } diff --git a/aten/src/THNN/generic/SpatialConvolutionLocal.c b/aten/src/THNN/generic/SpatialConvolutionLocal.c index 6461285ca931f..d56cb198d88bf 100644 --- a/aten/src/THNN/generic/SpatialConvolutionLocal.c +++ b/aten/src/THNN/generic/SpatialConvolutionLocal.c @@ -57,8 +57,8 @@ static THTensor* THNN_(view_weight_local)(THTensor *_weight) int64_t s2 = weight->size(2); int64_t s3 = weight->size(3) * weight->size(4) * weight->size(5); THTensor *old_weight = weight; - weight = THTensor_(newWithStorage3d)(weight->storage, - weight->storageOffset, + weight = THTensor_(newWithStorage3d)(THTensor_getStoragePtr(weight), + weight->storage_offset(), s1, -1, s2, -1, s3, -1); THTensor_(free)(old_weight); } @@ -82,13 +82,13 @@ static void THNN_(SpatialConvolutionLocal_updateOutput_frame) THTensor_(copy)(output, bias); output3d = THTensor_(newWithStorage3d) - (output->storage, output->storageOffset, + (THTensor_getStoragePtr(output), output->storage_offset(), outputHeight * outputWidth, 1, nOutputPlane, outputHeight * outputWidth, 1, nOutputPlane * outputHeight * outputWidth); finput3d = THTensor_(newWithStorage3d) - (finput->storage, finput->storageOffset, + (THTensor_getStoragePtr(finput), finput->storage_offset(), outputHeight * outputWidth, 1, kW * kH * nInputPlane, outputHeight * outputWidth, 1, kW * kH * nInputPlane * outputHeight * outputWidth); @@ -178,11 +178,11 @@ static void THNN_(SpatialConvolutionLocal_updateGradInput_frame) int64_t nOutputPlane, int64_t outputWidth, int64_t outputHeight) { THTensor *gradOutput3d, *fgradInput3d; - gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset, + gradOutput3d = THTensor_(newWithStorage3d)(THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), outputHeight*outputWidth, 1, nOutputPlane, outputHeight*outputWidth, 1, nOutputPlane*outputHeight*outputWidth); - fgradInput3d = THTensor_(newWithStorage3d)(fgradInput->storage, fgradInput->storageOffset, + fgradInput3d = THTensor_(newWithStorage3d)(THTensor_getStoragePtr(fgradInput), fgradInput->storage_offset(), outputHeight*outputWidth, 1, kW*kH*nInputPlane, outputHeight*outputWidth, 1, kW*kH*nInputPlane*outputHeight*outputWidth); @@ -280,11 +280,11 @@ static void THNN_(SpatialConvolutionLocal_accGradParameters_frame) { THTensor *gradOutput3d, *finput3d; - gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset, + gradOutput3d = THTensor_(newWithStorage3d)(THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), outputHeight*outputWidth, 1, nOutputPlane, outputHeight*outputWidth, 1, nOutputPlane*outputHeight*outputWidth); - finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset, + finput3d = THTensor_(newWithStorage3d)(THTensor_getStoragePtr(finput), finput->storage_offset(), outputHeight*outputWidth, 1, 1, kW*kH*nInputPlane*outputHeight*outputWidth, kW*kH*nInputPlane, outputHeight*outputWidth); diff 
--git a/aten/src/THNN/generic/SpatialConvolutionMM.c b/aten/src/THNN/generic/SpatialConvolutionMM.c index 434320a3cfdfd..7547dbf12e059 100644 --- a/aten/src/THNN/generic/SpatialConvolutionMM.c +++ b/aten/src/THNN/generic/SpatialConvolutionMM.c @@ -84,7 +84,7 @@ static THTensor* THNN_(newViewWeightMM2d)(THTensor *weight) { int64_t s1 = weight->size(0); int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3); THTensor *old_weight = weight; - weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, + weight = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(weight), weight->storage_offset(), s1, -1, s2, -1); THTensor_(free)(old_weight); } @@ -117,13 +117,13 @@ static void THNN_(SpatialConvolutionMM_updateOutput_frame)( nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight); - output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset, + output2d = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(output), output->storage_offset(), nOutputPlane, -1, outputHeight*outputWidth, -1); if (bias) { for(i = 0; i < nOutputPlane; i++) THVector_(fill) - (THStorage_(data)(output->storage) + output->storageOffset + output->stride(0) * i, + (THStorage_(data)(THTensor_getStoragePtr(output)) + output->storage_offset() + output->stride(0) * i, THTensor_(get1d)(bias, i), outputHeight*outputWidth); } else { THTensor_(zero)(output); @@ -228,7 +228,7 @@ static void THNN_(SpatialConvolutionMM_updateGradInput_frame)( int padH) { THTensor *gradOutput2d = THTensor_(newWithStorage2d) - (gradOutput->storage, gradOutput->storageOffset, + (THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), gradOutput->size(0), -1, gradOutput->size(1)*gradOutput->size(2), -1); THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); @@ -318,7 +318,7 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( { int64_t i; THTensor *gradOutput2d = THTensor_(newWithStorage2d) - (gradOutput->storage, gradOutput->storageOffset, + (THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), gradOutput->size(0), -1, gradOutput->size(1)*gradOutput->size(2), -1); @@ -334,10 +334,10 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( { int64_t k; real sum = 0; - real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride(0); + real *data = THStorage_(data)(THTensor_getStoragePtr(gradOutput2d)) + gradOutput2d->storage_offset() + i*gradOutput2d->stride(0); for(k = 0; k < gradOutput2d->size(1); k++) sum += data[k]; - (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] += scale*sum; + (THStorage_(data)(THTensor_getStoragePtr(gradBias)) + gradBias->storage_offset())[i] += scale*sum; } } diff --git a/aten/src/THNN/generic/TemporalConvolution.c b/aten/src/THNN/generic/TemporalConvolution.c index 2c3e1da84de5e..00ddcb226fe26 100644 --- a/aten/src/THNN/generic/TemporalConvolution.c +++ b/aten/src/THNN/generic/TemporalConvolution.c @@ -88,13 +88,13 @@ void THNN_(TemporalConvolution_updateOutput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputFrame -= nFrame; - THTensor_(setStorage2d)(inputWindow, input->storage, - input->storageOffset+k*dW*input->size(1), + THTensor_(setStorage2d)(inputWindow, THTensor_getStoragePtr(input), + input->storage_offset()+k*dW*input->size(1), nFrame, inputFrameStride*input->size(1), kW*input->size(1), 1); - THTensor_(setStorage2d)(outputWindow, output->storage, - output->storageOffset + k*output->size(1), + 
THTensor_(setStorage2d)(outputWindow, THTensor_getStoragePtr(output), + output->storage_offset() + k*output->size(1), nFrame, outputFrameStride*output->size(1), output->size(1), 1); @@ -136,13 +136,13 @@ void THNN_(TemporalConvolution_updateOutput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputSampleFrame -= nFrame; - THTensor_(setStorage2d)(inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size(1), + THTensor_(setStorage2d)(inputWindow, THTensor_getStoragePtr(inputSample), + inputSample->storage_offset()+k*dW*inputSample->size(1), nFrame, inputFrameStride*inputSample->size(1), kW*inputSample->size(1), 1); - THTensor_(setStorage2d)(outputWindow, outputSample->storage, - outputSample->storageOffset + k*outputSample->size(1), + THTensor_(setStorage2d)(outputWindow, THTensor_getStoragePtr(outputSample), + outputSample->storage_offset() + k*outputSample->size(1), nFrame, outputFrameStride*outputSample->size(1), outputSample->size(1), 1); @@ -210,13 +210,13 @@ void THNN_(TemporalConvolution_updateGradInput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputFrame -= nFrame; - THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size(1), + THTensor_(setStorage2d)(gradOutputWindow, THTensor_getStoragePtr(gradOutput), + gradOutput->storage_offset() + k*gradOutput->size(1), nFrame, outputFrameStride*gradOutput->size(1), gradOutput->size(1), 1); - THTensor_(setStorage2d)(gradInputWindow, gradInput->storage, - gradInput->storageOffset+k*dW*gradInput->size(1), + THTensor_(setStorage2d)(gradInputWindow, THTensor_getStoragePtr(gradInput), + gradInput->storage_offset()+k*dW*gradInput->size(1), nFrame, inputFrameStride*gradInput->size(1), kW*gradInput->size(1), 1); @@ -243,13 +243,13 @@ void THNN_(TemporalConvolution_updateGradInput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputSampleFrame -= nFrame; - THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size(1), + THTensor_(setStorage2d)(gradOutputWindow, THTensor_getStoragePtr(gradOutputSample), + gradOutputSample->storage_offset() + k*gradOutputSample->size(1), nFrame, outputFrameStride*gradOutputSample->size(1), gradOutputSample->size(1), 1); - THTensor_(setStorage2d)(gradInputWindow, gradInputSample->storage, - gradInputSample->storageOffset+k*dW*gradInputSample->size(1), + THTensor_(setStorage2d)(gradInputWindow, THTensor_getStoragePtr(gradInputSample), + gradInputSample->storage_offset()+k*dW*gradInputSample->size(1), nFrame, inputFrameStride*gradInputSample->size(1), kW*gradInputSample->size(1), 1); @@ -319,13 +319,13 @@ void THNN_(TemporalConvolution_accGradParameters)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputFrame -= nFrame; - THTensor_(setStorage2d)(inputWindow, input->storage, - input->storageOffset+k*dW*input->size(1), + THTensor_(setStorage2d)(inputWindow, THTensor_getStoragePtr(input), + input->storage_offset()+k*dW*input->size(1), nFrame, inputFrameStride*input->size(1), kW*input->size(1), 1); - THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size(1), + THTensor_(setStorage2d)(gradOutputWindow, THTensor_getStoragePtr(gradOutput), + gradOutput->storage_offset() + k*gradOutput->size(1), nFrame, outputFrameStride*gradOutput->size(1), gradOutput->size(1), 1); @@ -362,13 +362,13 @@ void THNN_(TemporalConvolution_accGradParameters)( int64_t 
nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputSampleFrame -= nFrame; - THTensor_(setStorage2d)(inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size(1), + THTensor_(setStorage2d)(inputWindow, THTensor_getStoragePtr(inputSample), + inputSample->storage_offset()+k*dW*inputSample->size(1), nFrame, inputFrameStride*inputSample->size(1), kW*inputSample->size(1), 1); - THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size(1), + THTensor_(setStorage2d)(gradOutputWindow, THTensor_getStoragePtr(gradOutputSample), + gradOutputSample->storage_offset() + k*gradOutputSample->size(1), nFrame, outputFrameStride*gradOutputSample->size(1), gradOutputSample->size(1), 1); diff --git a/aten/src/THNN/generic/TemporalRowConvolution.c b/aten/src/THNN/generic/TemporalRowConvolution.c index 5a85065058670..b623e5a2ad7fd 100644 --- a/aten/src/THNN/generic/TemporalRowConvolution.c +++ b/aten/src/THNN/generic/TemporalRowConvolution.c @@ -148,7 +148,7 @@ static void THNN_(TemporalRowConvolution_updateOutput_frame)( int64_t i; THTensor *output3d = THTensor_(newWithStorage3d)( - output->storage, output->storageOffset, + THTensor_getStoragePtr(output), output->storage_offset(), inputFrameSize, -1, 1, -1, nOutputFrame, -1); @@ -161,7 +161,7 @@ static void THNN_(TemporalRowConvolution_updateOutput_frame)( if (bias != NULL) { for (i = 0; i < inputFrameSize; i++) THVector_(fill) - (THStorage_(data)(output->storage) + output->storageOffset + (THStorage_(data)(THTensor_getStoragePtr(output)) + output->storage_offset() + output->stride(0) * i, THTensor_(get1d)(bias, i), nOutputFrame); } @@ -261,7 +261,7 @@ static void THNN_(TemporalRowConvolution_updateGradInput_frame)( int64_t nOutputFrame) { THTensor *gradOutput3d = THTensor_(newWithStorage3d)( - gradOutput->storage, gradOutput->storageOffset, + THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), inputFrameSize, -1, 1, -1, nOutputFrame, -1); @@ -372,7 +372,7 @@ static void THNN_(TemporalRowConvolution_accGradParameters_frame)( int64_t i; THTensor *gradOutput3d = THTensor_(newWithStorage3d)( - gradOutput->storage, gradOutput->storageOffset, + THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), gradOutput->size(0), -1, 1, -1, gradOutput->size(1), -1); @@ -389,13 +389,13 @@ static void THNN_(TemporalRowConvolution_accGradParameters_frame)( for (i = 0; i < gradBias->size(0); i++) { int64_t k; real sum = 0; - real *data = THStorage_(data)(gradOutput3d->storage) - + gradOutput3d->storageOffset + real *data = THStorage_(data)(THTensor_getStoragePtr(gradOutput3d)) + + gradOutput3d->storage_offset() + i * gradOutput3d->stride(0); for (k = 0; k < gradOutput3d->size(2); k++) { sum += data[k]; } - (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] + (THStorage_(data)(THTensor_getStoragePtr(gradBias)) + gradBias->storage_offset())[i] += scale * sum; } } diff --git a/aten/src/THNN/generic/VolumetricConvolutionMM.c b/aten/src/THNN/generic/VolumetricConvolutionMM.c index 525fa5928edc6..9f1c83ce8af3a 100644 --- a/aten/src/THNN/generic/VolumetricConvolutionMM.c +++ b/aten/src/THNN/generic/VolumetricConvolutionMM.c @@ -116,7 +116,7 @@ static THTensor* THNN_(newViewWeight)(THTensor *weight) int64_t s1 = weight->size(0); int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4); THTensor *old_weight = weight; - weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, + weight = 
THTensor_(newWithStorage2d)(THTensor_getStoragePtr(weight), weight->storage_offset(), s1, -1, s2, -1); THTensor_(free)(old_weight); } @@ -427,7 +427,7 @@ static void THNN_(VolumetricConvolutionMM_updateOutput_frame)( ); output2d = THTensor_(newWithStorage2d)( - output->storage, output->storageOffset, nOutputPlane, -1, + THTensor_getStoragePtr(output), output->storage_offset(), nOutputPlane, -1, outputDepth*outputHeight*outputWidth, -1 ); @@ -435,7 +435,7 @@ static void THNN_(VolumetricConvolutionMM_updateOutput_frame)( for (i = 0; i < nOutputPlane; i++) { THVector_(fill)( - THStorage_(data)(output->storage)+output->storageOffset+output->stride(0)*i, + THStorage_(data)(THTensor_getStoragePtr(output))+output->storage_offset()+output->stride(0)*i, THTensor_(get1d)(bias, i), outputDepth*outputHeight*outputWidth ); @@ -570,7 +570,7 @@ static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)( int pH) { THTensor *gradOutput2d = THTensor_(newWithStorage2d)( - gradOutput->storage, gradOutput->storageOffset, + THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), gradOutput->size(0), -1, gradOutput->size(1)*gradOutput->size(2)*gradOutput->size(3), -1 ); @@ -676,7 +676,7 @@ static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( { int64_t i; THTensor *gradOutput2d = THTensor_(newWithStorage2d)( - gradOutput->storage, gradOutput->storageOffset, + THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), gradOutput->size(0), -1, gradOutput->size(1)*gradOutput->size(2)*gradOutput->size(3), -1 ); @@ -693,11 +693,11 @@ static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( { int64_t k; real sum = 0; - real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride(0); + real *data = THStorage_(data)(THTensor_getStoragePtr(gradOutput2d)) + gradOutput2d->storage_offset() + i*gradOutput2d->stride(0); for (k = 0; k < gradOutput2d->size(1); k++) sum += data[k]; - (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] += scale * sum; + (THStorage_(data)(THTensor_getStoragePtr(gradBias)) + gradBias->storage_offset())[i] += scale * sum; } } From 9ee513365121cd387e11987c66db6599ac53ded7 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Sun, 22 Jul 2018 20:08:56 -0700 Subject: [PATCH 22/23] Fix dataloader hang when it is not completely iterated (#9655) Summary: second trial of https://github.com/pytorch/pytorch/pull/7140 cc csarofeen Let's see if this works. It passes everything locally. 
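For context, a minimal sketch (not part of the patch) of the scenario this fixes — iterating a multi-worker DataLoader only partially and then dropping the iterator; `ToyDataset` is a made-up placeholder, and any map-style dataset with `num_workers > 0` exercises the same shutdown path:
```
import torch
from torch.utils.data import Dataset, DataLoader

class ToyDataset(Dataset):
    # Hypothetical stand-in dataset; any map-style dataset works here.
    def __len__(self):
        return 1000

    def __getitem__(self, idx):
        return torch.tensor(idx)

loader = DataLoader(ToyDataset(), batch_size=4, num_workers=4,
                    pin_memory=torch.cuda.is_available())

for i, batch in enumerate(loader):
    if i == 3:
        break  # iterator dropped before being exhausted

# Previously the worker processes (and the pin-memory thread, when enabled)
# could be left blocked here instead of shutting down; with the done_event /
# cancel_join_thread changes below they exit once the iterator is deleted.
```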
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9655 Differential Revision: D8940177 Pulled By: SsnL fbshipit-source-id: 8d6340fc9f7355c71e1e26b262da166402faa158 --- test/test_dataloader.py | 118 +++++++++++++++++++-------------- torch/utils/data/dataloader.py | 85 ++++++++++++------------ 2 files changed, 110 insertions(+), 93 deletions(-) diff --git a/test/test_dataloader.py b/test/test_dataloader.py index d1dabd266b878..90f927d9168f1 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -205,9 +205,12 @@ class SleepDataset(Dataset): def __init__(self, size, sleep_sec): self.size = size self.sleep_sec = sleep_sec + self.sleeped = False def __getitem__(self, idx): - time.sleep(self.sleep_sec) + if not self.sleeped: + time.sleep(self.sleep_sec) + self.sleeped = True return idx def __len__(self): @@ -251,7 +254,7 @@ def __len__(self): def _test_timeout(): - dataset = SleepDataset(10, 10) + dataset = SleepDataset(10, 3) dataloader = DataLoader(dataset, batch_size=2, num_workers=2, timeout=1) _ = next(iter(dataloader)) @@ -474,36 +477,40 @@ def test_error_workers(self): @unittest.skipIf(IS_WINDOWS, "FIXME: stuck test") @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") def test_partial_workers(self): - "check that workers exit even if the iterator is not exhausted" - loader = iter(DataLoader(self.dataset, batch_size=2, num_workers=4, pin_memory=True)) - workers = loader.workers - worker_manager_thread = loader.worker_manager_thread - for i, sample in enumerate(loader): - if i == 3: - break - del loader - for w in workers: - w.join(JOIN_TIMEOUT) - self.assertFalse(w.is_alive(), 'subprocess not terminated') - self.assertEqual(w.exitcode, 0) - worker_manager_thread.join(JOIN_TIMEOUT) - self.assertFalse(worker_manager_thread.is_alive()) + r"""Check that workers exit even if the iterator is not exhausted.""" + for pin_memory in (True, False): + loader = iter(DataLoader(self.dataset, batch_size=2, num_workers=4, pin_memory=pin_memory)) + workers = loader.workers + if pin_memory: + pin_memory_thread = loader.pin_memory_thread + for i, sample in enumerate(loader): + if i == 10: + break + del loader + for w in workers: + w.join(JOIN_TIMEOUT) + self.assertFalse(w.is_alive(), 'subprocess not terminated') + if pin_memory: + pin_memory_thread.join(JOIN_TIMEOUT) + self.assertFalse(pin_memory_thread.is_alive()) @staticmethod - def _manager_process(dataset, worker_pids, manager_exit_event): + def _main_process(dataset, worker_pids, main_exit_event, raise_error): loader = iter(DataLoader(dataset, batch_size=2, num_workers=4, pin_memory=True)) workers = loader.workers for i in range(len(workers)): worker_pids[i] = int(workers[i].pid) for i, sample in enumerate(loader): if i == 3: - break - # Simulate a dirty exit of the manager process - manager_exit_event.set() - if IS_WINDOWS: - os.system('taskkill /PID ' + str(os.getpid()) + ' /F') - else: - os.kill(os.getpid(), signal.SIGKILL) + # Simulate an exit of the manager process + main_exit_event.set() + if raise_error: + raise RuntimeError('Error') + else: + if IS_WINDOWS: + os.system('taskkill /PID ' + str(os.getpid()) + ' /F') + else: + os.kill(os.getpid(), signal.SIGKILL) @staticmethod def _is_process_alive(pid, pname): @@ -526,33 +533,44 @@ def _is_process_alive(pid, pname): "spawn start method is not supported in Python 2, \ but we need it for creating another process with CUDA") @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") - def test_manager_unclean_exit(self): - '''there might be ConnectionResetError or leaked 
semaphore warning (due to dirty process exit), \ + def test_main_process_unclean_exit(self): + r'''There might be ConnectionResetError or leaked semaphore warning (due to dirty process exit), \ but they are all safe to ignore''' - worker_pids = mp.Array('i', [0] * 4) - manager_exit_event = mp.Event() - p = mp.Process(target=TestDataLoader._manager_process, - args=(self.dataset, worker_pids, manager_exit_event)) - p.start() - - manager_exit_event.wait() - - exit_status = [False] * len(worker_pids) - start_time = time.time() - pname = 'python' - while True: - for i in range(len(worker_pids)): - pid = worker_pids[i] - if not exit_status[i]: - if not TestDataLoader._is_process_alive(pid, pname): - exit_status[i] = True - if all(exit_status): - break - else: - time.sleep(1) - self.assertFalse(time.time() - start_time > MANAGER_STATUS_CHECK_INTERVAL + JOIN_TIMEOUT, - 'subprocess not terminated') + # `raise_error` controls if the main process is KILL-ed by OS or just + # simply raises an error. Both cases are interesting because + # 1. In case of it is KILL-ed by OS, the workers need to automatically + # discover that their parent is dead and exit gracefully. + # 2. In case of it raises an error itself, the parent process needs to + # take care of exiting the worker and then exits itself gracefully. + for raise_error in (True, False): + worker_pids = mp.Array('i', [0] * 4) + + main_exit_event = mp.Event() + p = mp.Process(target=TestDataLoader._main_process, + args=(self.dataset, worker_pids, main_exit_event, raise_error)) + p.start() + worker_pids[-1] = p.pid + + main_exit_event.wait() + + exit_status = [False] * len(worker_pids) + start_time = time.time() + pname = 'python' + while True: + for i in range(len(worker_pids)): + pid = worker_pids[i] + if not exit_status[i]: + if not TestDataLoader._is_process_alive(pid, pname): + exit_status[i] = True + if all(exit_status): + break + else: + if time.time() - start_time > MANAGER_STATUS_CHECK_INTERVAL + JOIN_TIMEOUT: + self.fail('subprocess not terminated') + time.sleep(1) + p.join(MANAGER_STATUS_CHECK_INTERVAL + JOIN_TIMEOUT - (time.time() - start_time)) + self.assertFalse(p.is_alive(), 'main process not terminated') def test_len(self): def check_len(dl, expected): @@ -596,7 +614,7 @@ def __len__(self): self.assertIsInstance(batch, tt) @unittest.skipIf(not TEST_NUMPY, "numpy unavailable") - def test_default_colate_bad_numpy_types(self): + def test_default_collate_bad_numpy_types(self): import numpy as np # Should be a no-op diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 10457a6653602..60789e9fb6299 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -72,7 +72,7 @@ def is_alive(self): return os.getppid() == self.manager_pid -def _worker_loop(dataset, index_queue, data_queue, collate_fn, seed, init_fn, worker_id): +def _worker_loop(dataset, index_queue, data_queue, done_event, collate_fn, seed, init_fn, worker_id): global _use_shared_memory _use_shared_memory = True @@ -86,6 +86,11 @@ def _worker_loop(dataset, index_queue, data_queue, collate_fn, seed, init_fn, wo random.seed(seed) torch.manual_seed(seed) + # Do not wait for putting thread to join when this worker exits. Otherwise, + # this worker may always be waiting to put and doesn't check index_queue + # and done_event for termination signal. 
+ data_queue.cancel_join_thread() + if init_fn is not None: init_fn(worker_id) @@ -95,11 +100,13 @@ def _worker_loop(dataset, index_queue, data_queue, collate_fn, seed, init_fn, wo try: r = index_queue.get(timeout=MANAGER_STATUS_CHECK_INTERVAL) except queue.Empty: - if watchdog.is_alive(): + if watchdog.is_alive() and not done_event.is_set(): continue else: break - if r is None: + # use done_event so that we can get faster exiting signal even if there + # are still indices in index_queue + if r is None or done_event.is_set(): break idx, batch_indices = r try: @@ -111,7 +118,7 @@ def _worker_loop(dataset, index_queue, data_queue, collate_fn, seed, init_fn, wo del samples -def _worker_manager_loop(in_queue, out_queue, done_event, pin_memory, device_id): +def _pin_memory_loop(in_queue, out_queue, done_event, pin_memory, device_id): if pin_memory: torch.cuda.set_device(device_id) @@ -122,7 +129,7 @@ def _worker_manager_loop(in_queue, out_queue, done_event, pin_memory, device_id) if done_event.is_set(): return raise - if r is None: + if r is None or done_event.is_set(): break if isinstance(r[1], ExceptionWrapper): out_queue.put(r) @@ -242,7 +249,6 @@ def __init__(self, loader): self.num_workers = loader.num_workers self.pin_memory = loader.pin_memory and torch.cuda.is_available() self.timeout = loader.timeout - self.done_event = threading.Event() self.sample_iter = iter(self.batch_sampler) @@ -252,35 +258,32 @@ def __init__(self, loader): self.worker_init_fn = loader.worker_init_fn self.index_queues = [multiprocessing.Queue() for _ in range(self.num_workers)] self.worker_queue_idx = 0 - self.worker_result_queue = multiprocessing.SimpleQueue() + self.worker_result_queue = multiprocessing.Queue() self.batches_outstanding = 0 self.worker_pids_set = False self.shutdown = False self.send_idx = 0 self.rcvd_idx = 0 self.reorder_dict = {} + self.done_event = multiprocessing.Event() self.workers = [ multiprocessing.Process( target=_worker_loop, args=(self.dataset, self.index_queues[i], - self.worker_result_queue, self.collate_fn, base_seed + i, + self.worker_result_queue, self.done_event, + self.collate_fn, base_seed + i, self.worker_init_fn, i)) for i in range(self.num_workers)] - if self.pin_memory or self.timeout > 0: + if self.pin_memory: self.data_queue = queue.Queue() - if self.pin_memory: - maybe_device_id = torch.cuda.current_device() - else: - # do not initialize cuda context if not necessary - maybe_device_id = None - self.worker_manager_thread = threading.Thread( - target=_worker_manager_loop, + self.pin_memory_thread = threading.Thread( + target=_pin_memory_loop, args=(self.worker_result_queue, self.data_queue, self.done_event, self.pin_memory, - maybe_device_id)) - self.worker_manager_thread.daemon = True - self.worker_manager_thread.start() + torch.cuda.current_device())) + self.pin_memory_thread.daemon = True + self.pin_memory_thread.start() else: self.data_queue = self.worker_result_queue @@ -366,33 +369,29 @@ def __getstate__(self): raise NotImplementedError("_DataLoaderIter cannot be pickled") def _shutdown_workers(self): - try: - if not self.shutdown: - self.shutdown = True - self.done_event.set() - for q in self.index_queues: - q.put(None) - # if some workers are waiting to put, make place for them - try: - while not self.worker_result_queue.empty(): - self.worker_result_queue.get() - except (FileNotFoundError, ImportError): - # Many weird errors can happen here due to Python - # shutting down. These are more like obscure Python bugs. 
- # FileNotFoundError can happen when we rebuild the fd - # fetched from the queue but the socket is already closed - # from the worker side. - # ImportError can happen when the unpickler loads the - # resource from `get`. - pass - # done_event should be sufficient to exit worker_manager_thread, - # but be safe here and put another None - self.worker_result_queue.put(None) - finally: - # removes pids no matter what + if not self.shutdown: + self.shutdown = True + # removes pids from the C side data structure first so worker + # termination afterwards won't trigger false positive error report. if self.worker_pids_set: _remove_worker_pids(id(self)) self.worker_pids_set = False + self.done_event.set() + if self.pin_memory: + # Sending `None` to `pin_memory_thread` must be before + # stopping worker processes because the workers may leave + # corrupted data in `worker_result_queue`, causing + # `pin_memory_thread` unable to read and terminate properly. + self.worker_result_queue.put(None) + # Workers can't be waiting to put be cause their output queue + # is a multiprocessing.Queue and its .put is non-blocking. + # They can only be waiting to get, so we put `None` here. + for q in self.index_queues: + q.put(None) + for w in self.workers: + w.join() + if self.pin_memory: + self.pin_memory_thread.join() def __del__(self): if self.num_workers > 0: From 53083b83530441dcc08aad07886a72a04dda845e Mon Sep 17 00:00:00 2001 From: peter Date: Mon, 23 Jul 2018 06:35:39 -0700 Subject: [PATCH 23/23] Remove CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS and fix CUDA 8 build on Windows (#9491) (#9491) Summary: Fixes #9092. Pull Request resolved: https://github.com/pytorch/pytorch/pull/9491 Pull Request resolved: https://github.com/pytorch/pytorch/pull/9693 Differential Revision: D8946850 Pulled By: ezyang fbshipit-source-id: bd816f459ab70f6b4a0983305a1ce341bb633707 --- .gitignore | 1 + CMakeLists.txt | 4 --- aten/src/ATen/Backtrace.cpp | 1 + aten/src/ATen/Backtrace.h | 6 ++-- aten/src/ATen/Device.h | 4 +-- aten/src/ATen/ExpandUtils.h | 2 +- aten/src/ATen/Half.h | 4 +-- aten/src/ATen/SmallVector.h | 4 ++- aten/src/ATen/SparseTensorImpl.h | 2 +- aten/src/ATen/TensorOptions.h | 2 +- aten/src/ATen/cuda/CUDAStream.h | 36 ++++++++++--------- aten/src/ATen/function_wrapper.py | 2 +- aten/src/TH/THGeneral.h.in | 3 ++ aten/src/TH/THStorageClass.hpp | 2 +- aten/src/TH/THStorageFunctions.hpp | 18 +++++----- aten/src/TH/THTensor.hpp | 4 +-- caffe2/utils/proto_wrap.cc | 7 ++-- tools/autograd/templates/Functions.h | 1 + torch/csrc/autograd/engine.h | 5 +-- torch/csrc/autograd/function.h | 3 +- torch/csrc/autograd/functions/basic_ops.h | 7 ++-- torch/csrc/autograd/functions/utils.h | 7 ++-- torch/csrc/autograd/grad_mode.h | 6 ++-- torch/csrc/autograd/profiler.h | 15 ++++---- torch/csrc/autograd/saved_variable.h | 2 +- torch/csrc/autograd/variable.h | 2 +- torch/csrc/jit/autodiff.h | 9 ++--- torch/csrc/jit/export.h | 4 +-- torch/csrc/jit/graph_executor.h | 8 ++--- torch/csrc/jit/import.h | 2 +- torch/csrc/jit/interned_strings.h | 3 +- torch/csrc/jit/interpreter.h | 4 ++- torch/csrc/jit/ir.h | 23 ++++++------ torch/csrc/jit/operator.h | 4 +-- torch/csrc/jit/passes/batch_mm.h | 2 +- torch/csrc/jit/passes/canonicalize.h | 2 +- .../passes/common_subexpression_elimination.h | 2 +- .../jit/passes/create_autodiff_subgraphs.h | 2 +- torch/csrc/jit/passes/dead_code_elimination.h | 4 +-- torch/csrc/jit/passes/decompose_addmm.cpp | 1 + torch/csrc/jit/passes/decompose_addmm.h | 2 +- torch/csrc/jit/passes/erase_number_types.h | 2 +- 
torch/csrc/jit/passes/graph_fuser.h | 2 +- torch/csrc/jit/passes/inplace_check.h | 2 +- torch/csrc/jit/passes/loop_unrolling.h | 2 +- torch/csrc/jit/passes/lower_grad_of.h | 2 +- torch/csrc/jit/passes/lower_tuples.h | 2 +- torch/csrc/jit/passes/onnx.h | 4 +-- torch/csrc/jit/passes/peephole.h | 2 +- torch/csrc/jit/passes/remove_expands.cpp | 1 + torch/csrc/jit/passes/remove_expands.h | 2 +- torch/csrc/jit/passes/shape_analysis.h | 4 ++- torch/csrc/jit/passes/specialize_undef.h | 2 +- torch/csrc/jit/passes/to_batch.h | 6 ++-- torch/csrc/jit/script/compiler.h | 20 +++++------ torch/csrc/jit/test_jit.cpp | 2 +- torch/csrc/jit/tracer.h | 25 ++++++------- torch/csrc/jit/type.h | 19 +++++----- torch/csrc/onnx/onnx.h | 3 +- 59 files changed, 176 insertions(+), 148 deletions(-) diff --git a/.gitignore b/.gitignore index 09e5ed86bbfe5..4d684662d166c 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ torch/csrc/nn/THNN.cpp torch/csrc/nn/THNN.cwrap torch/lib/*.a* torch/lib/*.dll* +torch/lib/*.exe* torch/lib/*.dylib* torch/lib/*.h torch/lib/*.lib diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e302869a8a8a..651e230ab35ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -152,10 +152,6 @@ endif() # ---[ CMake scripts + modules list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) -if (MSVC AND ${BUILD_SHARED_LIBS}) - set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) -endif() - # ---[ CMake build directories set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) diff --git a/aten/src/ATen/Backtrace.cpp b/aten/src/ATen/Backtrace.cpp index a145c3b75c7ec..8ee61c73bf12c 100644 --- a/aten/src/ATen/Backtrace.cpp +++ b/aten/src/ATen/Backtrace.cpp @@ -1,4 +1,5 @@ #include +#include #include #include diff --git a/aten/src/ATen/Backtrace.h b/aten/src/ATen/Backtrace.h index 99f0923180583..347c430d61b75 100644 --- a/aten/src/ATen/Backtrace.h +++ b/aten/src/ATen/Backtrace.h @@ -4,9 +4,11 @@ #include #include +#include + namespace at { /// Utility to demangle a C++ symbol name. -std::string demangle(const char* name); +AT_API std::string demangle(const char* name); /// Returns the printable name of the type. template @@ -19,7 +21,7 @@ inline const char* demangle_type() { #endif // __GXX_RTTI } -std::string get_backtrace( +AT_API std::string get_backtrace( size_t frames_to_skip = 0, size_t maximum_number_of_frames = 64, bool skip_python_frames = true); diff --git a/aten/src/ATen/Device.h b/aten/src/ATen/Device.h index 2d9c27f8d4cc3..4795b77a1bff0 100644 --- a/aten/src/ATen/Device.h +++ b/aten/src/ATen/Device.h @@ -111,8 +111,8 @@ struct Device { }; } // namespace at -std::ostream& operator<<(std::ostream& stream, at::Device::Type type); -std::ostream& operator<<(std::ostream& stream, const at::Device& device); +AT_API std::ostream& operator<<(std::ostream& stream, at::Device::Type type); +AT_API std::ostream& operator<<(std::ostream& stream, const at::Device& device); namespace std { template<> struct hash diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 4ed5d6254320f..e0929784a5bda 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -10,7 +10,7 @@ namespace at { AT_API std::vector infer_size(IntList a, IntList b); -std::tuple, std::vector > inferExpandGeometry( +AT_API std::tuple, std::vector > inferExpandGeometry( IntList tensor_sizes, IntList tensor_strides, IntList sizes); // avoid copy-construction of Tensor by using a reference_wrapper. 
diff --git a/aten/src/ATen/Half.h b/aten/src/ATen/Half.h index 949a18848f726..d740008305cd8 100644 --- a/aten/src/ATen/Half.h +++ b/aten/src/ATen/Half.h @@ -35,8 +35,8 @@ namespace at { namespace detail { -float halfbits2float(unsigned short bits); -unsigned short float2halfbits(float value); +AT_API float halfbits2float(unsigned short bits); +AT_API unsigned short float2halfbits(float value); } diff --git a/aten/src/ATen/SmallVector.h b/aten/src/ATen/SmallVector.h index 3a5926a06df8d..238a181296f4f 100644 --- a/aten/src/ATen/SmallVector.h +++ b/aten/src/ATen/SmallVector.h @@ -33,6 +33,8 @@ #include #include +#include + #if __GNUG__ && __GNUC__ < 5 #define AT_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T) #else @@ -57,7 +59,7 @@ static inline uint64_t NextPowerOf2(uint64_t A) { } /// This is all the non-templated stuff common to all SmallVectors. -class SmallVectorBase { +class AT_API SmallVectorBase { protected: void *BeginX, *EndX, *CapacityX; diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 081e48a07d722..2093b45339137 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -5,7 +5,7 @@ #include "ATen/Error.h" namespace at { -struct SparseTensorImpl : public TensorImpl { +struct AT_API SparseTensorImpl : public TensorImpl { // Stored in COO format, indices + values. // Ideal INVARIANTS: diff --git a/aten/src/ATen/TensorOptions.h b/aten/src/ATen/TensorOptions.h index c8aeca2bbeb90..53ad9d827c628 100644 --- a/aten/src/ATen/TensorOptions.h +++ b/aten/src/ATen/TensorOptions.h @@ -19,7 +19,7 @@ namespace at { /// `torch::TensorOptions` subclass of this `TensorOptions`, which changes /// `type()` to return a variable type instead of a tensor type, such that /// variables are created inside factory methods, instead of tensors. -struct TensorOptions { +struct AT_API TensorOptions { TensorOptions() : TensorOptions(/*use_thread_local_default_options=*/true) {} /// Constructs the `TensorOptions` with defaults taken from the thread local diff --git a/aten/src/ATen/cuda/CUDAStream.h b/aten/src/ATen/cuda/CUDAStream.h index 72fc752b717a4..545bccfdfbcb7 100644 --- a/aten/src/ATen/cuda/CUDAStream.h +++ b/aten/src/ATen/cuda/CUDAStream.h @@ -5,6 +5,8 @@ #include "cuda_runtime_api.h" +#include + /* * A CUDA stream interface with no CUDA build dependency. * @@ -23,31 +25,31 @@ namespace detail { // Pointer-based API (for internal use) // Note: ATen/Context is preferred to work with streams safely -CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device); -CUDAStreamInternals* CUDAStream_getDefaultStream(); +AT_API CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device); +AT_API CUDAStreamInternals* CUDAStream_getDefaultStream(); -CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority); +AT_API CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority); -CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device); -CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream(); +AT_API CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device); +AT_API CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream(); // Note: these Unsafe gets should NEVER be used and are only here for legacy // purposes. Once those uses are gone they should be removed. 
-CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device); -CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe(); +AT_API CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device); +AT_API CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe(); -void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* internals); -void CUDAStream_uncheckedSetStreamOnDevice( +AT_API void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* internals); +AT_API void CUDAStream_uncheckedSetStreamOnDevice( int64_t device, CUDAStreamInternals* internals); -void CUDAStream_setStream(CUDAStreamInternals* internals); +AT_API void CUDAStream_setStream(CUDAStreamInternals* internals); -cudaStream_t CUDAStream_stream(CUDAStreamInternals*); -int64_t CUDAStream_device(CUDAStreamInternals*); +AT_API cudaStream_t CUDAStream_stream(CUDAStreamInternals*); +AT_API int64_t CUDAStream_device(CUDAStreamInternals*); -bool CUDAStream_retain(CUDAStreamInternals*); -void CUDAStream_free(CUDAStreamInternals*&); -void CUDAStream_uncheckedFree(CUDAStreamInternals*&); +AT_API bool CUDAStream_retain(CUDAStreamInternals*); +AT_API void CUDAStream_free(CUDAStreamInternals*&); +AT_API void CUDAStream_uncheckedFree(CUDAStreamInternals*&); } // namespace detail @@ -71,10 +73,10 @@ struct CUDAStream { ~CUDAStream() { detail::CUDAStream_uncheckedFree(internals_); } // Copy constructor - CUDAStream(const CUDAStream& other); + AT_API CUDAStream(const CUDAStream& other); // Move constructor - CUDAStream(CUDAStream&& other); + AT_API CUDAStream(CUDAStream&& other); // Assignment operator CUDAStream& operator=(CUDAStream other) noexcept { diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 7a5e6e40760ec..1c06654fe891b 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -143,7 +143,7 @@ def TypedDict(name, attrs, total=True): # type: ignore """) # add a native declaration for a native function NATIVE_DECLARATION = CodeTemplate("""\ -${return_type} ${native_type_method_dispatch}(${formals_with_defaults}); +AT_API ${return_type} ${native_type_method_dispatch}(${formals_with_defaults}); """) # special method definition for factory functions in Functions.h diff --git a/aten/src/TH/THGeneral.h.in b/aten/src/TH/THGeneral.h.in index 9038dfb2b1092..2a1c33f13679d 100644 --- a/aten/src/TH/THGeneral.h.in +++ b/aten/src/TH/THGeneral.h.in @@ -35,11 +35,14 @@ #ifdef _WIN32 # if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) # define TH_API TH_EXTERNC __declspec(dllexport) +# define TH_CPP_API __declspec(dllexport) # else # define TH_API TH_EXTERNC __declspec(dllimport) +# define TH_CPP_API __declspec(dllimport) # endif #else # define TH_API TH_EXTERNC +# define TH_CPP_API #endif #ifdef _WIN32 diff --git a/aten/src/TH/THStorageClass.hpp b/aten/src/TH/THStorageClass.hpp index 99031f635b703..f9cc84915e319 100644 --- a/aten/src/TH/THStorageClass.hpp +++ b/aten/src/TH/THStorageClass.hpp @@ -37,7 +37,7 @@ struct THFinalizer { virtual ~THFinalizer() {}; }; -struct THStorage +struct TH_CPP_API THStorage { THStorage() = delete; THStorage(at::ScalarType, ptrdiff_t, at::DataPtr, at::Allocator*, char); diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 9ef48dcfbd870..b4b1d4c51e882 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -33,14 +33,14 @@ // If it is not, you must report that the storage is dead. 
// -ptrdiff_t THStorage_size(const THStorage *self); +TH_API ptrdiff_t THStorage_size(const THStorage *self); -void THStorage_setFlag(THStorage *storage, const char flag); -void THStorage_clearFlag(THStorage *storage, const char flag); -void THStorage_retain(THStorage *storage); -void THStorage_resize(THStorage *storage, ptrdiff_t size); -void THStorage_swap(THStorage *storage1, THStorage *storage2); +TH_API void THStorage_setFlag(THStorage *storage, const char flag); +TH_API void THStorage_clearFlag(THStorage *storage, const char flag); +TH_API void THStorage_retain(THStorage *storage); +TH_API void THStorage_resize(THStorage *storage, ptrdiff_t size); +TH_API void THStorage_swap(THStorage *storage1, THStorage *storage2); -void THStorage_weakRetain(THStorage *weak_storage); -void THStorage_weakFree(THStorage *weak_storage); -THStorage* THStorage_weakLock(THStorage *weak_storage); +TH_API void THStorage_weakRetain(THStorage *weak_storage); +TH_API void THStorage_weakFree(THStorage *weak_storage); +TH_API THStorage* THStorage_weakLock(THStorage *weak_storage); diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index 0962803e3821a..7912506cec107 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -149,5 +149,5 @@ inline void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) } TH_API void THTensor_free(THTensor *self); -at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, - at::IntList newshape); +TH_CPP_API at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, + at::IntList newshape); diff --git a/caffe2/utils/proto_wrap.cc b/caffe2/utils/proto_wrap.cc index c0a85af480740..3bcacd8809bb8 100644 --- a/caffe2/utils/proto_wrap.cc +++ b/caffe2/utils/proto_wrap.cc @@ -1,4 +1,5 @@ #include "caffe2/utils/proto_wrap.h" +#include "caffe2/core/common.h" #include #include @@ -8,7 +9,7 @@ namespace caffe { // Caffe wrapper functions for protobuf's GetEmptyStringAlreadyInited() function // used to avoid duplicated global variable in the case when protobuf // is built with hidden visibility. -const ::std::string& GetEmptyStringAlreadyInited() { +CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() { return ::google::protobuf::internal::GetEmptyStringAlreadyInited(); } @@ -19,7 +20,7 @@ namespace ONNX_NAMESPACE { // ONNX wrapper functions for protobuf's GetEmptyStringAlreadyInited() function // used to avoid duplicated global variable in the case when protobuf // is built with hidden visibility. -const ::std::string& GetEmptyStringAlreadyInited() { +CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() { return ::google::protobuf::internal::GetEmptyStringAlreadyInited(); } @@ -30,7 +31,7 @@ namespace caffe2 { // Caffe2 wrapper functions for protobuf's GetEmptyStringAlreadyInited() function // used to avoid duplicated global variable in the case when protobuf // is built with hidden visibility. 
-const ::std::string& GetEmptyStringAlreadyInited() { +CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() { return ::google::protobuf::internal::GetEmptyStringAlreadyInited(); } diff --git a/tools/autograd/templates/Functions.h b/tools/autograd/templates/Functions.h index 7f3e5f9c05509..ae95bf7197770 100644 --- a/tools/autograd/templates/Functions.h +++ b/tools/autograd/templates/Functions.h @@ -5,6 +5,7 @@ #include #include +#include "torch/csrc/THP_export.h" #include "torch/csrc/autograd/function.h" #include "torch/csrc/autograd/variable.h" #include "torch/csrc/autograd/saved_variable.h" diff --git a/torch/csrc/autograd/engine.h b/torch/csrc/autograd/engine.h index 734e5b9da8104..db8b3357ac253 100644 --- a/torch/csrc/autograd/engine.h +++ b/torch/csrc/autograd/engine.h @@ -3,6 +3,7 @@ // Engine implements backpropagation from output variables and their gradients // to "root" variables (variables created by the user with requires_grad=True). +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/autograd/function.h" #include "torch/csrc/autograd/input_buffer.h" #include "torch/csrc/autograd/anomaly_mode.h" @@ -24,7 +25,7 @@ struct GraphTask; namespace torch { namespace autograd { // A single instance of this struct should be created through the whole process lifetime. // The worker thread creation logic and Engine's destructor rely on this. -struct Engine { +struct TORCH_API Engine { /// Returns a reference to a static `Engine` instance. static Engine& get_default_engine(); @@ -67,6 +68,6 @@ struct Engine { // allow python_engine to override the default engine when it loads typedef Engine& (*EngineStub)(void); -void set_default_engine_stub(EngineStub stub); +TORCH_API void set_default_engine_stub(EngineStub stub); }} // namespace torch::autograd diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index b65a7063f15af..aea5a03201fd4 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -1,6 +1,7 @@ #pragma once #include "torch/csrc/assertions.h" +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/autograd/edge.h" #include "torch/csrc/autograd/grad_mode.h" #include "torch/csrc/autograd/anomaly_mode.h" @@ -84,7 +85,7 @@ void deleteFunction(Function* function); /// are created in one thread and `C` is created in a new thread, there are *no /// guarantees* w.r.t. the ordering of `C` relative to `A` or `B`. ///~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -struct Function : std::enable_shared_from_this { +struct TORCH_API Function : std::enable_shared_from_this { public: /// Construct a new `Function` with `num_inputs` inputs and the given /// `next_edges`. sequence_nr is a (currently THE) hint to prioritization diff --git a/torch/csrc/autograd/functions/basic_ops.h b/torch/csrc/autograd/functions/basic_ops.h index 7c92b42a11b05..ad7a9e52d3757 100644 --- a/torch/csrc/autograd/functions/basic_ops.h +++ b/torch/csrc/autograd/functions/basic_ops.h @@ -1,5 +1,6 @@ #pragma once +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/autograd/function.h" #include "torch/csrc/autograd/variable.h" #include "torch/csrc/autograd/symbolic.h" @@ -10,7 +11,7 @@ namespace torch { namespace autograd { -struct Error : public Function { +struct TORCH_API Error : public Function { Error(std::string msg, edge_list&& next_edges) : Function(std::move(next_edges)) , msg(std::move(msg)) {} @@ -24,7 +25,7 @@ struct Error : public Function { }; // Identity in forward, Error in backward. 
Used to implement @once_differentiable -struct DelayedError : public Function { +struct TORCH_API DelayedError : public Function { DelayedError(std::string msg, int num_inputs) : msg(std::move(msg)) { for (int i = 0; i < num_inputs; i++) @@ -36,7 +37,7 @@ struct DelayedError : public Function { std::string msg; }; -struct GraphRoot : public Function { +struct TORCH_API GraphRoot : public Function { GraphRoot(edge_list functions, variable_list inputs) : Function(std::move(functions)), outputs(std::move(inputs)) {} diff --git a/torch/csrc/autograd/functions/utils.h b/torch/csrc/autograd/functions/utils.h index a1b7ee74bf8ec..bad48b221eaf0 100644 --- a/torch/csrc/autograd/functions/utils.h +++ b/torch/csrc/autograd/functions/utils.h @@ -1,5 +1,6 @@ #pragma once +#include "torch/csrc/WindowsTorchApiMacro.h" #include #include #include @@ -18,12 +19,12 @@ using function_constructor = std::function(edge_list&& * Wraps the tensor outputs in variables and creates the grad_fn and sets the * grad_fn if necessary. */ -variable_list wrap_outputs(const variable_list& inputs, tensor_list&& outputs, - function_constructor ctr); +TORCH_API variable_list wrap_outputs(const variable_list& inputs, tensor_list&& outputs, + function_constructor ctr); /// Checks that inputs contains exactly `args` items and that the first `required_args` /// items are not nullptr. If not specified, `required_args` defaults to `args`. -void check_input_variables(const char* name, const variable_list& inputs, int args, int required_args=-1); +TORCH_API void check_input_variables(const char* name, const variable_list& inputs, int args, int required_args=-1); struct ComputeRequiresGrad : IterArgs { bool out = false; diff --git a/torch/csrc/autograd/grad_mode.h b/torch/csrc/autograd/grad_mode.h index e7d15446bee1f..ebdc86e74ca70 100644 --- a/torch/csrc/autograd/grad_mode.h +++ b/torch/csrc/autograd/grad_mode.h @@ -1,15 +1,17 @@ #pragma once +#include "torch/csrc/WindowsTorchApiMacro.h" + namespace torch { namespace autograd { -struct GradMode { +struct TORCH_API GradMode { static bool is_enabled(); static void set_enabled(bool enabled); }; // A RAII, thread local (!) guard that enables or disables grad mode upon // construction, and sets it back to the original value upon destruction. -struct AutoGradMode { +struct TORCH_API AutoGradMode { AutoGradMode(bool enabled) : prev_mode(GradMode::is_enabled()) { GradMode::set_enabled(enabled); } diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index c3abdfc3b8f2b..dd77dc193ba9b 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -15,6 +15,7 @@ #include #include #include "ATen/ATen.h" +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/cuda/cuda_check.h" #ifdef USE_CUDA #include "ATen/cuda/CUDAContext.h" @@ -163,12 +164,12 @@ enum class ProfilerState { NVTX, // only emit NVTX markers }; -RangeEventList& getEventList(); -void mark(std::string name, bool include_cuda = true); -void pushRange(std::string name); -void popRange(); +TORCH_API RangeEventList& getEventList(); +TORCH_API void mark(std::string name, bool include_cuda = true); +TORCH_API void pushRange(std::string name); +TORCH_API void popRange(); -struct RecordFunction { +struct TORCH_API RecordFunction { explicit RecordFunction(Function* fn); explicit RecordFunction(std::string name); @@ -184,8 +185,8 @@ struct RecordFunction { using thread_event_lists = std::vector>; // NOTE: changing profiler modes is **NOT THREAD SAFE**. 
You should ensure that // there no autograd functions are being executed when these function are used. -void enableProfiler(ProfilerState state); -thread_event_lists disableProfiler(); +TORCH_API void enableProfiler(ProfilerState state); +TORCH_API thread_event_lists disableProfiler(); } // namespace profiler }} // namespace torch::autograd diff --git a/torch/csrc/autograd/saved_variable.h b/torch/csrc/autograd/saved_variable.h index ff5a36ba04c03..61a1d3b3eac17 100644 --- a/torch/csrc/autograd/saved_variable.h +++ b/torch/csrc/autograd/saved_variable.h @@ -18,7 +18,7 @@ TORCH_API extern const char* ERR_BACKWARD_TWICE; /// A snapshot of a variable at a certain version. A `SavedVariable` stores /// enough information to reconstruct a variable from a certain point in time. -class SavedVariable { +class TORCH_API SavedVariable { public: SavedVariable() = default; SavedVariable(const Variable& variable, bool is_output); diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index 2def489e3ae54..6b40eea4a2561 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -77,7 +77,7 @@ struct Function; /// free function instead. To create a view variable, use `make_variable_view`. ///~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -struct Variable : public at::Tensor { +struct TORCH_API Variable : public at::Tensor { /// Default constructor. Variable() = default; diff --git a/torch/csrc/jit/autodiff.h b/torch/csrc/jit/autodiff.h index e0dd63c925ef2..6dd2be9db0e77 100644 --- a/torch/csrc/jit/autodiff.h +++ b/torch/csrc/jit/autodiff.h @@ -1,5 +1,6 @@ #pragma once +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/jit/ir.h" #include @@ -84,11 +85,11 @@ struct Gradient { }; // XXX: When calling this function, graph should have complete type information. // Use the shape analysis pass to fill in the gaps if it doesn't. -Gradient differentiate(std::shared_ptr& graph, const std::vector& requires_grad); +TORCH_API Gradient differentiate(std::shared_ptr& graph, const std::vector& requires_grad); // can we take a derivative of this node symbolically? -bool isDifferentiable(Node * n); -bool isDifferentiable(Graph & g); -bool isZero(Value * v); +TORCH_API bool isDifferentiable(Node * n); +TORCH_API bool isDifferentiable(Graph & g); +TORCH_API bool isZero(Value * v); }} diff --git a/torch/csrc/jit/export.h b/torch/csrc/jit/export.h index 95758dc9aa392..d0c6212a324a8 100644 --- a/torch/csrc/jit/export.h +++ b/torch/csrc/jit/export.h @@ -15,7 +15,7 @@ namespace torch { namespace jit { // file contents being the raw tensor data. 
using RawDataExportMap = std::unordered_map; -std::tuple ExportGraph( +TORCH_API std::tuple ExportGraph( const std::shared_ptr& graph, const std::vector& initializers, int64_t onnx_opset_version, @@ -24,7 +24,7 @@ std::tuple ExportGraph( = ::torch::onnx::OperatorExportTypes::ONNX); // For testing purposes -std::string PrettyPrintExportedGraph( +TORCH_API std::string PrettyPrintExportedGraph( const std::shared_ptr& graph, const std::vector & initializers, int64_t onnx_opset_version, diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h index affcd38a065c9..d78076ab6484f 100644 --- a/torch/csrc/jit/graph_executor.h +++ b/torch/csrc/jit/graph_executor.h @@ -33,7 +33,7 @@ struct GraphExecutorState { }; struct GraphExecutorImpl; -struct GraphExecutor { +struct TORCH_API GraphExecutor { GraphExecutor() {} GraphExecutor(std::shared_ptr graph, bool optimize = true); // note: if not specified, symbolically_differentiable is computed from the graph. @@ -51,17 +51,17 @@ struct GraphExecutor { // These passes need to run before it is valid to pass to the interpreter // regardless of whether sizes have been specialized or not. -void runRequiredPasses(const std::shared_ptr& g); +TORCH_API void runRequiredPasses(const std::shared_ptr& g); // specialize 'graph' to the types, sizes, and other properties described in spec // this prepares the graph for execution, including running runRequiredPasses, // but the execution only remains valid for tensors whose properties match spec // otherwise running the graph will have undefined results. -void specializeToSpec(const std::shared_ptr& graph, const ArgumentSpec& spec); +TORCH_API void specializeToSpec(const std::shared_ptr& graph, const ArgumentSpec& spec); // apply standard optimizations. if graphMustSupportVariables=false then // then the passes are allowed to modify the graph in ways that make it no longer // work with tensors that have requires_grad=True -void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariables); +TORCH_API void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariables); }} diff --git a/torch/csrc/jit/import.h b/torch/csrc/jit/import.h index fa1878d827b53..d593896f2c792 100644 --- a/torch/csrc/jit/import.h +++ b/torch/csrc/jit/import.h @@ -4,6 +4,6 @@ namespace torch { namespace jit { -std::shared_ptr ImportIRGraph(const std::string& serialized_graph, std::vector & initializers); +TORCH_API std::shared_ptr ImportIRGraph(const std::string& serialized_graph, std::vector & initializers); }} diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index b61a49be846dc..a0d6a7a7fef50 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -5,6 +5,7 @@ #include #include +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/jit/generated/aten_interned_strings.h" namespace torch { namespace jit { @@ -132,7 +133,7 @@ static const std::string domain_prefix = "org.pytorch."; // A Symbol is like an interned string, but with a little extra // structure; it is namespaced via SymbolNamespace and the resulting // intern pointers support efficient namespace testing. 
-struct Symbol { +struct TORCH_API Symbol { explicit constexpr Symbol() : value(0) {}; explicit constexpr Symbol(unique_t uniq) : value(uniq) {} diff --git a/torch/csrc/jit/interpreter.h b/torch/csrc/jit/interpreter.h index b9085528fbb44..28b3404cf463f 100644 --- a/torch/csrc/jit/interpreter.h +++ b/torch/csrc/jit/interpreter.h @@ -3,6 +3,8 @@ #include #include "ATen/optional.h" +#include "torch/csrc/WindowsTorchApiMacro.h" + namespace at { struct Tensor; } @@ -22,7 +24,7 @@ struct TensorType; struct IValue; using Stack = std::vector; -struct Code { +struct TORCH_API Code { Code() : pImpl(nullptr) {} Code(std::shared_ptr& graph); diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 153828497d1bf..a2f71f702bc9b 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -16,6 +16,7 @@ #include "torch/csrc/utils/python_stub.h" #include "torch/csrc/assertions.h" +#include "torch/csrc/WindowsTorchApiMacro.h" #include #include "ATen/ArrayRef.h" @@ -51,9 +52,9 @@ struct Node; // Tensor or an opaque Handle object, as determined by type(). struct Value; -std::ostream& operator<<(std::ostream & out, const Graph & g); -std::ostream& operator<<(std::ostream & out, const Type & t); -std::ostream& operator<<(std::ostream & out, const Node & t); +TORCH_API std::ostream& operator<<(std::ostream & out, const Graph & g); +TORCH_API std::ostream& operator<<(std::ostream & out, const Type & t); +TORCH_API std::ostream& operator<<(std::ostream & out, const Node & t); // A list of nodes, with inputs and outputs struct Block; @@ -195,7 +196,7 @@ struct Value { bool hasUniqueName() const { return unique_name_ != ""; } - Value* setUniqueName(const std::string & name); + TORCH_API Value* setUniqueName(const std::string & name); std::string uniqueName() const { if (hasUniqueName()) return unique_name_; @@ -813,7 +814,7 @@ struct Block { // to the inputs, nodes, and outputs of this block // value_map is used whenever a node in src references a free variable // in src to look up its corresponding value - void cloneFrom(Block * src, std::function value_map); + TORCH_API void cloneFrom(Block * src, std::function value_map); private: // should only be called in the constructor Node* initOutput(Node* p) { @@ -1069,9 +1070,9 @@ friend struct Block; } // Checks well-formedness and invariants of graph - void lint() const; + TORCH_API void lint() const; // for use in debugger - void dump() const; + TORCH_API void dump() const; ~Graph() { for (const Node * n : all_nodes) @@ -1089,7 +1090,7 @@ friend struct Block; } friend std::ostream& operator<<(std::ostream & out, const Graph & g); - std::shared_ptr copy(); + TORCH_API std::shared_ptr copy(); private: @@ -1338,8 +1339,8 @@ struct PythonOp : public Node { }; // patched in when python bindings are loaded -PythonOp* allocPythonOp(Graph* g); -void setAllocPythonOp(PythonOp* (*v)(Graph* g)); +TORCH_API PythonOp* allocPythonOp(Graph* g); +TORCH_API void setAllocPythonOp(PythonOp* (*v)(Graph* g)); inline Node* Graph::createPythonOp( THPObjectPtr&& pyobj, @@ -1365,6 +1366,6 @@ inline const_graph_node_list_iterator Node::reverseIterator() const { return iterator().reverse(); } -void LintGraph(std::shared_ptr& graph); +TORCH_API void LintGraph(std::shared_ptr& graph); }} // namespace torch::jit diff --git a/torch/csrc/jit/operator.h b/torch/csrc/jit/operator.h index 50ec147090be3..74f0ec95d8bce 100644 --- a/torch/csrc/jit/operator.h +++ b/torch/csrc/jit/operator.h @@ -13,7 +13,7 @@ FunctionSchema parseSchema(const std::string& decl); using OperationCreator = std::function; 
-struct Operator {
+struct TORCH_API Operator {
   Operator(FunctionSchema schema, OperationCreator op, OperationCreator op_const_attributes = nullptr)
   : schema(std::move(schema))
   , op(std::move(op))
@@ -65,7 +65,7 @@ void registerOperator(Operator&& op);
 // XXX: this function is meant to be used with string literals only!
 Operator& sig(const char *signature_literal);
 
-struct RegisterOperators {
+struct TORCH_API RegisterOperators {
   RegisterOperators(std::vector<Operator> operators) {
     for(Operator& o : operators) {
       registerOperator(std::move(o));
diff --git a/torch/csrc/jit/passes/batch_mm.h b/torch/csrc/jit/passes/batch_mm.h
index efb635d2bbc93..06bb34a2c1eae 100644
--- a/torch/csrc/jit/passes/batch_mm.h
+++ b/torch/csrc/jit/passes/batch_mm.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-void BatchMM(std::shared_ptr<Graph>& graph);
+TORCH_API void BatchMM(std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/canonicalize.h b/torch/csrc/jit/passes/canonicalize.h
index 09a98832bda74..1430432d3317b 100644
--- a/torch/csrc/jit/passes/canonicalize.h
+++ b/torch/csrc/jit/passes/canonicalize.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-std::shared_ptr<Graph> Canonicalize(const std::shared_ptr<Graph>& graph);
+TORCH_API std::shared_ptr<Graph> Canonicalize(const std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/common_subexpression_elimination.h b/torch/csrc/jit/passes/common_subexpression_elimination.h
index 483c573d3d157..64ae4f6bd9ca8 100644
--- a/torch/csrc/jit/passes/common_subexpression_elimination.h
+++ b/torch/csrc/jit/passes/common_subexpression_elimination.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-void EliminateCommonSubexpression(std::shared_ptr<Graph>& graph);
+TORCH_API void EliminateCommonSubexpression(std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/create_autodiff_subgraphs.h b/torch/csrc/jit/passes/create_autodiff_subgraphs.h
index 758225769e8b5..b76ee82e52970 100644
--- a/torch/csrc/jit/passes/create_autodiff_subgraphs.h
+++ b/torch/csrc/jit/passes/create_autodiff_subgraphs.h
@@ -8,6 +8,6 @@ struct Graph;
 // insert GraphExecutor nodes that group together
 // subgraphs that are differentiable by the jit's autodiff passes
 // threshold - minimum number of nodes that will appear in a block
-void CreateAutodiffSubgraphs(Graph & graph, size_t threshold = 2);
+TORCH_API void CreateAutodiffSubgraphs(Graph & graph, size_t threshold = 2);
 
 }}
diff --git a/torch/csrc/jit/passes/dead_code_elimination.h b/torch/csrc/jit/passes/dead_code_elimination.h
index 51c92ab07200d..9ae89f9b56150 100644
--- a/torch/csrc/jit/passes/dead_code_elimination.h
+++ b/torch/csrc/jit/passes/dead_code_elimination.h
@@ -4,7 +4,7 @@ namespace torch { namespace jit {
 
-void EliminateDeadCode(const std::shared_ptr<Graph>& graph);
-void EliminateDeadCode(Block *block, bool recurse=true);
+TORCH_API void EliminateDeadCode(const std::shared_ptr<Graph>& graph);
+TORCH_API void EliminateDeadCode(Block *block, bool recurse=true);
 
 }}
diff --git a/torch/csrc/jit/passes/decompose_addmm.cpp b/torch/csrc/jit/passes/decompose_addmm.cpp
index 85ad0770df77b..1a0dd94f9960d 100644
--- a/torch/csrc/jit/passes/decompose_addmm.cpp
+++ b/torch/csrc/jit/passes/decompose_addmm.cpp
@@ -1,4 +1,5 @@
 #include "torch/csrc/jit/passes/dead_code_elimination.h"
+#include "torch/csrc/jit/passes/decompose_addmm.h"
 #include "torch/csrc/jit/symbolic_variable.h"
 #include "torch/csrc/jit/tensor_conversions.h"
diff --git a/torch/csrc/jit/passes/decompose_addmm.h b/torch/csrc/jit/passes/decompose_addmm.h
index 500d4a2e69f57..1153dcac04196 100644
--- a/torch/csrc/jit/passes/decompose_addmm.h
+++ b/torch/csrc/jit/passes/decompose_addmm.h
@@ -9,6 +9,6 @@ namespace torch { namespace jit {
 //
 // In the future, if we need more passes like this, we should convert this
 // into a generic canonicalization pass.
-void DecomposeAddmm(const std::shared_ptr<Graph>& graph);
+TORCH_API void DecomposeAddmm(const std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/erase_number_types.h b/torch/csrc/jit/passes/erase_number_types.h
index a736346dfffce..5ec43ce575b86 100644
--- a/torch/csrc/jit/passes/erase_number_types.h
+++ b/torch/csrc/jit/passes/erase_number_types.h
@@ -15,6 +15,6 @@ namespace torch { namespace jit {
 // - prim::TensorToNum, and prim::NumToTensor nodes are erased.
 //
 // The pass assumes that DCE will be called sometime after.
-void EraseNumberTypes(const std::shared_ptr<Graph>& graph);
+TORCH_API void EraseNumberTypes(const std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/graph_fuser.h b/torch/csrc/jit/passes/graph_fuser.h
index 17e730ea67566..1c1bbfe292ff5 100644
--- a/torch/csrc/jit/passes/graph_fuser.h
+++ b/torch/csrc/jit/passes/graph_fuser.h
@@ -6,6 +6,6 @@ namespace torch { namespace jit {
 // NB: Be sure to run DCE before fusion, because dead instructions
 // can prevent fusion opportunities from being exploited.
-void FuseGraph(std::shared_ptr<Graph>& graph);
+TORCH_API void FuseGraph(std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/inplace_check.h b/torch/csrc/jit/passes/inplace_check.h
index cae7805188b7b..3ad296b646898 100644
--- a/torch/csrc/jit/passes/inplace_check.h
+++ b/torch/csrc/jit/passes/inplace_check.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-void CheckInplace(std::shared_ptr<Graph>& graph);
+TORCH_API void CheckInplace(std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/loop_unrolling.h b/torch/csrc/jit/passes/loop_unrolling.h
index 4ca1fd761f945..d95a179524743 100644
--- a/torch/csrc/jit/passes/loop_unrolling.h
+++ b/torch/csrc/jit/passes/loop_unrolling.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-void UnrollLoops(std::shared_ptr<Graph>& graph);
+TORCH_API void UnrollLoops(std::shared_ptr<Graph>& graph);
 
 }} // namespace torch::jit
diff --git a/torch/csrc/jit/passes/lower_grad_of.h b/torch/csrc/jit/passes/lower_grad_of.h
index 104adcb047130..a0a881e3002ed 100644
--- a/torch/csrc/jit/passes/lower_grad_of.h
+++ b/torch/csrc/jit/passes/lower_grad_of.h
@@ -10,6 +10,6 @@ namespace torch { namespace jit {
 //   outputs = <original_computation>
 // else:
 //   outputs = undefineds
-void LowerGradOf(Graph& graph);
+TORCH_API void LowerGradOf(Graph& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/lower_tuples.h b/torch/csrc/jit/passes/lower_tuples.h
index 6dda8c2028bc9..d4eed60c1c145 100644
--- a/torch/csrc/jit/passes/lower_tuples.h
+++ b/torch/csrc/jit/passes/lower_tuples.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-void LowerTuples(std::shared_ptr<Graph>& graph);
+TORCH_API void LowerTuples(std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/onnx.h b/torch/csrc/jit/passes/onnx.h
index a58d421a458d2..a02be6906542c 100644
--- a/torch/csrc/jit/passes/onnx.h
+++ b/torch/csrc/jit/passes/onnx.h
@@ -5,7 +5,7 @@ namespace torch { namespace jit {
 
-std::shared_ptr<Graph> ToONNX(std::shared_ptr<Graph>& state, ::torch::onnx::OperatorExportTypes operator_export_type);
-void BlockToONNX(Block* old_block, Block* new_block, ::torch::onnx::OperatorExportTypes operator_export_type, std::unordered_map<Value*, Value*> env);
+TORCH_API std::shared_ptr<Graph> ToONNX(std::shared_ptr<Graph>& state, ::torch::onnx::OperatorExportTypes operator_export_type);
+TORCH_API void BlockToONNX(Block* old_block, Block* new_block, ::torch::onnx::OperatorExportTypes operator_export_type, std::unordered_map<Value*, Value*> env);
 
 }}
diff --git a/torch/csrc/jit/passes/peephole.h b/torch/csrc/jit/passes/peephole.h
index 2ab80a221dc09..c2a6af708fcc5 100644
--- a/torch/csrc/jit/passes/peephole.h
+++ b/torch/csrc/jit/passes/peephole.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-void PeepholeOptimize(std::shared_ptr<Graph>& graph);
+TORCH_API void PeepholeOptimize(std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/remove_expands.cpp b/torch/csrc/jit/passes/remove_expands.cpp
index 8c679db8c2e52..f0f591cac59ec 100644
--- a/torch/csrc/jit/passes/remove_expands.cpp
+++ b/torch/csrc/jit/passes/remove_expands.cpp
@@ -1,3 +1,4 @@
+#include "torch/csrc/jit/passes/remove_expands.h"
 #include "torch/csrc/jit/passes/dead_code_elimination.h"
 
 namespace torch { namespace jit {
diff --git a/torch/csrc/jit/passes/remove_expands.h b/torch/csrc/jit/passes/remove_expands.h
index 06885609f684f..ebcd4634c2f46 100644
--- a/torch/csrc/jit/passes/remove_expands.h
+++ b/torch/csrc/jit/passes/remove_expands.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-void RemoveExpands(const std::shared_ptr<Graph>& graph);
+TORCH_API void RemoveExpands(const std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/shape_analysis.h b/torch/csrc/jit/passes/shape_analysis.h
index 4a36406758872..1b38cbbe5739a 100644
--- a/torch/csrc/jit/passes/shape_analysis.h
+++ b/torch/csrc/jit/passes/shape_analysis.h
@@ -1,8 +1,10 @@
 #pragma once
 
+#include "torch/csrc/WindowsTorchApiMacro.h"
+
 namespace torch { namespace jit {
 
 struct Graph;
 struct ArgumentSpec;
-void PropagateInputShapes(Graph & graph, const ArgumentSpec & spec);
+TORCH_API void PropagateInputShapes(Graph & graph, const ArgumentSpec & spec);
 
 }}
diff --git a/torch/csrc/jit/passes/specialize_undef.h b/torch/csrc/jit/passes/specialize_undef.h
index c468ceb7e00a9..54b68683eaf17 100644
--- a/torch/csrc/jit/passes/specialize_undef.h
+++ b/torch/csrc/jit/passes/specialize_undef.h
@@ -11,6 +11,6 @@ namespace torch { namespace jit {
 // operations generated by the symbolic autodiff code and cleans up
 // AutogradAdds when possible. Outputs of other nodes are conservatively
 // marked Unknown and not optimized.
-void specializeUndef(Graph & g, const std::vector<bool>& defined);
+TORCH_API void specializeUndef(Graph & g, const std::vector<bool>& defined);
 
 }}
diff --git a/torch/csrc/jit/passes/to_batch.h b/torch/csrc/jit/passes/to_batch.h
index 1d3113cd8cddc..23c23a0632b31 100644
--- a/torch/csrc/jit/passes/to_batch.h
+++ b/torch/csrc/jit/passes/to_batch.h
@@ -11,9 +11,9 @@ class ToBatch {
   std::unordered_map<Value*, std::vector<Value*>> batch_map;
 public:
   static std::unordered_map<std::string, std::vector<std::shared_ptr<Graph>>> batch_operator_table;
-  void toBatch(Block* block, Block* res_block);
+  TORCH_API void toBatch(Block* block, Block* res_block);
 };
 
-std::shared_ptr<Graph> to_batch_graph(std::shared_ptr<Graph>& graph);
-void initRegisterBatchOpsBindings(PyObject* module);
+TORCH_API std::shared_ptr<Graph> to_batch_graph(std::shared_ptr<Graph>& graph);
+TORCH_API void initRegisterBatchOpsBindings(PyObject* module);
 }}
diff --git a/torch/csrc/jit/script/compiler.h b/torch/csrc/jit/script/compiler.h
index f5b80e1264524..e7f115740c0a2 100644
--- a/torch/csrc/jit/script/compiler.h
+++ b/torch/csrc/jit/script/compiler.h
@@ -83,7 +83,7 @@ struct SugaredValue : public std::enable_shared_from_this<SugaredValue> {
 // most things in the environment are just simple value types
 // and not special python syntax sugar types
-struct SimpleValue : public SugaredValue {
+struct TORCH_API SimpleValue : public SugaredValue {
   SimpleValue(Value * value)
   : value(value) {}
   virtual std::string kind() const override {
@@ -101,7 +101,7 @@ struct SimpleValue : public SugaredValue {
   Value* value;
 };
 
-struct BuiltinFunction : public SugaredValue {
+struct TORCH_API BuiltinFunction : public SugaredValue {
   BuiltinFunction(const std::string& name, at::optional<NamedValue> value)
   : name(name), value(std::move(value)) {}
   std::string name;
@@ -121,7 +121,7 @@ struct BuiltinFunction : public SugaredValue {
 };
 
 using Resolver = std::function<std::shared_ptr<SugaredValue>(const std::string& name)>;
-void defineMethodsInModule(
+TORCH_API void defineMethodsInModule(
   Module & m,
   const std::vector<Def>& definitions,
   const std::vector<Resolver>& resolvers, /* determines how we handle free variables in each definition*/
@@ -129,20 +129,20 @@ void defineMethodsInModule(
 );
 
 // same as above but parse the definitions from source
-void defineMethodsInModule(Module & m, const std::string& source, const Resolver& resolver, std::shared_ptr<SugaredValue> self);
-std::shared_ptr<Graph> compileFunction(Def def, const Resolver& resolver);
+TORCH_API void defineMethodsInModule(Module & m, const std::string& source, const Resolver& resolver, std::shared_ptr<SugaredValue> self);
+TORCH_API std::shared_ptr<Graph> compileFunction(Def def, const Resolver& resolver);
 
 // pack outputs of a function following python rules. If there is a single value return
 // a SimpleValue, otherwise pack all the values into a Tuple.
-std::shared_ptr<SugaredValue> packOutputs(Graph& g, at::ArrayRef<Value*> values);
-std::vector<Value*> inlineCallTo(Graph& g, Graph& callee, ArrayRef<Value*> inputs);
-void ensureSizeMatches(SourceRange loc, size_t expected, size_t actual, const std::string& what);
-void ensureTensors(const SourceRange& range, at::ArrayRef<Value*> values);
+TORCH_API std::shared_ptr<SugaredValue> packOutputs(Graph& g, at::ArrayRef<Value*> values);
+TORCH_API std::vector<Value*> inlineCallTo(Graph& g, Graph& callee, ArrayRef<Value*> inputs);
+TORCH_API void ensureSizeMatches(SourceRange loc, size_t expected, size_t actual, const std::string& what);
+TORCH_API void ensureTensors(const SourceRange& range, at::ArrayRef<Value*> values);
 
 // try to match a list if inputs and keyword 'attributes' to this schema,
 // if it works return the flat list of positional inputs to the call
 // if it returns nullopt, then failure_messages contains a good error report
-at::optional<std::vector<Value*>> tryMatchSchema(
+TORCH_API at::optional<std::vector<Value*>> tryMatchSchema(
   const FunctionSchema& schema,
   const SourceRange& loc,
   Graph& graph,
diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp
index 7b784f093330b..9aa31e0b98bbf 100644
--- a/torch/csrc/jit/test_jit.cpp
+++ b/torch/csrc/jit/test_jit.cpp
@@ -953,7 +953,7 @@ void testProto() {
   proto.set_producer_name("foo");
 }
 
-std::string runJITCPPTests() {
+TORCH_API std::string runJITCPPTests() {
   std::stringstream out;
   testIValue();
   testControlFlow();
diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h
index 2b8f32e8034f9..7198dd1aba150 100644
--- a/torch/csrc/jit/tracer.h
+++ b/torch/csrc/jit/tracer.h
@@ -2,6 +2,7 @@
 #include "torch/csrc/jit/ir.h"
 #include "torch/csrc/assertions.h"
+#include "torch/csrc/WindowsTorchApiMacro.h"
 #include "torch/csrc/utils/functional.h"
 #include "torch/csrc/utils/variadic.h"
 #include "torch/csrc/autograd/function_hook.h"
@@ -19,7 +20,7 @@ namespace torch { namespace jit { namespace tracer {
 using torch::autograd::Variable;
 using variable_list = std::vector<Variable>;
 
-struct TracingState : public std::enable_shared_from_this<TracingState> {
+struct TORCH_API TracingState : public std::enable_shared_from_this<TracingState> {
   TracingState();
   ~TracingState();
@@ -59,10 +60,10 @@ struct ArgumentStash {
     return stash.intlists.empty();
   }
 
-  static void stashIntListElem(const std::string& arg_name,
-                               size_t size,
-                               size_t idx,
-                               const Variable& var);
+  TORCH_API static void stashIntListElem(const std::string& arg_name,
+                                         size_t size,
+                                         size_t idx,
+                                         const Variable& var);
 
   static bool hasIntList(const std::string& arg_name) {
     return stash.intlists.count(arg_name) > 0;
@@ -80,8 +81,8 @@ struct ArgumentStash {
 };
 
 // Retrieve or set the current tracing state. Returns a nullptr if tracing is disabled.
-const std::shared_ptr<TracingState>& getTracingState();
-void setTracingState(std::shared_ptr<TracingState> state);
+TORCH_API const std::shared_ptr<TracingState>& getTracingState();
+TORCH_API void setTracingState(std::shared_ptr<TracingState> state);
 
 inline bool isTracing() {
   return static_cast<bool>(getTracingState());
@@ -191,11 +192,11 @@ struct PreTraceInfo {
   Node *n;
 };
 
-PreTraceInfo preRecordTrace(Symbol op, at::ArrayRef<Variable> inputs);
-void postRecordTrace(const PreTraceInfo& info, at::ArrayRef<Variable> outputs);
+TORCH_API PreTraceInfo preRecordTrace(Symbol op, at::ArrayRef<Variable> inputs);
+TORCH_API void postRecordTrace(const PreTraceInfo& info, at::ArrayRef<Variable> outputs);
 
-void recordSourceLocation(Node* n);
-void setRecordSourceLocation(void (*v)(Node*));
+TORCH_API void recordSourceLocation(Node* n);
+TORCH_API void setRecordSourceLocation(void (*v)(Node*));
 
 // We must record the nodes of inputs before we actually carry out
 // the operation, because an inplace operation may destroy the information
@@ -221,6 +222,6 @@ PreTraceInfo makePreTraceInfo(at::ArrayRef<Variable> inputs, F ctor) {
   return info;
 }
 
-autograd::Variable getSizeOf(const autograd::Variable& var, int64_t dim);
+TORCH_API autograd::Variable getSizeOf(const autograd::Variable& var, int64_t dim);
 
 }}} // namespace torch::jit::tracer
diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h
index 18949f28dcddc..fe06ec7833a11 100644
--- a/torch/csrc/jit/type.h
+++ b/torch/csrc/jit/type.h
@@ -2,6 +2,7 @@
 #include "torch/csrc/jit/interned_strings.h"
 #include "torch/csrc/assertions.h"
+#include "torch/csrc/WindowsTorchApiMacro.h"
 
 #include <memory>
@@ -29,7 +30,7 @@ struct Type;
 using TypePtr = std::shared_ptr<Type>;
 
-struct Type : std::enable_shared_from_this<Type> {
+struct TORCH_API Type : std::enable_shared_from_this<Type> {
 private:
   TypeKind kind_;
@@ -89,7 +90,7 @@ inline bool operator!=(const Type & lhs, const Type & rhs) {
 }
 
 // This node represents a single Tensor value, with an unknown shape.
-struct DynamicType : public Type {
+struct TORCH_API DynamicType : public Type {
   DynamicType() : Type(TypeKind::DynamicType) {}
 
   bool operator==(const Type& rhs) const override {
@@ -106,7 +107,7 @@ struct DynamicType : public Type {
 struct TensorType;
 using TensorTypePtr = std::shared_ptr<TensorType>;
 // This node represents a single Tensor value with a specific size
-struct TensorType : public Type {
+struct TORCH_API TensorType : public Type {
   friend struct Type;
   TensorType(const at::Tensor& tensor)
     : Type(TypeKind::TensorType)
@@ -185,7 +186,7 @@ struct TensorType : public Type {
   std::vector<int64_t> strides_;
 };
 
-struct ListType : public Type {
+struct TORCH_API ListType : public Type {
   friend struct Type;
   static const TypeKind Kind = TypeKind::ListType;
   ListType(TypePtr elem)
@@ -211,7 +212,7 @@ struct ListType : public Type {
   TypePtr elem;
 };
 
-struct TupleType : public Type {
+struct TORCH_API TupleType : public Type {
   friend struct Type;
   TupleType(std::vector<TypePtr> elements_)
   : Type(TypeKind::TupleType)
@@ -268,7 +269,7 @@ struct TupleType : public Type {
 };
 
 // This node represents a Python number value
-struct NumberType : public Type {
+struct TORCH_API NumberType : public Type {
   NumberType() : Type(TypeKind::NumberType) {}
 
   bool operator==(const Type& rhs) const override {
@@ -283,7 +284,7 @@ struct NumberType : public Type {
 };
 
 // This node represents a Python float number value
-struct FloatType : public Type {
+struct TORCH_API FloatType : public Type {
  FloatType() : Type(TypeKind::FloatType) {}
 
   bool operator==(const Type& rhs) const override {
@@ -301,7 +302,7 @@ struct FloatType : public Type {
 };
 
 // This node represents a Python int number value
-struct IntType : public Type {
+struct TORCH_API IntType : public Type {
   IntType() : Type(TypeKind::IntType) {}
 
   bool operator==(const Type& rhs) const override {
@@ -319,6 +320,6 @@ struct IntType : public Type {
 };
 
-std::ostream& operator<<(std::ostream & out, const Type & t);
+TORCH_API std::ostream& operator<<(std::ostream & out, const Type & t);
 
 }} // namespace torch::jit
diff --git a/torch/csrc/onnx/onnx.h b/torch/csrc/onnx/onnx.h
index 0487e5f7fe077..760e86b8280bb 100644
--- a/torch/csrc/onnx/onnx.h
+++ b/torch/csrc/onnx/onnx.h
@@ -2,6 +2,7 @@
 #include "torch/csrc/onnx/onnx.npb.h"
 #include "torch/csrc/assertions.h"
+#include "torch/csrc/WindowsTorchApiMacro.h"
 #include
 #include
@@ -417,7 +418,7 @@ class ModelProto : public MicroProto<onnx_ModelProto> {
     opset_import.emplace_back(ptr);
     return ptr;
   }
-  void dump(std::ostream& stream, size_t indent = 0);
+  TORCH_API void dump(std::ostream& stream, size_t indent = 0);
   std::string prettyPrint() {
     std::stringstream ss;
     dump(ss, 0);
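
Note on the annotations above: `TORCH_API`, pulled in via `torch/csrc/WindowsTorchApiMacro.h`, marks symbols that must cross the shared-library boundary, which matters on Windows where DLL symbols are not exported by default. The snippet below is only a minimal sketch of how such an export macro is commonly defined, not the actual contents of that header; the `torch_EXPORTS` guard name is an assumption.

```
// Hedged sketch of a TORCH_API-style export macro (assumed, not the real header).
// torch_EXPORTS is an assumed compile definition set only while building libtorch.
#ifdef _WIN32
#  ifdef torch_EXPORTS
#    define TORCH_API __declspec(dllexport)  // building the library: export the symbol
#  else
#    define TORCH_API __declspec(dllimport)  // consuming the library: import the symbol
#  endif
#else
#  define TORCH_API  // non-Windows builds export symbols by default, so expand to nothing
#endif

// Usage matches the declarations in this patch, e.g.:
// TORCH_API void FuseGraph(std::shared_ptr<Graph>& graph);
```

Declarations and their definitions must agree on the macro, which is why the patch annotates headers (and the definition of runJITCPPTests) rather than individual call sites.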