From 7b9d8916e5dde83bb982e0950617738cbfc25f21 Mon Sep 17 00:00:00 2001 From: Vishwak Srinivasan Date: Fri, 20 Jul 2018 09:12:55 -0700 Subject: [PATCH 01/23] Fix integral type dispatch error message (#9625) Summary: This fix will prevent errors like (found in `bincount`) ``` RuntimeError: %s not implemented for '%s'bincounttorch.FloatTensor ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/9625 Differential Revision: D8932945 Pulled By: soumith fbshipit-source-id: 794e3b58d662779402ab318e274661826a5db8b2 --- aten/src/ATen/Dispatch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 63b2d37258d60..6cd8722316297 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -43,7 +43,7 @@ AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ default: \ - AT_ERROR("%s not implemented for '%s'", (NAME), the_type.toString()); \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ } \ }() From f84fdc7866a566d2a1c31d5006b24c6d6e841846 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Fri, 20 Jul 2018 09:36:50 -0700 Subject: [PATCH 02/23] Remove unnecessary functions from StorageDerived.h Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9553 Reviewed By: ezyang Differential Revision: D8915526 Pulled By: cpuhrsch fbshipit-source-id: 32013d3aa58a1a68637f99ee619d06e27fadaad6 --- aten/src/ATen/Storage.h | 54 ++++++++----- aten/src/ATen/templates/StorageDerived.cpp | 93 ++-------------------- aten/src/ATen/templates/StorageDerived.h | 28 +------ aten/src/ATen/test/scalar_test.cpp | 4 - aten/src/TH/TH.h | 2 +- aten/src/TH/THAllocator.h | 2 +- aten/src/TH/THBlas.h | 2 +- aten/src/TH/THDiskFile.cpp | 2 +- aten/src/TH/THFilePrivate.h | 2 +- aten/src/TH/THGeneral.cpp | 2 +- aten/src/TH/THHalf.h | 2 +- aten/src/TH/THLapack.h | 2 +- aten/src/TH/THLogAdd.h | 2 +- aten/src/TH/THRandom.cpp | 2 +- aten/src/TH/THRandom.h | 2 +- aten/src/TH/THSize.h | 2 +- aten/src/TH/THStorageFunctions.h | 2 +- aten/src/TH/THVector.h | 2 +- aten/src/TH/vector/AVX.cpp | 2 +- aten/src/TH/vector/AVX.h | 2 +- aten/src/TH/vector/AVX2.h | 2 +- aten/src/THC/THCGeneral.h.in | 2 +- 22 files changed, 62 insertions(+), 153 deletions(-) diff --git a/aten/src/ATen/Storage.h b/aten/src/ATen/Storage.h index 582a82a0c0aad..3239eb9d54adf 100644 --- a/aten/src/ATen/Storage.h +++ b/aten/src/ATen/Storage.h @@ -1,6 +1,7 @@ #pragma once #include "ATen/Scalar.h" +#include namespace at { @@ -10,32 +11,43 @@ struct Storage { static const char RESIZABLE = 2; Storage() {} + Storage(THStorage* storage) + : storage(storage) {} Storage(const Storage& other) = delete; + virtual ~Storage() { + THStorage_free(storage); + } void operator=(const Storage&) = delete; - virtual ~Storage() {}; virtual size_t elementSize() const = 0; - virtual size_t size() const = 0; - virtual void* data() = 0; - virtual const void* data() const = 0; - virtual Storage& retain() = 0; - virtual Storage& free() = 0; - virtual void * unsafeGetTH(bool retain) const = 0; - - virtual Storage& resize(int64_t new_size) = 0; - + size_t size() const { + return storage->size; + }; + void* data() { + return storage->data_ptr.get(); + }; + const void* data() const { + return storage->data_ptr.get(); + }; + void* unsafeGetTH(bool retain_) const { + if (retain_) { + THStorage_retain(storage); + } + return storage; + } + void retain() { + THStorage_retain(storage); + } virtual 
Type & type() const = 0; - virtual int getDevice() const = 0; - virtual const char * toString() const = 0; - - virtual Storage& fill(Scalar value) = 0; - virtual Storage& set(size_t ind, Scalar value) = 0; - virtual Storage& fast_set(size_t ind, Scalar value) = 0; - virtual Scalar get(size_t ind) = 0; - virtual Scalar fast_get(size_t ind) = 0; - - virtual void set_flag(char flag) = 0; - virtual void clear_flag(char flag) = 0; + int getDevice() const { + return storage->data_ptr.device().index(); + } + void clear_flag(char flag) { + THStorage_clearFlag(storage, flag); + } + + protected: + THStorage *storage; }; } // namespace at diff --git a/aten/src/ATen/templates/StorageDerived.cpp b/aten/src/ATen/templates/StorageDerived.cpp index 83e10b9986d56..28e17e0d0c168 100644 --- a/aten/src/ATen/templates/StorageDerived.cpp +++ b/aten/src/ATen/templates/StorageDerived.cpp @@ -11,17 +11,16 @@ namespace at { ${Storage}::${Storage}(Context* context): - storage(${THStorage}_new(${state})), context(context) {} + Storage(${THStorage}_new(${state})), context(context) {} ${Storage}::${Storage}(Context* context, THStorage* storage): - storage(storage), context(context) {} + Storage(storage), context(context) {} ${Storage}::${Storage}(Context* context, size_t storage_size) - : storage(${THStorage}_newWithSize(${state,} storage_size)), context(context) {} + : Storage(${THStorage}_newWithSize(${state,} storage_size)), context(context) {} ${Storage}::${Storage}(Context* context, size_t size, Allocator* allocator) - : storage(nullptr), - context(context) { + : Storage(nullptr), context(context) { storage = ${THStorage}_newWithAllocator(${state,} size, allocator); ${THStorage}_clearFlag(${state,} storage, TH_STORAGE_RESIZABLE); } @@ -38,7 +37,7 @@ static int getPointerDevice(void* ptr) { ${Storage}::${Storage}(Context* context, void * data, size_t size, const std::function & deleter) - : storage(${THStorage}_newWithDataAndAllocator(${state,} + : Storage(${THStorage}_newWithDataAndAllocator(${state,} InefficientStdFunctionContext::makeDataPtr(data, deleter, #if ${isCUDA} Device(kCUDA, getPointerDevice(data)) @@ -47,98 +46,20 @@ static int getPointerDevice(void* ptr) { #endif ), size, /* allocator */ nullptr - )), - context(context) { + )), context(context) { ${THStorage}_clearFlag(${state,} storage, TH_STORAGE_RESIZABLE); } -${Storage}::~${Storage}() { - ${THStorage}_free(${state,} storage); -} +${Storage}::~${Storage}() { } size_t ${Storage}::elementSize() const { return sizeof(${ScalarType}); } -size_t ${Storage}::size() const { - return storage->size; -} - -void* ${Storage}::data() { - return storage->data_ptr.get(); -} - -const void* ${Storage}::data() const { - return storage->data_ptr.get(); -} - -auto ${Storage}::retain() -> ${Storage}& { - ${THStorage}_retain(${state,} storage); - return *this; -} - -auto ${Storage}::free() -> ${Storage}& { - ${THStorage}_free(${state,} storage); - return *this; -} - -void* ${Storage}::unsafeGetTH(bool retain) const { - if (retain) { - ${THStorage}_retain(${state,} storage); - } - return storage; -} - -auto ${Storage}::resize(int64_t new_size) -> ${Storage}& { - ${THStorage}_resize(${state,} storage, new_size); - return *this; -} - -auto ${Storage}::fill(Scalar value) -> ${Storage}& { - ${THStorage}_fill(${state,} storage, ${to_th_type}(value.to${ScalarName}())); - return *this; -} - -auto ${Storage}::set(size_t ind, Scalar value) -> ${Storage}& { - ${THStorage}_set(${state,} storage, ind, ${to_th_type}(value.to${ScalarName}())); - return *this; -} - -auto 
${Storage}::fast_set(size_t ind, Scalar value) -> ${Storage}& { - throw std::runtime_error("unsupported operation 'fast_set'"); -} - -auto ${Storage}::get(size_t ind) -> Scalar { - // static cast to fix long -> int64_t issues - return static_cast<${ScalarType}>(${to_at_type}(${THStorage}_get(${state,} storage, ind))); -} - -auto ${Storage}::fast_get(size_t ind) -> Scalar { - if(${isCUDA}) - throw std::runtime_error("unsupported operation 'fast_get'"); - return static_cast<${ScalarType}>(${to_at_type}(storage->unsafe_data<${THScalarType}>()[ind])); -} - -void ${Storage}::set_flag(char flag) { - ${THStorage}_setFlag(${state,} storage, flag); -} - -void ${Storage}::clear_flag(char flag) { - ${THStorage}_clearFlag(${state,} storage, flag); -} - -int ${Storage}::getDevice() const { - return storage->data_ptr.device().index(); -} - Type& ${Storage}::type() const { return context->getType(Backend::${Backend},ScalarType::${ScalarName}); } -const char * ${Storage}::toString() const { - return "${Storage}"; -} - const char * ${Storage}::typeString() { return "${Type}"; } diff --git a/aten/src/ATen/templates/StorageDerived.h b/aten/src/ATen/templates/StorageDerived.h index d97d397c8e7ca..8cfa8c1d01d3b 100644 --- a/aten/src/ATen/templates/StorageDerived.h +++ b/aten/src/ATen/templates/StorageDerived.h @@ -21,36 +21,16 @@ struct ${Storage} final : public Storage { ${Storage}(Context* context, size_t size, Allocator* allocator); ${Storage}(Context* context, void * data, size_t size, const std::function & deleter); - virtual ~${Storage}(); - - virtual size_t elementSize() const override; - virtual size_t size() const override; - virtual void* data() override; - virtual const void* data() const override; - virtual ${Storage}& retain() override; - virtual ${Storage}& free() override; - virtual void * unsafeGetTH(bool retain) const override; - - virtual ${Storage}& resize(int64_t new_size) override; - virtual ${Storage}& fill(Scalar value) override; - virtual ${Storage}& set(size_t ind, Scalar value) override; - virtual ${Storage}& fast_set(size_t ind, Scalar value) override; - virtual Scalar get(size_t ind) override; - virtual Scalar fast_get(size_t ind) override; - - virtual void set_flag(char flag) override; - virtual void clear_flag(char flag) override; - - virtual Type& type() const override; - virtual int getDevice() const override; - virtual const char * toString() const override; + ~${Storage}(); + size_t elementSize() const final; + + Type& type() const final; static const char * typeString(); protected: friend struct ${Type}; - THStorage *storage; Context* context; }; diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index ccdab08dc8208..2880004555a74 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -88,10 +88,6 @@ TEST_CASE( "scalar test", "[]" ) { auto t2 = zeros({4,4}, CUDAFloat); cout << &t2 << "\n"; cout << "AFTER GET TYPE " << &CUDAFloat << "\n"; - auto s = CUDAFloat.storage(4); - REQUIRE( s->get(3).toFloat() == 0.0 ); - s->fill(7); - REQUIRE( s->get(3).toFloat() == 7.0 ); } auto t = ones({4,4}); diff --git a/aten/src/TH/TH.h b/aten/src/TH/TH.h index 1faf6e52b5a46..46aa394063fbd 100644 --- a/aten/src/TH/TH.h +++ b/aten/src/TH/TH.h @@ -1,7 +1,7 @@ #ifndef TH_INC #define TH_INC -#include "THGeneral.h" +#include #include "THBlas.h" #ifdef USE_LAPACK diff --git a/aten/src/TH/THAllocator.h b/aten/src/TH/THAllocator.h index 460f23873fff0..578d689400baf 100644 --- a/aten/src/TH/THAllocator.h +++ 
b/aten/src/TH/THAllocator.h @@ -1,6 +1,6 @@ #pragma once -#include "THGeneral.h" +#include #ifdef __cplusplus #include diff --git a/aten/src/TH/THBlas.h b/aten/src/TH/THBlas.h index 5fef0febcd5e9..284cd7c42c12c 100644 --- a/aten/src/TH/THBlas.h +++ b/aten/src/TH/THBlas.h @@ -1,7 +1,7 @@ #ifndef TH_BLAS_INC #define TH_BLAS_INC -#include "THGeneral.h" +#include #define THBlas_(NAME) TH_CONCAT_4(TH,Real,Blas_,NAME) diff --git a/aten/src/TH/THDiskFile.cpp b/aten/src/TH/THDiskFile.cpp index 258ad2cbca6a4..a806df6ac62eb 100644 --- a/aten/src/TH/THDiskFile.cpp +++ b/aten/src/TH/THDiskFile.cpp @@ -1,4 +1,4 @@ -#include "THGeneral.h" +#include #include "THDiskFile.h" #include "THFilePrivate.h" diff --git a/aten/src/TH/THFilePrivate.h b/aten/src/TH/THFilePrivate.h index 93bbaa0287ad8..d493510523f3f 100644 --- a/aten/src/TH/THFilePrivate.h +++ b/aten/src/TH/THFilePrivate.h @@ -1,4 +1,4 @@ -#include "THGeneral.h" +#include #include "THHalf.h" diff --git a/aten/src/TH/THGeneral.cpp b/aten/src/TH/THGeneral.cpp index 1b89e17dce997..b7575b6c8add1 100644 --- a/aten/src/TH/THGeneral.cpp +++ b/aten/src/TH/THGeneral.cpp @@ -1,4 +1,4 @@ -#include "THGeneral.h" +#include #ifdef _OPENMP #include diff --git a/aten/src/TH/THHalf.h b/aten/src/TH/THHalf.h index 0f9807b50263f..5ff85eb2c8f40 100644 --- a/aten/src/TH/THHalf.h +++ b/aten/src/TH/THHalf.h @@ -1,7 +1,7 @@ #ifndef TH_HALF_H #define TH_HALF_H -#include "THGeneral.h" +#include #include /* Neither built-in nor included from Cutorch, use our definition lifted from CUDA */ diff --git a/aten/src/TH/THLapack.h b/aten/src/TH/THLapack.h index 614d15f940f77..614e4becfa65c 100644 --- a/aten/src/TH/THLapack.h +++ b/aten/src/TH/THLapack.h @@ -1,7 +1,7 @@ #ifndef TH_LAPACK_INC #define TH_LAPACK_INC -#include "THGeneral.h" +#include #define THLapack_(NAME) TH_CONCAT_4(TH,Real,Lapack_,NAME) diff --git a/aten/src/TH/THLogAdd.h b/aten/src/TH/THLogAdd.h index 9319b8f4643be..60acaedc14ea3 100644 --- a/aten/src/TH/THLogAdd.h +++ b/aten/src/TH/THLogAdd.h @@ -1,7 +1,7 @@ #ifndef TH_LOG_ADD_INC #define TH_LOG_ADD_INC -#include "THGeneral.h" +#include TH_API const double THLog2Pi; TH_API const double THLogZero; diff --git a/aten/src/TH/THRandom.cpp b/aten/src/TH/THRandom.cpp index 8755f774f1d79..6477b13fc46b3 100644 --- a/aten/src/TH/THRandom.cpp +++ b/aten/src/TH/THRandom.cpp @@ -1,4 +1,4 @@ -#include "THGeneral.h" +#include #include "THRandom.h" #include "THGenerator.hpp" diff --git a/aten/src/TH/THRandom.h b/aten/src/TH/THRandom.h index 5460d330d14a2..3641397635521 100644 --- a/aten/src/TH/THRandom.h +++ b/aten/src/TH/THRandom.h @@ -1,7 +1,7 @@ #ifndef TH_RANDOM_INC #define TH_RANDOM_INC -#include "THGeneral.h" +#include #define _MERSENNE_STATE_N 624 #define _MERSENNE_STATE_M 397 diff --git a/aten/src/TH/THSize.h b/aten/src/TH/THSize.h index 2927f21d51262..c190a310772cf 100644 --- a/aten/src/TH/THSize.h +++ b/aten/src/TH/THSize.h @@ -1,7 +1,7 @@ #ifndef TH_SIZE_INC #define TH_SIZE_INC -#include "THGeneral.h" +#include #include // THTensor functions that would work on a THSize if we had such a class in C++, diff --git a/aten/src/TH/THStorageFunctions.h b/aten/src/TH/THStorageFunctions.h index ce53827b9f6fc..edd1b6b80c93f 100644 --- a/aten/src/TH/THStorageFunctions.h +++ b/aten/src/TH/THStorageFunctions.h @@ -1,6 +1,6 @@ #pragma once -#include "THGeneral.h" +#include #include "THAllocator.h" #define THStorage_(NAME) TH_CONCAT_4(TH,Real,Storage_,NAME) diff --git a/aten/src/TH/THVector.h b/aten/src/TH/THVector.h index 8054f648e8a4c..e5684dd5be7b7 100644 --- a/aten/src/TH/THVector.h 
+++ b/aten/src/TH/THVector.h @@ -1,7 +1,7 @@ #ifndef TH_VECTOR_INC #define TH_VECTOR_INC -#include "THGeneral.h" +#include #include "THMath.h" #define THVector_(NAME) TH_CONCAT_4(TH,Real,Vector_,NAME) diff --git a/aten/src/TH/vector/AVX.cpp b/aten/src/TH/vector/AVX.cpp index b39b803c86c69..507eb2276ecf7 100644 --- a/aten/src/TH/vector/AVX.cpp +++ b/aten/src/TH/vector/AVX.cpp @@ -6,7 +6,7 @@ #endif #include "AVX.h" -#include "THGeneral.h" +#include void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n) { ptrdiff_t i; diff --git a/aten/src/TH/vector/AVX.h b/aten/src/TH/vector/AVX.h index 6fd183c6efd90..c4e47cbfcfc00 100644 --- a/aten/src/TH/vector/AVX.h +++ b/aten/src/TH/vector/AVX.h @@ -1,7 +1,7 @@ #ifndef TH_AVX_H #define TH_AVX_H -#include "THGeneral.h" +#include #include TH_API void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n); diff --git a/aten/src/TH/vector/AVX2.h b/aten/src/TH/vector/AVX2.h index 1c281d8c50e50..c78cc49d2a9ed 100644 --- a/aten/src/TH/vector/AVX2.h +++ b/aten/src/TH/vector/AVX2.h @@ -1,7 +1,7 @@ #ifndef TH_AVX2_H #define TH_AVX2_H -#include "THGeneral.h" +#include #include #include diff --git a/aten/src/THC/THCGeneral.h.in b/aten/src/THC/THCGeneral.h.in index 427591684ea1f..3ba4ed9719b5e 100644 --- a/aten/src/THC/THCGeneral.h.in +++ b/aten/src/THC/THCGeneral.h.in @@ -1,7 +1,7 @@ #ifndef THC_GENERAL_INC #define THC_GENERAL_INC -#include "THGeneral.h" +#include #include "THAllocator.h" #include "THCThreadLocal.h" #undef log10 From bbb30ad4ab86f06f90408fdba2d311088be7d55d Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Fri, 20 Jul 2018 09:41:47 -0700 Subject: [PATCH 03/23] Use THTensor/Storage for THVoidTensor/Storage (#9588) Summary: Change akin to change for THVoidStorage. Pull Request resolved: https://github.com/pytorch/pytorch/pull/9588 Reviewed By: gchanan Differential Revision: D8915559 Pulled By: cpuhrsch fbshipit-source-id: 6cc69df0e29942c62750f990903dfd8e4d344581 --- torch/csrc/Types.h | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/torch/csrc/Types.h b/torch/csrc/Types.h index 20e5ceec5bdc7..8342c7a94b0aa 100644 --- a/torch/csrc/Types.h +++ b/torch/csrc/Types.h @@ -12,18 +12,9 @@ template struct THPTypeInfo {}; namespace torch { -typedef THFloatStorage THVoidStorage; // all THXXXStorage types are the same. - -typedef struct THVoidTensor -{ - int64_t *size; - int64_t *stride; - int nDimension; - THVoidStorage *storage; - ptrdiff_t storageOffset; - int refcount; - char flag; -} THVoidTensor; +typedef THStorage THVoidStorage; + +typedef THTensor THVoidTensor; } // namespace torch From b9e89cf9fdf134affc184d36296bb6ac321aec91 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 20 Jul 2018 10:18:02 -0700 Subject: [PATCH 04/23] Revert "Extend DispatchStub to support CUDA dispatch (#9579)" (#9614) Summary: This reverts commit bcf0bf42a1727c8ee788f733c28579d0e36a387c. The commit was causing issues for some internal FB projects. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9614 Reviewed By: Yangqing Differential Revision: D8929552 Pulled By: colesbury fbshipit-source-id: ae9026ad8762a4c5de401273694b4c878fc241a6 --- .jenkins/pytorch/test.sh | 9 ++- aten/src/ATen/native/DispatchStub.cpp | 44 ---------- aten/src/ATen/native/ReduceOps.cpp | 11 +-- aten/src/ATen/native/SoftMax.cpp | 14 +--- aten/src/ATen/native/UnaryOps.cpp | 28 +------ .../CapabilityDispatch.h} | 80 ++++++++----------- aten/src/ATen/native/cpu/ReduceOpsKernel.h | 2 +- aten/src/ATen/native/cpu/SoftmaxKernel.h | 2 +- aten/src/ATen/native/cpu/UnaryOpsKernel.cpp | 2 +- aten/src/ATen/native/cpu/UnaryOpsKernel.h | 2 +- 10 files changed, 55 insertions(+), 139 deletions(-) delete mode 100644 aten/src/ATen/native/DispatchStub.cpp rename aten/src/ATen/native/{DispatchStub.h => cpu/CapabilityDispatch.h} (51%) diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 7e5b98ee628cd..e4c553dd4e652 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -44,10 +44,13 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)") fi +export ATEN_DISABLE_AVX= +export ATEN_DISABLE_AVX2= if [[ "${JOB_BASE_NAME}" == *-NO_AVX-* ]]; then - export ATEN_CPU_CAPABILITY=default -elif [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then - export ATEN_CPU_CAPABILITY=avx + export ATEN_DISABLE_AVX=1 +fi +if [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then + export ATEN_DISABLE_AVX2=1 fi test_python_nn() { diff --git a/aten/src/ATen/native/DispatchStub.cpp b/aten/src/ATen/native/DispatchStub.cpp deleted file mode 100644 index 662ae580c599a..0000000000000 --- a/aten/src/ATen/native/DispatchStub.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include "DispatchStub.h" - -#include - -#include -#include -#include - -namespace at { namespace native { - -static CPUCapability compute_cpu_capability() { - auto envar = std::getenv("ATEN_CPU_CAPABILITY"); - if (envar) { - if (strcmp(envar, "avx2") == 0) { - return CPUCapability::AVX2; - } - if (strcmp(envar, "avx") == 0) { - return CPUCapability::AVX; - } - if (strcmp(envar, "default") == 0) { - return CPUCapability::DEFAULT; - } - AT_WARN("ignoring invalid value for ATEN_CPU_CAPABILITY: ", envar); - } - -#ifndef __powerpc__ - if (cpuinfo_initialize()) { - if (cpuinfo_has_x86_avx2() && cpuinfo_has_x86_fma3()) { - return CPUCapability::AVX2; - } - if (cpuinfo_has_x86_avx()) { - return CPUCapability::AVX; - } - } -#endif - return CPUCapability::DEFAULT; -} - -CPUCapability get_cpu_capability() { - static CPUCapability capability = compute_cpu_capability(); - return capability; -} - -}} // namespace at::native diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index d055a91afa759..affa9d24059d9 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -17,9 +17,6 @@ namespace at { namespace native { -DispatchStub sum_kernel; -DispatchStub prod_kernel; - static inline Tensor integer_upcast(const Tensor& self, optional dtype) { ScalarType scalarType = self.type().scalarType(); ScalarType upcast_scalarType = dtype.value_or(at::isIntegralType(scalarType) ? 
ScalarType::Long : scalarType); @@ -130,7 +127,7 @@ Tensor sum(const Tensor &self) { Tensor _sum_cpu(const Tensor& self) { if (self.is_contiguous()) { Tensor result = at::empty({}, self.type()); - sum_kernel(kCPU, result, self, at::nullopt); + sum_kernel(result, self, at::nullopt); return result; } return self._sumall(); @@ -151,7 +148,7 @@ Tensor prod(const Tensor &self) { Tensor _prod_cpu(const Tensor &self) { if (self.is_contiguous()) { Tensor result = at::empty({}, self.type()); - prod_kernel(kCPU, result, self, at::nullopt); + prod_kernel(result, self, at::nullopt); return result; } return self._prodall(); @@ -225,7 +222,7 @@ Tensor &_sum_out_cpu(Tensor &result, const Tensor &self, int64_t dim_, return result; if (self.is_contiguous() && result.is_contiguous()) { _dimreduce_setup(result, self, dim); - sum_kernel(kCPU, result, self, dim); + sum_kernel(result, self, dim); if (!keepdim) result.squeeze_(dim); return result; } @@ -263,7 +260,7 @@ Tensor &_prod_out_cpu(Tensor &result, const Tensor &self, int64_t dim_, return result; if (self.is_contiguous() && result.is_contiguous()) { _dimreduce_setup(result, self, dim); - prod_kernel(kCPU, result, self, dim); + prod_kernel(result, self, dim); if (!keepdim) result.squeeze_(dim); return result; } diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index 56e0ab6ca6ba1..546c75829a6d5 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -128,7 +128,7 @@ Tensor softmax_cpu(const Tensor& input_, const int64_t dim_) { dim >= 0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); if (input.ndimension() > 0 && dim == input.ndimension() - 1) { - softmax_lastdim_kernel(kCPU, output, input); + softmax_lastdim_kernel(output, input); } else { AT_DISPATCH_FLOATING_TYPES(input.type(), "softmax", [&] { host_softmax(output, input, dim); @@ -147,7 +147,7 @@ Tensor log_softmax_cpu(const Tensor& input_, const int64_t dim_) { dim >= 0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); if (input.ndimension() > 0 && dim == input.ndimension() - 1) { - log_softmax_lastdim_kernel(kCPU, output, input); + log_softmax_lastdim_kernel(output, input); } else { AT_DISPATCH_FLOATING_TYPES(input.type(), "log_softmax", [&] { host_softmax(output, input, dim); @@ -176,7 +176,7 @@ Tensor softmax_backward_cpu( dim >= 0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { - softmax_backward_lastdim_kernel(kCPU, grad_input, grad, output); + softmax_backward_lastdim_kernel(grad_input, grad, output); } else { AT_DISPATCH_FLOATING_TYPES(grad.type(), "softmax_backward", [&] { host_softmax_backward(grad_input, grad, output, dim); @@ -205,7 +205,7 @@ Tensor log_softmax_backward_cpu( dim >= 0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { - log_softmax_backward_lastdim_kernel(kCPU, grad_input, grad, output); + log_softmax_backward_lastdim_kernel(grad_input, grad, output); } else { AT_DISPATCH_FLOATING_TYPES(grad.type(), "log_softmax_backward", [&] { host_softmax_backward(grad_input, grad, output, dim); @@ -213,11 +213,5 @@ Tensor log_softmax_backward_cpu( } return grad_input; } - -DispatchStub softmax_lastdim_kernel; -DispatchStub log_softmax_lastdim_kernel; -DispatchStub softmax_backward_lastdim_kernel; -DispatchStub log_softmax_backward_lastdim_kernel; - } } diff --git 
a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 17f5a437b0015..dbfc623b0ccba 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -92,14 +92,14 @@ Tensor& fill_(Tensor& self, const Tensor& value) { Tensor& _##op##__cpu(Tensor& self_) { \ if (self_.numel() > 0) { \ Tensor self = sort_strides(self_); \ - op##Impl(kCPU, self, self); \ + op##Impl(self, self); \ } \ return self_; \ } \ Tensor& _##op##_out_cpu(Tensor& result, const Tensor& self) { \ result.resize_(self.sizes()); \ if (result.numel() > 0) { \ - op##Impl(kCPU, result, self); \ + op##Impl(result, self); \ } \ return result; \ } @@ -145,29 +145,5 @@ IMPLEMENT_UNARY_OP_VEC(tan) IMPLEMENT_UNARY_OP_VEC(tanh) IMPLEMENT_UNARY_OP_VEC(trunc) -DispatchStub absImpl; -DispatchStub acosImpl; -DispatchStub asinImpl; -DispatchStub atanImpl; -DispatchStub ceilImpl; -DispatchStub cosImpl; -DispatchStub erfImpl; -DispatchStub erfcImpl; -DispatchStub expImpl; -DispatchStub expm1Impl; -DispatchStub floorImpl; -DispatchStub logImpl; -DispatchStub log10Impl; -DispatchStub log1pImpl; -DispatchStub log2Impl; -DispatchStub roundImpl; -DispatchStub rsqrtImpl; -DispatchStub sigmoidImpl; -DispatchStub sinImpl; -DispatchStub sqrtImpl; -DispatchStub tanImpl; -DispatchStub tanhImpl; -DispatchStub truncImpl; - } } // namespace at diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/cpu/CapabilityDispatch.h similarity index 51% rename from aten/src/ATen/native/DispatchStub.h rename to aten/src/ATen/native/cpu/CapabilityDispatch.h index bbdf07a8458bf..6cb0f279872d6 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/cpu/CapabilityDispatch.h @@ -1,8 +1,8 @@ #pragma once -#include -#include +#include #include +#include // Implements instruction set specific function dispatch. // @@ -23,82 +23,72 @@ // REGISTER_DISPATCH(stub, &kernel); // // To call: -// stub(kCPU, tensor); +// stub(tensor); // namespace at { namespace native { -enum class CPUCapability { - DEFAULT = 0, - AVX = 1, - AVX2 = 2, - NUM_OPTIONS -}; - -CPUCapability get_cpu_capability(); +enum class CPUCapability { DEFAULT, AVX, AVX2, NUM_OPTIONS }; template struct DispatchStub { static_assert(std::is_pointer::value, "FnPtr should be a pointer type"); template - void operator()(Backend backend, ArgTypes... args) { - if (backend == Backend::CPU) { - if (!dispatch_ptr) { - dispatch_ptr = choose_cpu_impl(); - } - (*dispatch_ptr)(args...); - } else if (backend == Backend::CUDA) { - AT_ASSERTM(cuda_dispatch_ptr, "DispatchStub: missing CUDA kernel"); - (*cuda_dispatch_ptr)(args...); - } else { - AT_ERROR("DispatchStub: unsupported backend", backend); + void operator()(ArgTypes... 
args) { + if (!dispatch_ptr) { + dispatch_ptr = choose_impl(); } + (*dispatch_ptr)(args...); } - FnPtr choose_cpu_impl() { - int def = static_cast(CPUCapability::DEFAULT); - int avx = static_cast(CPUCapability::AVX); - int avx2 = static_cast(CPUCapability::AVX2); - - auto capability = static_cast(get_cpu_capability()); - if (capability >= avx2 && table[avx2]) { - return table[avx2]; - } - if (capability >= avx && table[avx]) { - return table[avx]; + FnPtr choose_impl() { +// Do not use cpuinfo on PowerPC as it shows confusing errors when run on ppc +#ifndef __powerpc__ + if (cpuinfo_initialize()) { + int avx2 = static_cast(CPUCapability::AVX2); + if (!std::getenv("ATEN_DISABLE_AVX2") && cpuinfo_has_x86_avx2() && + cpuinfo_has_x86_fma3() && table[avx2]) { + return table[avx2]; + } + int avx = static_cast(CPUCapability::AVX); + if (!std::getenv("ATEN_DISABLE_AVX") && cpuinfo_has_x86_avx() && table[avx]) { + return table[avx]; + } } +#endif + int def = static_cast(CPUCapability::DEFAULT); AT_ASSERTM(table[def], "DispatchStub: missing default kernel"); return table[def]; } FnPtr dispatch_ptr = nullptr; - FnPtr cuda_dispatch_ptr = nullptr; FnPtr table[static_cast(CPUCapability::NUM_OPTIONS)]; }; -#if defined(CPU_CAPABILITY) || defined(__CUDACC__) +#if defined(CPU_CAPABILITY) -namespace { +constexpr CPUCapability CURRENT_CAPABILITY = CPUCapability::CPU_CAPABILITY; -template +// Registers an implementation a kernel for the current CPU capability. +template struct RegisterDispatch { RegisterDispatch(DispatchStub& stub, FnPtr value) { -#if defined(__CUDACC__) - stub.cuda_dispatch_ptr = value; -#else - int cap = static_cast(CPUCapability::CPU_CAPABILITY); - AT_ASSERT(!stub.table[cap]) - stub.table[cap] = value; -#endif + stub.table[static_cast(CURRENT_CAPABILITY)] = value; } }; -} // anonymous namespace +// We only define the stub once in the DEFAULT capability compilation +#if defined(CPU_CAPABILITY_DEFAULT) +#define _DEFINE_STUB(stub, fn) DispatchStub stub +#else +#define _DEFINE_STUB(stub, fn) +#endif #define REGISTER_DISPATCH(stub, fn) \ + _DEFINE_STUB(stub, fn); \ static RegisterDispatch stub ## __register(stub, fn); #endif diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.h b/aten/src/ATen/native/cpu/ReduceOpsKernel.h index 5a7854d0094cd..9481b90fe7696 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.h +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.h @@ -1,8 +1,8 @@ #pragma once #include -#include #include +#include "CapabilityDispatch.h" namespace at { namespace native { diff --git a/aten/src/ATen/native/cpu/SoftmaxKernel.h b/aten/src/ATen/native/cpu/SoftmaxKernel.h index 39d7e68a8b20c..dbd703b6d3c02 100644 --- a/aten/src/ATen/native/cpu/SoftmaxKernel.h +++ b/aten/src/ATen/native/cpu/SoftmaxKernel.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include "CapabilityDispatch.h" namespace at { namespace native { diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 459838a9b6c68..7416923cfd886 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -4,7 +4,7 @@ #include "ATen/Dispatch.h" #include "ATen/cpu/vml.h" #include "ATen/CPUApplyUtils.h" -#include "ATen/native/DispatchStub.h" +#include "ATen/native/cpu/CapabilityDispatch.h" #ifdef __AVX2__ #include "ATen/native/cpu/avx_mathfun.h" #endif diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.h b/aten/src/ATen/native/cpu/UnaryOpsKernel.h index d4845760f7248..d9bffadd1e1fb 100644 --- 
a/aten/src/ATen/native/cpu/UnaryOpsKernel.h +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.h @@ -1,8 +1,8 @@ #pragma once #include -#include #include +#include "CapabilityDispatch.h" namespace at { namespace native { From 1d4d9fc7da4c09c64fa276a47032291e3d0f77b6 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Fri, 20 Jul 2018 10:49:39 -0700 Subject: [PATCH 05/23] Prepare to stop using attributes in the JIT (#9505) Summary: This PR adds machinery to cache the schema in an IR node, and allows lookups of (possibly) constant inputs by their names (instead of position). The new methods are: - `at::optional get(Symbol name)` - if the argument called name is a constant, then casts it to type `T` and returns it. If it's not constant returns `nullopt`. Raises an error if there's no argument with that name. - `at::optional get(Symbol name)` - like above, but packs the result in an IValue - `Value* getValue(Symbol name)` - retrieves a `Value*` for an argument (no need to know its position). All above functions currently inspect the attributes as well, but that's only so that I could start using them in other places in the JIT without disrupting our current functionality. I wanted this diff to be a preparation that doesn't change the semantics too much, and so both the tracer and script create nodes with attributes. The next PR will put that to a stop, and hopefully the changes we need to make to other components will be simpler thanks to what I did here. One more thing I'd like to do before actually stopping creating the non-attributed nodes is to have a convenient way of creating a schema programmatically, matching nodes against it, and creating them without having to pack inputs into flat argument lists (which is quite error prone). zdevito Pull Request resolved: https://github.com/pytorch/pytorch/pull/9505 Reviewed By: ezyang Differential Revision: D8915496 Pulled By: apaszke fbshipit-source-id: 39d14fc9a9d73d8494f128367bf70357dbba83f5 --- test/expect/TestJit.test_alexnet.expect | 37 ++--- test/expect/TestJit.test_trace_size.expect | 16 +- .../TestJit.test_trace_size_with_grad.expect | 16 +- tools/jit/gen_jit_dispatch.py | 1 + torch/csrc/jit/autodiff.cpp | 53 ++++--- torch/csrc/jit/function_schema.h | 2 +- torch/csrc/jit/fusion_compiler.cpp | 24 +-- torch/csrc/jit/init.cpp | 2 + torch/csrc/jit/ir.cpp | 145 ++++++++++++++++++ torch/csrc/jit/ir.h | 48 +++++- torch/csrc/jit/passes/batch_mm.cpp | 10 +- .../common_subexpression_elimination.cpp | 8 +- torch/csrc/jit/passes/decompose_addmm.cpp | 11 +- torch/csrc/jit/passes/graph_fuser.cpp | 11 +- torch/csrc/jit/passes/peephole.cpp | 45 +++--- torch/csrc/jit/passes/remove_expands.cpp | 2 +- torch/csrc/jit/passes/shape_analysis.cpp | 31 ++-- torch/csrc/jit/symbolic_variable.h | 17 ++ torch/csrc/jit/tensor_conversions.h | 8 +- torch/csrc/jit/tracer.cpp | 5 +- torch/onnx/symbolic.py | 4 +- 21 files changed, 358 insertions(+), 138 deletions(-) diff --git a/test/expect/TestJit.test_alexnet.expect b/test/expect/TestJit.test_alexnet.expect index 0d3a5f53fd376..3c71802b9ff26 100644 --- a/test/expect/TestJit.test_alexnet.expect +++ b/test/expect/TestJit.test_alexnet.expect @@ -28,22 +28,23 @@ graph(%0 : Double(1, 3, 224, 224) %29 : Double(1, 256, 13, 13) = aten::_convolution[stride=[1, 1], padding=[1, 1], dilation=[1, 1], transposed=0, output_padding=[0, 0], groups=1, benchmark=0, deterministic=0, cudnn_enabled=1](%28, %9, %10), scope: AlexNet/Sequential[features]/Conv2d[10] %30 : Double(1, 256, 13, 13) = aten::threshold[threshold={0}, value={0}](%29), scope: 
AlexNet/Sequential[features]/ReLU[11] %31 : Double(1, 256, 6, 6), %32 : Long(1, 256, 6, 6) = aten::max_pool2d_with_indices[kernel_size=[3, 3], stride=[2, 2], padding=[0, 0], dilation=[1, 1], ceil_mode=0](%30), scope: AlexNet/Sequential[features]/MaxPool2d[12] - %33 : Long() = aten::size[dim=0](%31), scope: AlexNet - %34 : Long() = prim::Constant[value={9216}](), scope: AlexNet - %35 : Dynamic = aten::stack[dim=0](%33, %34), scope: AlexNet - %36 : Double(1, 9216) = aten::view(%31, %35), scope: AlexNet - %37 : Double(1, 9216) = ^Dropout(0.5, True, False)(%36), scope: AlexNet/Sequential[classifier]/Dropout[0] - %38 : Double(9216!, 4096!) = aten::t(%11), scope: AlexNet/Sequential[classifier]/Linear[1] - %39 : Double(1, 4096) = aten::expand[size=[1, 4096], implicit=1](%12), scope: AlexNet/Sequential[classifier]/Linear[1] - %40 : Double(1, 4096) = aten::addmm[beta={1}, alpha={1}](%39, %37, %38), scope: AlexNet/Sequential[classifier]/Linear[1] - %41 : Double(1, 4096) = aten::threshold[threshold={0}, value={0}](%40), scope: AlexNet/Sequential[classifier]/ReLU[2] - %42 : Double(1, 4096) = ^Dropout(0.5, True, False)(%41), scope: AlexNet/Sequential[classifier]/Dropout[3] - %43 : Double(4096!, 4096!) = aten::t(%13), scope: AlexNet/Sequential[classifier]/Linear[4] - %44 : Double(1, 4096) = aten::expand[size=[1, 4096], implicit=1](%14), scope: AlexNet/Sequential[classifier]/Linear[4] - %45 : Double(1, 4096) = aten::addmm[beta={1}, alpha={1}](%44, %42, %43), scope: AlexNet/Sequential[classifier]/Linear[4] - %46 : Double(1, 4096) = aten::threshold[threshold={0}, value={0}](%45), scope: AlexNet/Sequential[classifier]/ReLU[5] - %47 : Double(4096!, 1000!) = aten::t(%15), scope: AlexNet/Sequential[classifier]/Linear[6] - %48 : Double(1, 1000) = aten::expand[size=[1, 1000], implicit=1](%16), scope: AlexNet/Sequential[classifier]/Linear[6] - %49 : Double(1, 1000) = aten::addmm[beta={1}, alpha={1}](%48, %46, %47), scope: AlexNet/Sequential[classifier]/Linear[6] - return (%49); + %33 : Long() = prim::Constant[value={0}](), scope: AlexNet + %34 : Long() = aten::size(%31, %33), scope: AlexNet + %35 : Long() = prim::Constant[value={9216}](), scope: AlexNet + %36 : Dynamic = aten::stack[dim=0](%34, %35), scope: AlexNet + %37 : Double(1, 9216) = aten::view(%31, %36), scope: AlexNet + %38 : Double(1, 9216) = ^Dropout(0.5, True, False)(%37), scope: AlexNet/Sequential[classifier]/Dropout[0] + %39 : Double(9216!, 4096!) = aten::t(%11), scope: AlexNet/Sequential[classifier]/Linear[1] + %40 : Double(1, 4096) = aten::expand[size=[1, 4096], implicit=1](%12), scope: AlexNet/Sequential[classifier]/Linear[1] + %41 : Double(1, 4096) = aten::addmm[beta={1}, alpha={1}](%40, %38, %39), scope: AlexNet/Sequential[classifier]/Linear[1] + %42 : Double(1, 4096) = aten::threshold[threshold={0}, value={0}](%41), scope: AlexNet/Sequential[classifier]/ReLU[2] + %43 : Double(1, 4096) = ^Dropout(0.5, True, False)(%42), scope: AlexNet/Sequential[classifier]/Dropout[3] + %44 : Double(4096!, 4096!) = aten::t(%13), scope: AlexNet/Sequential[classifier]/Linear[4] + %45 : Double(1, 4096) = aten::expand[size=[1, 4096], implicit=1](%14), scope: AlexNet/Sequential[classifier]/Linear[4] + %46 : Double(1, 4096) = aten::addmm[beta={1}, alpha={1}](%45, %43, %44), scope: AlexNet/Sequential[classifier]/Linear[4] + %47 : Double(1, 4096) = aten::threshold[threshold={0}, value={0}](%46), scope: AlexNet/Sequential[classifier]/ReLU[5] + %48 : Double(4096!, 1000!) 
= aten::t(%15), scope: AlexNet/Sequential[classifier]/Linear[6] + %49 : Double(1, 1000) = aten::expand[size=[1, 1000], implicit=1](%16), scope: AlexNet/Sequential[classifier]/Linear[6] + %50 : Double(1, 1000) = aten::addmm[beta={1}, alpha={1}](%49, %47, %48), scope: AlexNet/Sequential[classifier]/Linear[6] + return (%50); } diff --git a/test/expect/TestJit.test_trace_size.expect b/test/expect/TestJit.test_trace_size.expect index 153104589ca5b..1c6fdcd6eba00 100644 --- a/test/expect/TestJit.test_trace_size.expect +++ b/test/expect/TestJit.test_trace_size.expect @@ -1,9 +1,11 @@ graph(%0 : Double(5, 2, 4)) { - %1 : Long() = aten::size[dim=1](%0) - %2 : Long() = aten::mul[other={2}](%1) - %3 : Long() = aten::size[dim=0](%0) - %4 : Long() = prim::Constant[value={2}]() - %5 : Dynamic = aten::stack[dim=0](%2, %3, %4) - %6 : Double(4, 5, 2) = aten::view(%0, %5) - return (%6); + %1 : Long() = prim::Constant[value={1}]() + %2 : Long() = aten::size(%0, %1) + %3 : Long() = aten::mul[other={2}](%2) + %4 : Long() = prim::Constant[value={0}]() + %5 : Long() = aten::size(%0, %4) + %6 : Long() = prim::Constant[value={2}]() + %7 : Dynamic = aten::stack[dim=0](%3, %5, %6) + %8 : Double(4, 5, 2) = aten::view(%0, %7) + return (%8); } diff --git a/test/expect/TestJit.test_trace_size_with_grad.expect b/test/expect/TestJit.test_trace_size_with_grad.expect index 153104589ca5b..1c6fdcd6eba00 100644 --- a/test/expect/TestJit.test_trace_size_with_grad.expect +++ b/test/expect/TestJit.test_trace_size_with_grad.expect @@ -1,9 +1,11 @@ graph(%0 : Double(5, 2, 4)) { - %1 : Long() = aten::size[dim=1](%0) - %2 : Long() = aten::mul[other={2}](%1) - %3 : Long() = aten::size[dim=0](%0) - %4 : Long() = prim::Constant[value={2}]() - %5 : Dynamic = aten::stack[dim=0](%2, %3, %4) - %6 : Double(4, 5, 2) = aten::view(%0, %5) - return (%6); + %1 : Long() = prim::Constant[value={1}]() + %2 : Long() = aten::size(%0, %1) + %3 : Long() = aten::mul[other={2}](%2) + %4 : Long() = prim::Constant[value={0}]() + %5 : Long() = aten::size(%0, %4) + %6 : Long() = prim::Constant[value={2}]() + %7 : Dynamic = aten::stack[dim=0](%3, %5, %6) + %8 : Double(4, 5, 2) = aten::view(%0, %7) + return (%8); } diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index d6458f9c2337e..75a14b2d5550f 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -127,6 +127,7 @@ def from_tensor(arg): auto result = torch::${name}(${args}, options); """) +# TODO (apaszke): remove the attributed codepath once we remove them CONSTRUCTOR = CodeTemplate("""\ [](Node *node) { ${kw_assignments} diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index fdeb0ef13a8c3..1b7784916910e 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -11,8 +11,10 @@ namespace torch { namespace jit { using value_map = std::unordered_map; using value_set = std::unordered_set; -bool hasOneValuedAttribute(Node *n, torch::jit::Symbol name) { - return n->hasAttribute(name) && at::Scalar(n->t(name)).toDouble() == 1.0; +bool hasOneValuedInput(Node *n, torch::jit::Symbol name) { + auto maybe_t = n->get(name); + if (!maybe_t) return false; + return at::Scalar(*maybe_t).toDouble() == 1.0; } bool isDifferentiable(Node * n) { @@ -28,7 +30,7 @@ bool isDifferentiable(Node * n) { if (n->kind() == aten::addmm) { if (n->inputs().size() > 3) return false; - if (!hasOneValuedAttribute(n, attr::alpha) || !hasOneValuedAttribute(n, attr::beta)) + if (!hasOneValuedInput(n, attr::alpha) || !hasOneValuedInput(n, attr::beta)) 
return false; } if (n->kind() == aten::type_as && !n->inputs().at(1)->isTensor()) { @@ -65,7 +67,8 @@ bool outputRequiresGrad(Node* node, std::function requires_grad) { case aten::eq: return false; case aten::type_as: - //type_as has two inputs, the second of which (setting type) might require grad, but it still won't affect the output of type_as requiring grad. + // type_as has two inputs, the second of which (setting type) might require grad, + // but it still won't affect the output of type_as requiring grad. return requires_grad(node->inputs().at(0)); default: return std::any_of(node->inputs().begin(), node->inputs().end(), requires_grad); @@ -80,23 +83,30 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val auto outputs = fmap(node->outputs()); switch(node->kind()) { case aten::add: - // o = a - alpha*other - if(inputs.size() == 1) + // TODO (apaszke): remove formulas for attributed nodes once they are removed + // o = self + alpha*other + if(inputs.size() == 1) { return { grads.at(0) }; - // o = a + alpha*b - return {grads.at(0), grads.at(0) * at::Scalar(node->t(attr::alpha)) }; + } else if (node->hasAttribute(attr::alpha)) { + return {grads.at(0), grads.at(0) * at::Scalar(node->t(attr::alpha))}; + } else { + return {grads.at(0), nullptr, grads.at(0) * node->getValue(attr::alpha)}; + } case aten::sub: - // o = a - alpha*other - if(inputs.size() == 1) + // o = self - alpha*other + if(inputs.size() == 1) { return {grads.at(0)}; - // o = a - alpha*b - return {grads.at(0), -grads.at(0) * at::Scalar(node->t(attr::alpha))}; + } else if (node->hasAttribute(attr::alpha)) { + return {grads.at(0), -grads.at(0) * at::Scalar(node->t(attr::alpha))}; + } else { + return {grads.at(0), nullptr, grads.at(0) * node->getValue(attr::alpha)}; + } case aten::mul: - // o = a * other + // o = self * other if(inputs.size() == 1) return {grads.at(0) * at::Scalar(node->t(attr::other))}; - // o = a * b - return {grads.at(0) * inputs.at(1), grads.at(0) * inputs.at(0)}; + else + return {grads.at(0) * inputs.at(1), grads.at(0) * inputs.at(0)}; case prim::Constant: return {}; case aten::sigmoid: @@ -109,17 +119,18 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val return {grads.at(0) * (outputs.at(0))}; case aten::chunk: case aten::split: - return {SymbolicVariable::cat(grads, node->i(attr::dim))}; + return {SymbolicVariable::cat(grads, node->getValue(attr::dim))}; case aten::t: return {grads.at(0).t()}; case aten::neg: return {-grads.at(0)}; case aten::view: + // TODO: if sizes are not available statically, add an operator that reutrns them as a tuple return {grads.at(0).view(inputs.at(0).sizes())}; case aten::type_as: return {grads.at(0).type_as(inputs.at(0))}; case aten::unsqueeze: - return {grads.at(0).squeeze(node->i(attr::dim))}; + return {grads.at(0).squeeze(node->getValue(attr::dim))}; case aten::mm: { SymbolicVariable dmat1, dmat2; if (auto type = inputs.at(0).value()->type()->cast()) { @@ -148,7 +159,7 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val const auto& input_sizes = inputs.at(0).sizes(); if (input_sizes.size() == 0) return {grads.at(0).sum()}; - auto grad_sizes = node->is(attr::size); + auto grad_sizes = node->get>(attr::size).value(); auto grad = grads.at(0); while (grad_sizes.size() > input_sizes.size()) { grad = grad.sum(0, false); @@ -163,6 +174,7 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val } case aten::squeeze: { const auto& sizes = inputs.at(0).sizes(); + // TODO (apaszke): need to select the right overload here if 
(node->hasAttribute(attr::dim)) { int dim = node->i(attr::dim); return {sizes.at(dim) > 1 ? grads.at(0) : grads.at(0).unsqueeze(dim)}; @@ -179,11 +191,12 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val } } case aten::cat: { - int dim = node->i(attr::dim); + int dim = node->get(attr::dim).value(); const auto& first_sizes = inputs.at(0).sizes(); const auto has_first_sizes = [&first_sizes](SymbolicVariable var) { return var.sizes() == first_sizes; }; + // TODO (apaszke): This will need an adjustment for the dim argument // NB: this is a specialization for the common case where all inputs are // of equal sizes. We can use a single split operation to handle that. if (std::all_of(inputs.begin(), inputs.end(), has_first_sizes)) { @@ -339,6 +352,8 @@ static ReverseDetails addReverseInline(Gradient& grad_desc, value_list grad_inputs = linearGradientForNode(node, fmap(node->outputs(), get_grad)); JIT_ASSERT(grad_inputs.size() == node->inputs().size()); for (size_t i = 0, num_inputs = grad_inputs.size(); i < num_inputs; ++i) { + if (!requires_grad(inputs[i])) continue; + JIT_ASSERT(grad_inputs[i]); set_grad(inputs[i], grad_inputs[i]); } } diff --git a/torch/csrc/jit/function_schema.h b/torch/csrc/jit/function_schema.h index 13c81dc296cf5..99222e88af25f 100644 --- a/torch/csrc/jit/function_schema.h +++ b/torch/csrc/jit/function_schema.h @@ -1,6 +1,6 @@ #pragma once #include "ATen/ATen.h" -#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/type.h" namespace torch { namespace jit { diff --git a/torch/csrc/jit/fusion_compiler.cpp b/torch/csrc/jit/fusion_compiler.cpp index 78087f8d3118c..889a8c3fcfacc 100644 --- a/torch/csrc/jit/fusion_compiler.cpp +++ b/torch/csrc/jit/fusion_compiler.cpp @@ -3,6 +3,7 @@ #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/code_template.h" #include "torch/csrc/jit/resource_guard.h" +#include "torch/csrc/jit/tensor_conversions.h" #include "torch/csrc/utils/disallow_copy.h" #include "torch/csrc/variable_tensor_functions.h" @@ -125,9 +126,9 @@ void ${kernelName}(IndexType totalElements, void ** args) { // This snippet enables half support in the jit. Following the pattern for // reductions, fp16 input data is immediately upconverted to float // with __half2float(). All mathematical operations are done on float -// values, and if needed the intermediate float representation is +// values, and if needed the intermediate float representation is // converted to half with __float2half() when writing to a half tensor. 
-constexpr auto half_support_literal = R"( +constexpr auto half_support_literal = R"( #define __HALF_TO_US(var) *(reinterpret_cast(&(var))) #define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) #if defined(__cplusplus) @@ -169,7 +170,7 @@ size_t ${tensor}_dimIndex${d} = ${tensor}_linearIndex ${mod_sizes}; ${tensor}_offset += ${tensor}_dimIndex${d} ${times_stride}; )"); -void emitIndexingFor(std::ostream & out, const std::string & tensor, int ndim, bool last_is_cont) { +static void emitIndexingFor(std::ostream & out, const std::string & tensor, int ndim, bool last_is_cont) { TemplateEnv env; env.s("tensor",tensor); out << format("IndexType ${tensor}_offset = 0;\n",env); @@ -186,14 +187,14 @@ void emitIndexingFor(std::ostream & out, const std::string & tensor, int ndim, b } } -std::string valueName(Value * n) { +static std::string valueName(Value * n) { return "n" + std::to_string(n->unique()); } -std::string scalarValue(const at::Tensor & t) { +static std::string scalarValue(const at::Tensor & t) { auto s = at::Scalar(t); - if (s.isIntegral()){ - return std::to_string(s.toLong()); + if (s.isIntegral()){ + return std::to_string(s.toLong()); } else { std::ostringstream out; out << std::scientific << s.toDouble() << "f"; @@ -201,7 +202,7 @@ std::string scalarValue(const at::Tensor & t) { } } -const char * scalarTypeName(at::ScalarType type) { +static const char * scalarTypeName(at::ScalarType type) { if (type == at::ScalarType::Half) { return "half"; } @@ -291,8 +292,9 @@ std::string encodeRHS(Node * n) { TemplateEnv env; size_t i = 0; for(auto in : n->inputs()) { - env.s(std::to_string(i++),valueName(in)); + env.s(std::to_string(i++), valueName(in)); } + // TODO (apaszke): remove once we get rid of attributes // ops like div have a / b or a / 2 with the constant having the attribute other // so we add other as an input if it is present // 'pow' is the same but uses exponent as the attribute, so we handle that here as well @@ -356,7 +358,7 @@ std::vector emitCompilationUnit(std::ostream & out, } else { auto cat = o->node(); size_t nInputs = cat->inputs().size(); - concat_desc.emplace_back(desc, nInputs, cat->i(attr::dim)); + concat_desc.emplace_back(desc, nInputs, cat->get(attr::dim).value()); for(auto c : cat->inputs()) { emitFormal(c, *concat_desc.back().subtensorDesc); flat_output_nodes.push_back(c); @@ -381,7 +383,7 @@ std::vector emitCompilationUnit(std::ostream & out, } else { env.s("access", format("t${formal}.data[t${formal}_offset]", env)); } - + //TODO: actual type propagation rather than relying on auto.. 
body << format("auto ${node} = ${access};\n",env); } diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index e1b9ac512cdb7..527a53795fb90 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -6,6 +6,7 @@ #include "torch/csrc/jit/python_arg_flatten.h" #include "torch/csrc/jit/export.h" #include "torch/csrc/jit/argument_spec.h" +#include "torch/csrc/jit/passes/remove_expands.h" #include "torch/csrc/jit/passes/graph_fuser.h" #include "torch/csrc/jit/passes/onnx.h" #include "torch/csrc/jit/passes/dead_code_elimination.h" @@ -68,6 +69,7 @@ void initJITBindings(PyObject *module) { auto tensor_inputs = createVariableTensorList(inputs); PropagateInputShapes(graph, ArgumentSpec(with_grad, tensor_inputs)); }) + .def("_jit_pass_remove_expands", RemoveExpands) .def("_jit_pass_erase_number_types", EraseNumberTypes) .def("_jit_pass_loop_unrolling", UnrollLoops) .def("_jit_run_cpp_tests", [] { diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index a340ddec6fc23..39de66680b9ca 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -1,5 +1,7 @@ #include "ir.h" +#include "torch/csrc/jit/tensor_conversions.h" +#include "torch/csrc/jit/operator.h" #include "torch/csrc/autograd/function.h" #include @@ -566,6 +568,149 @@ Value* Value::setUniqueName(const std::string & name) { return this; } +template +Value* Graph::insertConstant(T value) { + Node *n = create(prim::Constant); + insertNode(n); + auto t_value = as_tensor(value); + n->t_(attr::value, t_value.clone()); + n->output()->inferTypeFrom(t_value); + return n->output(); +} + +// This is necessary, because integral literals are of type int by default, +// and will dispatch to this function. +template<> +Value * Graph::insertConstant(int value) { + return insertConstant(static_cast(value)); +} + +template Value* Graph::insertConstant(int64_t value); +template Value* Graph::insertConstant(double value); +template Value* Graph::insertConstant(at::Tensor value); +template Value* Graph::insertConstant(at::IntList value); +template Value* Graph::insertConstant(at::Scalar value); + +namespace { + +// Of course any sane person would define this thing as a templated function, but +// it so happens that clang 3.8 has a pretty annoying bug which makes it complain that +// specializations are redefinitions of themselves, and so here we are. +template +struct getattr {}; + +template<> +struct getattr { + int64_t operator()(Node *n, Symbol name) { + return n->i(name); + } +}; + +template<> +struct getattr { + double operator()(Node *n, Symbol name) { + return n->f(name); + } +}; + +template<> +struct getattr { + at::Tensor operator()(Node *n, Symbol name) { + return n->t(name); + } +}; + +template<> +struct getattr> { + std::vector operator()(Node *n, Symbol name) { + return n->is(name); + } +}; + +} // anonymous namespace + +template +at::optional Node::get(Symbol name) { + // TODO (apaszke): remove. this is in here for now just so that we can ensure + // we always use this in places where the node has a valid schema already + // (will make next commits easier). + if (!schema_) findSchema(); + // TODO (apaszke): remove once tracer and compiler stop emitting attributes + if (hasAttributes()) { + // If it has an attribute, then it is a constant. If it's missing, it means we're + // doing an invalid lookup and it should throw anyway. 
+ return getattr()(this, name); + } + auto inp = findInput(name); + Node *producer = inp.first->node(); + if (producer->kind() != prim::Constant) return at::nullopt; + auto value = producer->t(attr::value); + return tensor_as(std::move(value)); +} + +template at::optional Node::get(Symbol name); +template at::optional Node::get(Symbol name); +template at::optional Node::get(Symbol name); +template at::optional> Node::get(Symbol name); + +at::optional Node::get(Symbol name) { + // TODO (apaszke): remove once tracer and compiler stop emitting attributes + if (hasAttributes()) { + throw std::runtime_error("IValue Node::get() not implemented for the attribute case"); + } + auto inp = findInput(name); + Node * producer = inp.first->node(); + if (producer->kind() != prim::Constant) return at::nullopt; + auto value = producer->t(attr::value); + const Argument & arg = inp.second; + if (arg.type->isSubtypeOf(*DynamicType::get())) { + return IValue{std::move(value)}; + } else if (arg.type->isSubtypeOf(*IntType::get())) { + return IValue{tensor_as(std::move(value))}; + } else if (arg.type->isSubtypeOf(*FloatType::get())) { + return IValue{tensor_as(std::move(value))}; + } + throw std::runtime_error("Unsupported case in Node::get! File a bug report."); +} + +Value* Node::getValue(Symbol name) { + // TODO (apaszke): remove once tracer and compiler stop emitting attributes + if (hasAttribute(name)) { + switch (kindOf(name)) { + case AttributeKind::i: + return owningGraph()->insertConstant(i(name)); + case AttributeKind::is: + return owningGraph()->insertConstant(is(name)); + case AttributeKind::t: + return owningGraph()->insertConstant(t(name)); + default: + throw std::runtime_error("getValue() NYI"); + } + } + return findInput(name).first; +} + +std::pair Node::findInput(Symbol name) { + if (!schema_) { + findSchema(); + } + auto name_str = name.toUnqualString(); + size_t input_i = 0; + for (size_t i = 0; i < schema_->arguments.size(); ++i) { + const auto & arg = schema_->arguments[i]; + if (hasAttributeS(arg.name)) continue; + if (arg.name == name_str) { + return std::pair(input(input_i), arg); + } + input_i++; + } + throw std::runtime_error(std::string("Couldn't find an argument called ") + name.toQualString()); +} + +void Node::findSchema() { + schema_ = &getOperatorFor(this).schema; +} + PythonOp* defaultAllocPythonOp(Graph*g) { throw std::runtime_error("Trying to allocate a Python object without python bindings loaded"); } diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 2b555029f16b5..3595b83aefda6 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -6,6 +6,8 @@ #include "torch/csrc/jit/interned_strings.h" #include "torch/csrc/jit/resource_guard.h" #include "torch/csrc/jit/source_location.h" +#include "torch/csrc/jit/function_schema.h" +#include "torch/csrc/jit/ivalue.h" #include "torch/csrc/jit/type.h" #include "torch/csrc/utils/disallow_copy.h" @@ -176,11 +178,7 @@ struct Value { std::string unique_name_; TypePtr type_; public: - Value* setType(const TypePtr type) { - JIT_ASSERT(type); - type_ = type; - return this; - } + Value* setType(const TypePtr type); void inferTypeFrom(const at::Tensor& output) { setType(std::make_shared(output)); } @@ -287,6 +285,11 @@ struct Node : public Attributes { std::shared_ptr source_location_; size_t stage_; Scope* scope_; + // Assumes FunctionSchemas are persistent, so we don't manage their lifetime. 
+ // This field is effective a cache that's populated on attribute lookups and + // invalidated every time we perform an operation that could potentially change + // the schema. + const FunctionSchema* schema_; protected: Node(Graph * graph_, NodeKind kind_); //defined after graph public: @@ -370,7 +373,7 @@ struct Node : public Attributes { outputs()[i]->replaceAllUsesWith(n->outputs()[i]); } } - // lots of things like chunk have a single input or singel output, so we have a + // lots of things like chunk have a single input or single output, so we have a // helper to make accessing it easier Value * input() { JIT_ASSERT(inputs_.size() == 1); @@ -392,6 +395,11 @@ struct Node : public Attributes { return inputs_.at(i); } + template + at::optional get(Symbol name); + at::optional get(Symbol name); + Value* getValue(Symbol name); + // Graphs // Note [Topological invariant] @@ -414,6 +422,7 @@ struct Node : public Attributes { // Result: %3 = f(%1, %2, %4) Value* addInput(Value * node) { JIT_ASSERT(graph_ == node->owningGraph()); + schema_ = nullptr; node->uses_.emplace_back(this, inputs_.size()); inputs_.push_back(node); return node; @@ -423,6 +432,7 @@ struct Node : public Attributes { // arguments. Returns the added node for ease of chaining. Value* insertInput(size_t i, Value* node) { JIT_ASSERT(graph_ == node->owningGraph()); + schema_ = nullptr; // First we update the offsets for all existing inputs that will reside // after the one we're inserting. Concretely, these are the inputs at // indices [i, # input). Since we're inserting one input before all of @@ -447,6 +457,7 @@ struct Node : public Attributes { // Result: %3 = f(%1, %4) Value * replaceInput(size_t i, Value * newValue) { JIT_ASSERT(newValue->owningGraph() == graph_); + schema_ = nullptr; Value * old = dropInput(i); inputs_[i] = newValue; newValue->uses_.emplace_back(this, i); @@ -462,6 +473,7 @@ struct Node : public Attributes { void replaceInputWith(Value * from, Value * to) { JIT_ASSERT(from->owningGraph() == graph_); JIT_ASSERT(to->owningGraph() == graph_); + schema_ = nullptr; size_t i = 0; for(auto input : inputs()) { if(input == from) @@ -472,10 +484,12 @@ struct Node : public Attributes { Value* addOutput() { outputs_.push_back(new Value(this, outputs_.size())); + schema_ = nullptr; return outputs_.back(); } Value* insertOutput(size_t i) { + schema_ = nullptr; outputs_.insert(outputs_.begin() + i, new Value(this, i)); for (size_t itr = i + 1; itr < outputs_.size(); ++itr) { outputs_[itr]->setOffset(outputs_[itr]->offset() + 1); @@ -585,6 +599,7 @@ struct Node : public Attributes { // Execute: %3.removeInput(1) // Result: %3 = f(%1) void removeInput(size_t i) { + schema_ = nullptr; dropInput(i); // everything after this input shifts left, // so we need to update their use offsets to match @@ -601,6 +616,7 @@ struct Node : public Attributes { // Execute: %3.removeAllInputs() // Result: %3 = f() void removeAllInputs() { + schema_ = nullptr; for(size_t i = 0; i < inputs().size(); ++i) dropInput(i); inputs_.clear(); @@ -643,6 +659,8 @@ struct Node : public Attributes { virtual ~Node() {} private: + std::pair findInput(Symbol name); + void findSchema(); // Lookup iterator in use list of _input i_ that corresponds to its use of _this_ use_list::iterator findUseForInput(size_t i) { auto & input_uses = inputs_[i]->uses_; @@ -996,6 +1014,9 @@ friend struct Block; return r; } + template + Value * insertConstant(T value); + Node * appendNode(Node * n) { return block_->appendNode(n); } @@ -1115,6 +1136,15 @@ inline 
Value::Value(Node * node_, size_t offset_) node_->graph_->all_values.emplace(this); } +inline Value* Value::setType(const TypePtr type) { + JIT_ASSERT(type); + type_ = type; + for (Use & use : uses_) { + use.user->schema_ = nullptr; + } + return this; +} + inline Graph * Value::owningGraph() { return node()->owningGraph(); } @@ -1142,13 +1172,15 @@ inline Node::Node(Graph * graph_, NodeKind kind_) : graph_(graph_), owning_block_(nullptr), stage_(graph_->new_node_stage_), - scope_(graph_->current_scope_) { + scope_(graph_->current_scope_), + schema_(nullptr) { graph_->all_nodes.emplace(this); } inline void Node::eraseOutput(size_t i) { JIT_ASSERT(i < outputs_.size()); JIT_ASSERT(outputs_[i]->uses().size() == 0); + schema_ = nullptr; Value * n = outputs_[i]; outputs_.erase(outputs_.begin() + i); owningGraph()->freeValue(n); @@ -1158,12 +1190,14 @@ inline void Node::eraseOutput(size_t i) { } inline Block * Node::addBlock() { + schema_ = nullptr; blocks_.push_back(new Block(owningGraph(), this)); return blocks_.back(); } inline void Node::eraseBlock(size_t i) { JIT_ASSERT(i < blocks_.size()); + schema_ = nullptr; Block * n = blocks_[i]; blocks_.erase(blocks_.begin() + i); n->destroy(); diff --git a/torch/csrc/jit/passes/batch_mm.cpp b/torch/csrc/jit/passes/batch_mm.cpp index 15926fdab850a..0d182bd8fd37c 100644 --- a/torch/csrc/jit/passes/batch_mm.cpp +++ b/torch/csrc/jit/passes/batch_mm.cpp @@ -146,7 +146,9 @@ void BatchMMBlock(Block* block) { // Look for trees in the block std::unordered_map tokens; for (auto node : block->nodes()) { - if (node->kind() == aten::mm) { + if (node->kind() == aten::mm && + node->input(0)->type()->cast() && + node->input(1)->type()->cast()) { tokens[node] = TreeToken::fromMM(node); } else if (node->kind() == aten::add) { // NOTE: x + 2 is add[other={2}](%x) @@ -187,9 +189,9 @@ void BatchMMBlock(Block* block) { cat_sizes[cat_dim] *= matmuls.size(); // make them really cat_sizes auto inputs = fmap(matmuls, [=](Node *mm) { return mm->inputs()[inputs_off]; }); - Node *cat = graph->create(aten::cat, inputs) - ->i_(attr::dim, cat_dim); - cat->insertBefore(root.node); + WithInsertPoint iguard { root.node }; + inputs.push_back(graph->insertConstant(cat_dim)); + Node *cat = graph->insertNode(graph->create(aten::cat, inputs)); cat->output()->setType(type->withSizes(cat_sizes)); return cat->output(); }; diff --git a/torch/csrc/jit/passes/common_subexpression_elimination.cpp b/torch/csrc/jit/passes/common_subexpression_elimination.cpp index f6164024f2365..e05f986e2a8bd 100644 --- a/torch/csrc/jit/passes/common_subexpression_elimination.cpp +++ b/torch/csrc/jit/passes/common_subexpression_elimination.cpp @@ -87,21 +87,15 @@ struct EqualNodeCSE { if (lhs == nullptr && rhs == nullptr) return true; if (lhs == nullptr || rhs == nullptr) return false; - // Check whether two nodes are the same kind. if (lhs->kind() != rhs->kind()) return false; - - // Check the stage. if (lhs->stage() != rhs->stage()) return false; // Check whether the inputs are the same. auto lhs_inputs = lhs->inputs(); auto rhs_inputs = rhs->inputs(); - if (lhs_inputs.size() != rhs_inputs.size()) return false; - if (!std::equal(lhs_inputs.begin(), lhs_inputs.end(), rhs_inputs.begin())) return false; - // Check the attributes. 
if (!attributesEqualCSE(lhs, rhs)) return false; return true; @@ -117,7 +111,7 @@ void EliminateCommonSubexpression(Block * block) { for (auto it = block->nodes().begin(); it != block->nodes().end(); ++ it) { auto node = *it; if (node->kind() == prim::PythonOp - || node->kind() == prim::Eval + || node->kind() == prim::Print || node->blocks().size() > 0 ) { // Do NOT have enough information to do CSE on these nodes. diff --git a/torch/csrc/jit/passes/decompose_addmm.cpp b/torch/csrc/jit/passes/decompose_addmm.cpp index de9a3f666b7f7..85ad0770df77b 100644 --- a/torch/csrc/jit/passes/decompose_addmm.cpp +++ b/torch/csrc/jit/passes/decompose_addmm.cpp @@ -1,5 +1,6 @@ #include "torch/csrc/jit/passes/dead_code_elimination.h" #include "torch/csrc/jit/symbolic_variable.h" +#include "torch/csrc/jit/tensor_conversions.h" namespace torch { namespace jit { @@ -14,12 +15,10 @@ static void DecomposeAddmm(Block* block) { // shape analysis and differentiation passes for those two individual ops. // Later, we will fuse together those two ops into a single addmm. if (it->kind() == aten::addmm && it->inputs().size() == 3) { - auto alpha = at::Scalar(it->t(attr::alpha)); - auto beta = at::Scalar(it->t(attr::beta)); - - if (alpha.to() != 1.0 || beta.to() != 1.0) { - continue; - } + auto alpha = it->get(attr::alpha); + auto beta = it->get(attr::beta); + if (!alpha || !beta) continue; + if (tensor_as(*alpha) != 1.0 || tensor_as(*beta) != 1.0) continue; WithInsertPoint guard(*it); diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index 712c88d0548b8..6b98e7f1a4695 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -5,7 +5,7 @@ #ifdef USE_CUDA #include "cuda.h" // for CUDA_VERSION -#endif +#endif namespace torch { namespace jit { @@ -136,13 +136,13 @@ struct GraphFuser { #ifdef USE_CUDA // Checks for half tensor on GPU // const auto device = tt->device(); - if (tt->device() != kCPUDevice + if (tt->device() != kCPUDevice && CUDA_VERSION >= 9 && tt->scalarType() == at::ScalarType::Half) { return true; } - #endif - } + #endif + } return false; } @@ -204,8 +204,9 @@ struct GraphFuser { if(isFusable(node)) return true; // this concat fusion only works when all the inputs are the same size + // and we can statically infer the dimension along which we should concat // otherwise they cannot partipate in the same map - if(node->kind() == aten::cat && allOutputsHaveSameSize(node)) + if(node->kind() == aten::cat && node->get(attr::dim) && allOutputsHaveSameSize(node)) return true; return false; diff --git a/torch/csrc/jit/passes/peephole.cpp b/torch/csrc/jit/passes/peephole.cpp index 4d997bb8017a0..5fd438280a35e 100644 --- a/torch/csrc/jit/passes/peephole.cpp +++ b/torch/csrc/jit/passes/peephole.cpp @@ -1,5 +1,7 @@ #include "torch/csrc/jit/passes/peephole.h" + #include "torch/csrc/jit/symbolic_variable.h" +#include "torch/csrc/jit/tensor_conversions.h" #include "torch/csrc/jit/passes/dead_code_elimination.h" namespace torch { namespace jit { @@ -28,7 +30,7 @@ void PeepholeOptimize(Block * block) { if (!n->input()->isTensor()) break; // the sizes are dynamic if(n->inputs().size() != 1) break; - if (n->is(attr::size) == n->input()->type()->expect()->sizes()) { + if (n->get>(attr::size) == n->input()->type()->expect()->sizes()) { n->output()->replaceAllUsesWith(n->input()); // Let DCE clean up any unused nodes at this point } @@ -56,35 +58,24 @@ void PeepholeOptimize(Block * block) { } } } break; - // Fuse mm + add into addmm case aten::add: 
{ - // Must have two inputs - if (n->inputs().size() != 2) { - continue; - } - // Alpha parameter must be 1.0 - auto alpha = at::Scalar(it->t(attr::alpha)); - if (alpha.to() != 1.0) { - continue; - } + // mm + add == addmm + if (n->inputs().size() == 2 && + n->get(attr::alpha) && + tensor_as(*n->get(attr::alpha)) == 1. && + n->input(1)->node()->kind() == aten::mm) { + WithInsertPoint guard(n); - auto input_node = n->input(1)->node(); - // Input must be an mm node - if (input_node->kind() != aten::mm) { - continue; - } + auto input_node = n->input(1)->node(); + SymbolicVariable mat(n->input(0)); + SymbolicVariable mat1(input_node->input(0)); + SymbolicVariable mat2(input_node->input(1)); + SymbolicVariable addmm_value = mat.addmm(mat1, mat2); - WithInsertPoint guard(n); - - SymbolicVariable mat(n->input(0)); - SymbolicVariable mat1(input_node->input(0)); - SymbolicVariable mat2(input_node->input(1)); - SymbolicVariable addmm_value = mat.addmm(mat1, mat2); - - // Copy shape information from output node - ((Value*)addmm_value)->copyMetadata(n->output()); - n->output()->replaceAllUsesWith(addmm_value); - // Let DCE clean up any unused nodes at this point + // Copy shape information from output node + ((Value*)addmm_value)->copyMetadata(n->output()); + n->output()->replaceAllUsesWith(addmm_value); + } } break; } } diff --git a/torch/csrc/jit/passes/remove_expands.cpp b/torch/csrc/jit/passes/remove_expands.cpp index 25286e9ed8f9a..8c679db8c2e52 100644 --- a/torch/csrc/jit/passes/remove_expands.cpp +++ b/torch/csrc/jit/passes/remove_expands.cpp @@ -7,7 +7,7 @@ static void RemoveExpands(Block* block) { ++it) { for (auto sub : it->blocks()) RemoveExpands(sub); - if (it->kind() == aten::expand && it->hasAttribute(attr::implicit) && it->i(attr::implicit)) { + if (it->kind() == aten::expand && it->get(attr::implicit) != static_cast(0)) { it->output()->replaceAllUsesWith(it->input()); it.destroyCurrent(); } diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index f8239c5a6457c..5fe1358a0010b 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -150,6 +150,7 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { } std::vector types; bool present; + // TODO (apaszke): This currently assumes that inputs are pretty much tensor-only. Need to fix that. std::tie(types, present) = gatherTypes(node->inputs()); if(!present) { return setDynamicType(node); @@ -171,10 +172,12 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { return true; }; + // TODO (apaszke): Those currently assume that a lot of stuff is constant, which might + // not be the case if they are produced by script. switch(node->kind()) { - //TODO: for expensive ops we can directly encode their shape propagation + // For expensive ops we can directly encode their shape propagation // here, otherwise we fallback to running a fake version of the op - // to get a quick and dirty propagation + // to get a quick and dirty propagation. 
case aten::add: case aten::sub: case aten::mul: @@ -236,12 +239,16 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { {AKind::i, attr::keepdim}})) { auto tp = types.at(0); auto sizes = tp->sizes(); - int64_t dim = node->is(attr::dim).at(0); - SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); - if (node->i(attr::keepdim)) { - sizes.at(dim) = 1; - } else { - sizes.erase(sizes.begin() + dim); + auto dims = node->get>(attr::dim).value(); + bool keepdim = node->get(attr::keepdim).value(); + std::reverse(dims.begin(), dims.end()); + for (int64_t dim : dims) { + SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); + if (keepdim) { + sizes.at(dim) = 1; + } else { + sizes.erase(sizes.begin() + dim); + } } node->output()->setType(tp->withSizes(sizes)); } else if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, {})) { @@ -254,7 +261,7 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { auto tp = types.at(0); auto sizes = tp->sizes(); auto strides = tp->strides(); - int64_t dim = node->i(attr::dim); + int64_t dim = node->get(attr::dim).value(); SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); if (sizes.at(dim) == 1) { sizes.erase(sizes.begin() + dim); @@ -269,7 +276,7 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { auto tp = types.at(0); auto sizes = tp->sizes(); auto strides = tp->strides(); - int64_t dim = node->i(attr::dim); + int64_t dim = node->get(attr::dim).value(); SHAPE_ASSERT(dim >= 0 && static_cast(dim) <= sizes.size()); sizes.insert(sizes.begin() + dim, 1); strides.insert(strides.begin() + dim, 1); @@ -279,7 +286,7 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { case aten::view: { if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, {{AKind::is, attr::size}})) { - auto sizes = node->is(attr::size); + auto sizes = node->get>(attr::size).value(); bool inferred = false; size_t inferred_idx; int64_t size_product = 1; @@ -314,7 +321,7 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { {{AKind::i, attr::dim}})) { auto ten = types.at(0); auto index = types.at(1); - int64_t dim = node->i(attr::dim); + int64_t dim = node->get(attr::dim).value(); SHAPE_ASSERT(index->sizes().size() == 1); SHAPE_ASSERT(dim >= 0 && static_cast(dim) < ten->sizes().size()); std::vector sizes = ten->sizes(); diff --git a/torch/csrc/jit/symbolic_variable.h b/torch/csrc/jit/symbolic_variable.h index dfbeb3361b3d6..12417390478a4 100644 --- a/torch/csrc/jit/symbolic_variable.h +++ b/torch/csrc/jit/symbolic_variable.h @@ -52,6 +52,11 @@ struct SymbolicVariable { return (int64_t) i == s.toLong(); } } + // TODO (apaszke): Use this instead of attribute setters + template + SymbolicVariable insertConstant(T value) const { + return v->owningGraph()->insertConstant(std::move(value)); + } SymbolicVariable operator*(const SymbolicVariable rhs) const { return create(aten::mul, {*this, rhs})[0].typeLike(*this); } @@ -165,6 +170,13 @@ struct SymbolicVariable { ->i_(a("length"), length); return r; } + static SymbolicVariable cat(ArrayRef inputs, Value* dim) { + Node* n; + std::vector all_inputs = inputs; + all_inputs.push_back(dim); + auto r = create(aten::cat, all_inputs, 1, &n)[0]; + return r; + } static SymbolicVariable cat(ArrayRef inputs, int32_t dim) { Node* n; auto r = create(aten::cat, inputs, 1, &n)[0]; @@ -188,6 +200,11 @@ struct SymbolicVariable { ->i_(a("keepdim"), keepdim); return r; } + SymbolicVariable squeeze(Value* dim) const { + Node * n; + auto r = create(t("squeeze"), {*this, dim}, 1, &n)[0]; + return r; + } 
SymbolicVariable squeeze(int dim) const { Node * n; auto r = create(t("squeeze"), {*this}, 1, &n)[0]; diff --git a/torch/csrc/jit/tensor_conversions.h b/torch/csrc/jit/tensor_conversions.h index 84162a445a910..36bfda79056a4 100644 --- a/torch/csrc/jit/tensor_conversions.h +++ b/torch/csrc/jit/tensor_conversions.h @@ -15,7 +15,7 @@ struct tensor_conversion_error : public std::runtime_error { }; template -inline T tensor_as(at::Tensor&& t); +inline T tensor_as(at::Tensor t); namespace detail { @@ -79,7 +79,7 @@ struct tensor_as_impl { } template -inline T tensor_as(at::Tensor&& t) { +inline T tensor_as(at::Tensor t) { return detail::tensor_as_impl()(std::move(t)); } @@ -109,6 +109,10 @@ inline at::Tensor as_tensor(const at::Scalar& s) { return s.toTensor(); } +inline at::Tensor as_tensor(at::Tensor t) { + return t; +} + template inline at::Tensor as_tensor(std::array&& bools) { auto r = at::empty({N}, at::kByte); diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index a86059f300495..e93b3b1aeadec 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -62,10 +62,9 @@ autograd::Variable getSizeOf(const autograd::Variable& var, int64_t dim) { auto size_var = autograd::make_variable(at::Scalar(var.size(dim)).toTensor()); auto* value = getValueTrace(var); - auto* node = graph->create(aten::size, {value}) - ->i_(attr::dim, dim); + WithInsertPoint ipoint { graph->block() }; + auto* node = graph->insertNode(graph->create(aten::size, {value, graph->insertConstant(dim)})); node->output()->inferTypeFrom(size_var); - graph->appendNode(node); setValueTrace(size_var, node->output()); return size_var; diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index a88739c1cc490..0b72453cdc1e2 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -259,7 +259,9 @@ def embedding_bag(g, def size(g, self, dim): if _is_value(dim): - raise RuntimeError("ONNX export only supports constant dim values in .size()") + if dim.node().kind() != 'onnx::Constant': + raise RuntimeError("ONNX export only supports constant dim values in .size()") + dim = int(dim.node().t('value')) full_shape = g.op("Shape", self) return select(g, full_shape, dim=0, index=dim) From a3a6ab60cdb17845a4f46d56451638277559cc6a Mon Sep 17 00:00:00 2001 From: Zhaoheng Ni Date: Fri, 20 Jul 2018 10:59:22 -0700 Subject: [PATCH 06/23] Fix the error in UnpackSegmentsOp when calculating the gradient with "max_length" argument (#9598) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9598 The "max_length" should be passed to UnPackSegmentsOp if "max_length" is given when calling PackSegmentsOp. 
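
A minimal sketch of the intended usage, mirroring the updated pack_ops_test below (blob names come from that test; the shapes and values here are only illustrative): the same `max_length` must be forwarded to UnpackSegments, since it now enforces that the argument matches the packed dimension.

```
from caffe2.python import core, workspace
import numpy as np

# Illustrative data: two segments of lengths 2 and 4, packed to max_length 5.
workspace.FeedBlob('d', np.random.rand(6, 2).astype(np.float32))
workspace.FeedBlob('l', np.array([2, 4], dtype=np.int32))

max_length = 5
workspace.RunOperatorOnce(core.CreateOperator(
    'PackSegments', ['l', 'd'], ['t'], max_length=max_length))
# max_length has to be passed here as well, matching t.shape[1].
workspace.RunOperatorOnce(core.CreateOperator(
    'UnpackSegments', ['l', 't'], ['newd'], max_length=max_length))

assert workspace.FetchBlob('t').shape[1] == max_length
assert (workspace.FetchBlob('newd') == workspace.FetchBlob('d')).all()
```
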
Reviewed By: jerryzh168 Differential Revision: D8919799 fbshipit-source-id: 8c97aa717b69177b8a5d5d56892817d488853840 --- caffe2/operators/pack_segments.cc | 11 +++++++++-- caffe2/operators/pack_segments.cu | 7 ++++++- caffe2/operators/pack_segments.h | 7 +++++-- caffe2/python/operator_test/pack_ops_test.py | 1 + 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/caffe2/operators/pack_segments.cc b/caffe2/operators/pack_segments.cc index 7fcbf917e28fa..2c2f3fdc4fafc 100644 --- a/caffe2/operators/pack_segments.cc +++ b/caffe2/operators/pack_segments.cc @@ -119,7 +119,12 @@ bool UnpackSegmentsOp::DoRunWithType2() { CAFFE_ENFORCE_GE(data.ndim(), 2, "DATA should be at least 2-D"); CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTH should be 1-D"); - + if (max_length_ != -1) { + CAFFE_ENFORCE_EQ( + max_length_, + data.dim(1), + "max_length should be equal to the second dimension of the packed segments"); + } const T* l = lengths.template data(); TIndex total_l = std::accumulate(l, l + lengths.dim(0), (TIndex)0); @@ -175,6 +180,7 @@ OPERATOR_SCHEMA(PackSegments) "presence_mask", "2 dim boolean tensor" ", false where packed_tensor is padded, true otherwise.") + .Arg("max_length", "The pre-defined max_length for the packed segments") .Arg( "pad_minf", "Padding number in the packed segments. Use true to pad \ @@ -191,7 +197,8 @@ OPERATOR_SCHEMA(UnpackSegments) "lengths", "1-d int/long tensor contains the length in each of the input.") .Input(1, "tensor", "N+1 dim Tensor.") - .Output(0, "packed_tensor", "N dim Tensor"); + .Output(0, "packed_tensor", "N dim Tensor") + .Arg("max_length", "The pre-defined max_length for the packed segments"); class GetPackSegmentsGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; diff --git a/caffe2/operators/pack_segments.cu b/caffe2/operators/pack_segments.cu index 374266cc66f52..8e4bdc49339c1 100644 --- a/caffe2/operators/pack_segments.cu +++ b/caffe2/operators/pack_segments.cu @@ -256,7 +256,12 @@ bool UnpackSegmentsOp::DoRunWithType2() { CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTH should be 1-D"); - + if (max_length_ != -1) { + CAFFE_ENFORCE_EQ( + max_length_, + data.dim(1), + "max_length should be equal to the packed segments"); + } // Compute prefix sum over the lengths array_prefix_sum_exclusive( lengths_ptr, num_seq, dev_buffer_, dev_lengths_prefix_sum_, context_); diff --git a/caffe2/operators/pack_segments.h b/caffe2/operators/pack_segments.h index 091d7852a013a..c35299d66d1a2 100644 --- a/caffe2/operators/pack_segments.h +++ b/caffe2/operators/pack_segments.h @@ -16,7 +16,6 @@ template class PackSegmentsOp final : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; - // USE_SIMPLE_CTOR_DTOR(PackSegmentsOp) USE_DISPATCH_HELPER; PackSegmentsOp(const OperatorDef& operator_def, Workspace* ws) @@ -62,9 +61,12 @@ template class UnpackSegmentsOp final : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; - USE_SIMPLE_CTOR_DTOR(UnpackSegmentsOp) USE_DISPATCH_HELPER; + UnpackSegmentsOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + max_length_(OperatorBase::GetSingleArgument("max_length", -1)) {} + bool RunOnDevice() override { return DispatchHelper>::call(this, Input(LENGTHS)); } @@ -78,6 +80,7 @@ class UnpackSegmentsOp final : public Operator { INPUT_TAGS(LENGTHS, DATA); private: + TIndex max_length_; Tensor dev_buffer_; Tensor dev_lengths_prefix_sum_; Tensor dev_max_length_; diff --git 
a/caffe2/python/operator_test/pack_ops_test.py b/caffe2/python/operator_test/pack_ops_test.py index 97b2eaf99dd71..3935ca8c8f17a 100644 --- a/caffe2/python/operator_test/pack_ops_test.py +++ b/caffe2/python/operator_test/pack_ops_test.py @@ -101,6 +101,7 @@ def test_pack_with_max_length_ops(self, num_seq, cell_size, gc, dc): 'UnpackSegments', ['l', 't'], ['newd'], + max_length=max_length, device_option=gc)) assert(workspace.FetchBlob('t').shape[1] == max_length) assert((workspace.FetchBlob('newd') == workspace.FetchBlob('d')).all()) From d3688861ec8607af0e7bf4119911053f13aee500 Mon Sep 17 00:00:00 2001 From: vmirly Date: Fri, 20 Jul 2018 11:13:07 -0700 Subject: [PATCH 07/23] Fixed a missing '=' in LPPoolNd repr function (#9629) Summary: In the repr funciton of LPPoolNd(..) class, there was a missing '='. (`kernel_size{kernel_size}`) Link to line in the code: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/pooling.py#L694 Original: return 'norm_type={norm_type}, kernel_size{kernel_size}, stride={stride}, ' \ 'ceil_mode={ceil_mode}'.format(**self.__dict__) Fixed: return 'norm_type={norm_type}, kernel_size={kernel_size}, stride={stride}, ' \ 'ceil_mode={ceil_mode}'.format(**self.__dict__) Pull Request resolved: https://github.com/pytorch/pytorch/pull/9629 Differential Revision: D8932913 Pulled By: soumith fbshipit-source-id: 9030dff6b14659b5c7b6992d87ef53ec8891f674 --- torch/nn/modules/pooling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py index 1f00b1cb10977..75ab843e5413b 100644 --- a/torch/nn/modules/pooling.py +++ b/torch/nn/modules/pooling.py @@ -691,7 +691,7 @@ def __init__(self, norm_type, kernel_size, stride=None, ceil_mode=False): self.ceil_mode = ceil_mode def extra_repr(self): - return 'norm_type={norm_type}, kernel_size{kernel_size}, stride={stride}, ' \ + return 'norm_type={norm_type}, kernel_size={kernel_size}, stride={stride}, ' \ 'ceil_mode={ceil_mode}'.format(**self.__dict__) From bae156a481351a7f2d0f5f2dbced5dfab2c58f19 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Fri, 20 Jul 2018 11:27:59 -0700 Subject: [PATCH 08/23] Support (some) CUDA Lapack on n-dimensional empty tensors. 
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9631 Reviewed By: ezyang Differential Revision: D8933202 Pulled By: gchanan fbshipit-source-id: 1ade4ca439bf26aa921df1da83a827d860f8f48f --- aten/src/THC/generic/THCTensorMathMagma.cu | 92 ++++++++++++---------- test/test_torch.py | 4 +- 2 files changed, 50 insertions(+), 46 deletions(-) diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index 0de79233c122b..c942a17c4c0df 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -26,7 +26,7 @@ static void THCTensor_(copyArray2d)(THCState *state, THCTensor *self, real *src, static void THCTensor_(copyTensor2d)(THCState *state, real *dst, THCTensor *self) { - THAssert(self->_dim() == 2); + THAssert(self->dim() == 2); size_t len = THCTensor_(nElement)(state, self)*sizeof(real); THCTensor *temp = THCTensor_(newTranspose)(state, self, 0, 1); THCTensor *selfc = THCTensor_(newContiguous)(state, temp); @@ -39,7 +39,7 @@ static void THCTensor_(copyTensor2d)(THCState *state, real *dst, THCTensor *self static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, THCTensor *src) { - THAssert(src->_dim() == 2); + THAssert(src->dim() == 2); if (self == src && self->stride(0) == 1 && self->stride(1) == self->size(0)) { THCTensor_(retain)(state, self); @@ -194,46 +194,47 @@ THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, T THCTensor *input = THCTensor_(newColumnMajor)(state, rv_, a); real *input_data = THCTensor_(data)(state, input); - // eigen values and workspace - real *w = th_magma_malloc_pinned(n); - real *wA = th_magma_malloc_pinned(lda * n); + if (n > 0) { + // eigen values and workspace + real *w = th_magma_malloc_pinned(n); + real *wA = th_magma_malloc_pinned(lda * n); - // compute optimal size of work array - int info; - real lwork; - int liwork; + // compute optimal size of work array + int info; + real lwork; + int liwork; #if defined(THC_REAL_IS_FLOAT) - magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info); + magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info); #else - magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info); + magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info); #endif - real *work = th_magma_malloc_pinned((size_t)lwork); - int *iwork = th_magma_malloc_pinned(liwork); + real *work = th_magma_malloc_pinned((size_t)lwork); + int *iwork = th_magma_malloc_pinned(liwork); // compute eigenvalues and, optionally, eigenvectors #if defined(THC_REAL_IS_FLOAT) - magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info); + magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info); #else - magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info); + magma_dsyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info); #endif - // copy eigen values from w to re_ - if (info == 0) - THCTensor_(copyArray1d)(state, re_, w, n); - - magma_free_pinned(iwork); - magma_free_pinned(work); - magma_free_pinned(wA); - magma_free_pinned(w); + // copy eigen values from w to re_ + if (info == 0) + THCTensor_(copyArray1d)(state, re_, w, n); - // check error value - if (info > 0) - THError("MAGMA syev : Failed to converge. 
%d off-diagonal elements of an didn't converge to zero", info); - else if (info < 0) - THError("MAGMA syev : Argument %d : illegal value", -info); + magma_free_pinned(iwork); + magma_free_pinned(work); + magma_free_pinned(wA); + magma_free_pinned(w); + // check error value + if (info > 0) + THError("MAGMA syev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info); + else if (info < 0) + THError("MAGMA syev : Argument %d : illegal value", -info); + } THCTensor_(freeCopyTo)(state, input, rv_); #else THError(NoMagma(syev)); @@ -243,7 +244,7 @@ THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, T THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvrs) { #ifdef USE_MAGMA - THArgCheck(!a_->is_empty() && a_->dim() == 2, 3, "A should be (non-empty) 2 dimensional"); + THArgCheck(a_->dim() == 2, 3, "A should be 2 dimensional"); THArgCheck(a_->size(0) == a_->size(1), 3, "A should be square"); magma_vec_t jobvr = jobvrs[0] == 'N' ? MagmaNoVec : MagmaVec; @@ -263,34 +264,39 @@ THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, T ldvr = n; } - real wkopt; - int info; + real *work_data = nullptr; + if (n > 0) { + int info; + real wkopt; #if defined(THC_REAL_IS_FLOAT) - magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info); + magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info); #else - magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info); + magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info); #endif - int lwork = (int) wkopt; - real *work_data = th_magma_malloc_pinned(lwork); + int lwork = (int) wkopt; + work_data = th_magma_malloc_pinned(lwork); #if defined(THC_REAL_IS_FLOAT) - magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info); + magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info); #else - magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info); + magma_dgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info); #endif - if (info > 0) - THError("MAGMA geev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info); - else if (info < 0) - THError("MAGMA geev : Argument %d : illegal value", -info); + if (info > 0) + THError("MAGMA geev : Failed to converge. 
%d off-diagonal elements of an didn't converge to zero", info); + else if (info < 0) + THError("MAGMA geev : Argument %d : illegal value", -info); + } { THCTensor_(resize2d)(state, re_, 2, n); THCTensor *re = THCTensor_(newContiguous)(state, re_); - THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset, wr, n*sizeof(real), cudaMemcpyHostToDevice)); - THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset + n, wi, n*sizeof(real), cudaMemcpyHostToDevice)); + if (n > 0) { + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset, wr, n*sizeof(real), cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset + n, wi, n*sizeof(real), cudaMemcpyHostToDevice)); + } THCTensor_(freeCopyTo)(state, re, re_); THCTensor_(transpose)(state, re_, NULL, 0, 1); } diff --git a/test/test_torch.py b/test/test_torch.py index bf8f910252959..d1370e30cb826 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6320,9 +6320,7 @@ def test_lapack_empty(self): # numpy/sci often has a direct wrapper (e.g. lu_factor) and a wrapper that "does the right thing" # (e.g. lu). We often name our functions identically to the lapack function, so it will take work # to name / migrate-to better wrappers. - - # FIXME: enable CUDA tests. - devices = ['cpu'] # if not torch.cuda.is_available() else ['cpu', 'cuda'] + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: def fn(torchfn, *args): From 8a0fe0a5889bd8c7404102dfd007bc4c2ffc3512 Mon Sep 17 00:00:00 2001 From: Kittipat Virochsiri Date: Fri, 20 Jul 2018 11:45:41 -0700 Subject: [PATCH 09/23] set_input_record() should always add external input (#9636) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9636 Make sure that the blobs are registered to the net Reviewed By: pjh5 Differential Revision: D8924883 fbshipit-source-id: f09422a2d4d5ba8bf6cfbfd00172097b5ab1fcd6 --- caffe2/python/core.py | 7 ++++--- caffe2/python/core_test.py | 20 +++++++++++++++++++- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/caffe2/python/core.py b/caffe2/python/core.py index 9fef5724ad2f6..1423cfea3a6c0 100644 --- a/caffe2/python/core.py +++ b/caffe2/python/core.py @@ -1947,9 +1947,10 @@ def set_input_record(self, input_record): self._input_record = schema.NewRecord(self, input_record) else: self._input_record = input_record - for blob in input_record.field_blobs(): - if blob not in self.external_inputs: - self.AddExternalInput(blob) + + for blob in self._input_record.field_blobs(): + if blob not in self.external_inputs: + self.AddExternalInput(blob) return self._input_record def recover_input_record_by_prefix(self, prefix): diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index 2a4afc82c8067..b7099d20eae86 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -9,7 +9,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace, test_util +from caffe2.python import core, workspace, schema, test_util from caffe2.python.task import Node, Task @@ -199,6 +199,24 @@ def net_assert(net, num_ops, inputs, outputs, internals): n._CheckLookupTables() +class TestExternalInputs(test_util.TestCase): + def testSetInputRecordWithBlobs(self): + net = core.Net("test") + record = schema.NewRecord(net, schema.Struct( + ("x", schema.Scalar(np.float)), + )) + input_record = net.set_input_record(record) + 
self.assertTrue(net.BlobIsDefined(input_record.x())) + self.assertIn(input_record.x(), net.external_inputs) + + def testSetInputRecordWithoutBlobs(self): + net = core.Net("test") + record = schema.Struct(("x", schema.Scalar(np.float))) + input_record = net.set_input_record(record) + self.assertTrue(net.BlobIsDefined(input_record.x())) + self.assertIn(input_record.x(), net.external_inputs) + + class TestCreateOperator(test_util.TestCase): def testCreate(self): device_option = caffe2_pb2.DeviceOption() From 1003ccfa15e944251a65ba2289f25e8f1ed14a46 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Fri, 20 Jul 2018 12:42:48 -0700 Subject: [PATCH 10/23] Creates CUDAContext (#9435) Summary: ezyang noticed that the CUDAStream files lived under ATen/ despite being CUDA-specific, and suggested porting them to ATen/cuda and exposing them with a new CUDAContext. This PR does that. It also: - Moves ATen's CUDA-specific exceptions for ATen/cudnn to ATen/cuda for consistency - Moves getDeviceProperties() and getCurrentCUDASparseHandle() to CUDAContext from CUDAHooks The separation between CUDAContext and CUDAHooks is straightforward. Files that are in CUDA-only builds should rely on CUDAContext, while CUDAHooks is for runtime dispatch in files that can be included in CPU-only builds. A comment in CUDAContext.h explains this pattern. Acquiring device properties and CUDA-specific handles is something only done in builds with CUDA, for example, so I moved them from CUDAHooks to CUDAContext. This PR will conflict with #9277 and I will merge with master after #9277 goes in. Pull Request resolved: https://github.com/pytorch/pytorch/pull/9435 Reviewed By: soumith Differential Revision: D8917236 Pulled By: ezyang fbshipit-source-id: 219718864234fdd21a2baff1dd3932ff289b5751 --- aten/src/ATen/CUDAGuard.h | 110 ---------- aten/src/ATen/CUDAStream.cpp | 197 ----------------- aten/src/ATen/CUDAStream.h | 109 ---------- aten/src/ATen/Context.h | 52 ----- aten/src/ATen/cuda/CUDAApplyUtils.cuh | 33 +-- aten/src/ATen/cuda/CUDAContext.cpp | 76 +++++++ aten/src/ATen/cuda/CUDAContext.h | 70 +++++++ aten/src/ATen/cuda/CUDAGuard.h | 108 ++++++++++ aten/src/ATen/cuda/CUDAStream.cpp | 198 ++++++++++++++++++ aten/src/ATen/cuda/CUDAStream.h | 108 ++++++++++ aten/src/ATen/cuda/Exceptions.h | 20 ++ aten/src/ATen/cuda/detail/CUDAHooks.cpp | 42 +--- aten/src/ATen/cuda/detail/CUDAHooks.h | 5 - aten/src/ATen/cudnn/Descriptors.h | 5 +- aten/src/ATen/cudnn/Exceptions.h | 17 -- aten/src/ATen/cudnn/Handles.cpp | 2 +- aten/src/ATen/cudnn/Utils.h | 1 + aten/src/ATen/detail/CUDAHooksInterface.cpp | 26 --- aten/src/ATen/detail/CUDAHooksInterface.h | 40 +--- aten/src/ATen/native/cuda/CuFFTPlanCache.h | 3 +- aten/src/ATen/native/cuda/Embedding.cu | 6 +- aten/src/ATen/native/cuda/EmbeddingBag.cu | 7 +- aten/src/ATen/native/cuda/Gesv.cu | 3 +- aten/src/ATen/native/cuda/RoiPooling.cu | 5 +- aten/src/ATen/native/cuda/SoftMax.cu | 7 +- aten/src/ATen/native/cuda/SpectralOps.cu | 5 +- aten/src/ATen/native/cuda/SummaryOps.cu | 6 +- .../ATen/native/cuda/TensorTransformations.cu | 5 +- aten/src/ATen/native/cuda/Unique.cu | 4 +- .../ATen/native/cudnn/AffineGridGenerator.cpp | 1 + aten/src/ATen/native/cudnn/BatchNorm.cpp | 1 + aten/src/ATen/native/cudnn/Conv.cpp | 1 + aten/src/ATen/native/cudnn/GridSampler.cpp | 1 + aten/src/ATen/native/cudnn/RNN.cpp | 1 + .../ATen/native/sparse/cuda/SparseCUDABlas.cu | 5 +- .../native/sparse/cuda/SparseCUDATensor.cu | 3 +- .../sparse/cuda/SparseCUDATensorMath.cu | 7 +- aten/src/ATen/test/stream_test.cpp | 109 +++++----- 
aten/src/THC/THCCachingAllocator.cpp | 5 +- aten/src/THC/THCGeneral.cpp | 18 +- aten/src/THC/THCStream.cpp | 16 +- test/cpp_extensions/cudnn_extension.cpp | 2 +- torch/csrc/autograd/functions/comm.cpp | 2 +- torch/csrc/autograd/functions/comm.h | 4 +- torch/csrc/autograd/profiler.h | 3 +- torch/csrc/cuda/Module.cpp | 3 +- torch/csrc/cuda/comm.cpp | 8 +- torch/csrc/cuda/comm.h | 3 +- torch/csrc/cuda/python_comm.cpp | 6 +- torch/csrc/jit/fusion_compiler.cpp | 3 +- 50 files changed, 742 insertions(+), 730 deletions(-) create mode 100644 aten/src/ATen/cuda/CUDAContext.cpp create mode 100644 aten/src/ATen/cuda/CUDAContext.h create mode 100644 aten/src/ATen/cuda/CUDAGuard.h create mode 100644 aten/src/ATen/cuda/CUDAStream.cpp create mode 100644 aten/src/ATen/cuda/CUDAStream.h create mode 100644 aten/src/ATen/cuda/Exceptions.h diff --git a/aten/src/ATen/CUDAGuard.h b/aten/src/ATen/CUDAGuard.h index 8027084caa7c3..e69de29bb2d1d 100644 --- a/aten/src/ATen/CUDAGuard.h +++ b/aten/src/ATen/CUDAGuard.h @@ -1,110 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include -#include - -namespace at { - -/// A variant of `DeviceGuard` that augments it with an understanding of CUDA -/// streams. This guard can not only set and reset the current CUDA device, but -/// also set and reset the current CUDA stream. It is important to note that -/// because a CUDA stream is intrinsically associated with the CUDA device to -/// which it is bound, setting the CUDA stream *also* sets the current CUDA -/// device to that of the stream. -struct CUDAGuard { - /// Default constructor, does nothing and causes no change in the current - /// stream or device until `set_stream` or `set_device` is called. - CUDAGuard() = default; - - /// Sets the CUDA stream and its associated device as the current one (calls - /// `set_stream`). - explicit CUDAGuard(const CUDAStream& stream) { - set_stream(stream); - } - - /// Calls `set_device` with the given index. - explicit CUDAGuard(int32_t device) { - set_device(device); - } - - CUDAGuard(const CUDAGuard&) = delete; - CUDAGuard& operator=(const CUDAGuard&) = delete; - - /// Move-constructs this `CUDAGuard` from another `CUDAGuard`. The - /// moved-from `CUDAGuard` is modified such that its destruction has no - /// effect (does not reset the stream or device). - CUDAGuard(CUDAGuard&& other) noexcept = default; - - /// Move-assigns this `CUDAGuard` from another `CUDAGuard`. The - /// moved-from `CUDAGuard` is modified such that its destruction has no - /// effect (does not reset the stream or device). - CUDAGuard& operator=(CUDAGuard&& other) { - device_guard_ = std::move(other.device_guard_); - original_streams_ = std::move(other.original_streams_); - other.original_streams_.clear(); - return *this; - } - - /// Resets the CUDA stream on each device to the one that was active upon - /// construction. - ~CUDAGuard() { - if (!original_streams_.empty()) { - for (size_t device = 0; device < original_streams_.size(); ++device) { - globalContext().uncheckedSetCurrentCUDAStreamOnDevice( - device, original_streams_[device]); - } - } - } - - /// Sets the current CUDA device to the device associated with the given - /// stream, and then sets the current stream on that device to the one given. - void set_stream(const CUDAStream& stream) { - device_guard_.set_index(stream.device()); - // If we haven't stored the current stream yet, store it now. 
- if (original_streams_.empty()) { - const size_t device_count = globalContext().getNumGPUs(); - original_streams_.reserve(device_count); - for (size_t device = 0; device < device_count; ++device) { - original_streams_.push_back( - globalContext().getCurrentCUDAStreamOnDevice(device)); - } - } - globalContext().setCurrentCUDAStreamOnDevice( - device_guard_.last_index(), stream); - } - - /// Sets the CUDA device to the given one. - void set_device(int32_t device) { - device_guard_.set_index(device); - } - - /// Returns the CUDA streams that were active in the first call to - /// `set_stream`. If there was no such call, the returned container is - /// empty. - ArrayRef original_streams() const noexcept { - return original_streams_; - } - - /// Returns the device that was set upon construction of the guard. - int32_t original_device() const noexcept { - return device_guard_.original_index(); - } - - /// Returns the last device that was set via `set_device`, if any. - int32_t last_device() const noexcept { - return device_guard_.last_index(); - } - - private: - /// The guard for the current device. - DeviceGuard device_guard_; - /// The original streams that were active on all devices. - std::vector original_streams_; -}; - -} // namespace at diff --git a/aten/src/ATen/CUDAStream.cpp b/aten/src/ATen/CUDAStream.cpp index b8b8d588ffbfc..e69de29bb2d1d 100644 --- a/aten/src/ATen/CUDAStream.cpp +++ b/aten/src/ATen/CUDAStream.cpp @@ -1,197 +0,0 @@ - #include "ATen/CUDAStream.h" -#include "ATen/Error.h" -#include "ATen/detail/CUDAHooksInterface.h" - -#include - -// Internal implementation is entirely hidden -struct CUDAStreamInternals { - bool is_destructible; - std::atomic refcount; - int64_t device; // Note: cudaGetDevice works with int32_t, not int64_t - cudaStream_t stream; -}; - -namespace at { - -namespace detail { - - /* - * Stream state - */ - static constexpr cudaStream_t DEFAULT_STREAM = 0; - - static std::once_flag init_flag; - static int64_t num_gpus; - static CUDAStreamInternals* default_streams; - static thread_local CUDAStreamInternals** current_streams = nullptr; - - // Creates a(n indestructible) default stream for each device - // Note: the default stream on each device is signified by a zero - // value for the pointer, and so is not actually created as usual. - // In particular, we don't need to switch devices when creating the - // streams. 
- static void initDefaultCUDAStreams() { - num_gpus = getCUDAHooks().getNumGPUs(); - default_streams = (CUDAStreamInternals*) malloc(num_gpus * sizeof(CUDAStreamInternals)); - for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { - default_streams[i].is_destructible = false; - default_streams[i].refcount = 0; - default_streams[i].device = i; - default_streams[i].stream = DEFAULT_STREAM; - } - } - - // Init front-end to ensure initialization only occurs once - static void initCUDAStreamsOnce() { - // Inits default streams (once, globally) - std::call_once(init_flag, initDefaultCUDAStreams); - - // Inits current streams (thread local) to default streams - if (current_streams) return; - current_streams = (CUDAStreamInternals**) malloc(num_gpus * sizeof(CUDAStreamInternals*)); - for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { - current_streams[i] = &default_streams[i]; - } - } - - /* - * Pointer-based stream API - */ - - // Helper to return the current device - static inline int64_t current_device() { - int cur_device; - DynamicCUDAInterface::get_device(&cur_device); - return cur_device; - } - - // Helper to verify the GPU index is valid - static inline void check_gpu(int64_t device) { - AT_ASSERT(device >= 0 && device < num_gpus); - } - - CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device) { - initCUDAStreamsOnce(); - check_gpu(device); - return &default_streams[device]; - } - CUDAStreamInternals* CUDAStream_getDefaultStream() { - return CUDAStream_getDefaultStreamOnDevice(current_device()); - } - - // Creates (and retains) and new cuda stream - CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority) { - CUDAStreamInternals* internals = (CUDAStreamInternals*) malloc(sizeof(CUDAStreamInternals)); - internals->is_destructible = true; - internals->refcount = 1; - internals->device = current_device(); - DynamicCUDAInterface::cuda_stream_create_with_priority(&internals->stream, flags, priority); - return internals; - } - - // Note: despite not being "unsafe," is using these methods in a multithreaded - // environment then the caller must be sure that streams are valid - // when they're requested. These methods will throw an error if an - // invalid stream is requested. - CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device) { - initCUDAStreamsOnce(); - check_gpu(device); - auto cur = current_streams[device]; - AT_ASSERT(CUDAStream_retain(cur)); - return cur; - } - CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream() { - return CUDAStream_getAndRetainCurrentStreamOnDevice(current_device()); - } - - // Note: these unsafe methods do not retain the stream before returning it. - // This is unsafe behavior and these methods SHOULD NOT BE USED. - // They are here only for legacy compatibility. 
- CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device) { - initCUDAStreamsOnce(); - check_gpu(device); - return current_streams[device]; - } - CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe() { - return CUDAStream_getCurrentStreamOnDeviceUnsafe(current_device()); - } - - void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { - initCUDAStreamsOnce(); - check_gpu(device); - AT_ASSERT(ptr); - AT_ASSERT(ptr->device == device); - AT_ASSERT(CUDAStream_retain(ptr)); - - CUDAStream_free(current_streams[device]); - current_streams[device] = ptr; - } - - void CUDAStream_uncheckedSetStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { - initCUDAStreamsOnce(); - CUDAStream_uncheckedFree(current_streams[device]); - current_streams[device] = ptr; - } - - void CUDAStream_setStream(CUDAStreamInternals* ptr) { - CUDAStream_setStreamOnDevice(current_device(), ptr); - } - - // Getters - cudaStream_t CUDAStream_stream(CUDAStreamInternals* ptr) { - AT_ASSERT(ptr); - return ptr->stream; - } - - int64_t CUDAStream_device(CUDAStreamInternals* ptr) { - AT_ASSERT(ptr); - return ptr->device; - } - - // Memory management - // Note: only destructible (non-default) streams are ref counted - bool CUDAStream_retain(CUDAStreamInternals* ptr) { - AT_ASSERT(ptr); - if (ptr->is_destructible) return(++ptr->refcount > 1); - return true; - } - - void CUDAStream_free(CUDAStreamInternals*& ptr) { - if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) { - AT_ASSERT(ptr->refcount == 0); - DynamicCUDAInterface::cuda_stream_destroy(ptr->stream); - free(ptr); - ptr = nullptr; - } - } - void CUDAStream_uncheckedFree(CUDAStreamInternals*& ptr) { - if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) { - DynamicCUDAInterface::unchecked_cuda_stream_destroy(ptr->stream); - free(ptr); - ptr = nullptr; - } - } - -} // namespace detail - - /* - * CUDAStream functions - */ - - // Copy constructor - CUDAStream::CUDAStream(const CUDAStream& other) { - AT_ASSERT(other.internals_); - AT_ASSERT(detail::CUDAStream_retain(other.internals_)); - - internals_ = other.internals_; - } - - // Move constructor - CUDAStream::CUDAStream(CUDAStream&& other) { - AT_ASSERT(other.internals_); - - std::swap(internals_, other.internals_); - } - -} // namespace at diff --git a/aten/src/ATen/CUDAStream.h b/aten/src/ATen/CUDAStream.h index c5fc8111e13be..e69de29bb2d1d 100644 --- a/aten/src/ATen/CUDAStream.h +++ b/aten/src/ATen/CUDAStream.h @@ -1,109 +0,0 @@ -#pragma once - -#include -#include - -/* -* A CUDA stream interface with no CUDA build dependency. -* -* Includes the CUDAStream RAII class and a pointer-based stream API. -* -* The ATen Context interface should be preferred when working with streams. 
-*/ - -// Forward-declares cudaStream_t to avoid depending on CUDA in CPU builds -// Note: this is the internal CUDA runtime typedef for cudaStream_t -struct CUstream_st; -typedef struct CUstream_st* cudaStream_t; - -// Forward-declares internals -struct CUDAStreamInternals; - -namespace at { - -namespace detail { - -// Pointer-based API (for internal use) -// Note: ATen/Context is preferred to work with streams safely -CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device); -CUDAStreamInternals* CUDAStream_getDefaultStream(); - -CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority); - -CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device); -CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream(); - -// Note: these Unsafe gets should NEVER be used and are only here for legacy -// purposes. Once those uses are gone they should be removed. -CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device); -CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe(); - -void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* internals); -void CUDAStream_uncheckedSetStreamOnDevice( - int64_t device, - CUDAStreamInternals* internals); -void CUDAStream_setStream(CUDAStreamInternals* internals); - -cudaStream_t CUDAStream_stream(CUDAStreamInternals*); -int64_t CUDAStream_device(CUDAStreamInternals*); - -bool CUDAStream_retain(CUDAStreamInternals*); -void CUDAStream_free(CUDAStreamInternals*&); -void CUDAStream_uncheckedFree(CUDAStreamInternals*&); - -} // namespace detail - -// RAII for a CUDA stream -// Allows use as a cudaStream_t, copying, moving, and metadata access. -struct CUDAStream { - // Constants - static constexpr int32_t DEFAULT_FLAGS = 1; // = cudaStreamNonBlocking; - static constexpr int32_t DEFAULT_PRIORITY = 0; - - // Constructors - CUDAStream() = default; - /* implicit */ CUDAStream(CUDAStreamInternals* internals, bool retain = false) - : internals_{internals} { - if (retain) { - detail::CUDAStream_retain(internals_); - } - } - - // Destructor - ~CUDAStream() { detail::CUDAStream_uncheckedFree(internals_); } - - // Copy constructor - CUDAStream(const CUDAStream& other); - - // Move constructor - CUDAStream(CUDAStream&& other); - - // Assignment operator - CUDAStream& operator=(CUDAStream other) noexcept { - std::swap(internals_, other.internals_); - return *this; - } - - // Returns true if the CUDAStream is not null. 
- explicit operator bool() const noexcept { - return internals_ != nullptr; - } - - // Implicit conversion to cudaStream_t - operator cudaStream_t() const { return detail::CUDAStream_stream(internals_); } - - // Less than operator (to allow use in sets) - friend bool operator<(const CUDAStream& left, const CUDAStream& right) { - return left.internals_ < right.internals_; - } - - // Getters - int64_t device() const { return detail::CUDAStream_device(internals_); } - cudaStream_t stream() const { return detail::CUDAStream_stream(internals_); } - CUDAStreamInternals* internals() const { return internals_; } - -private: - CUDAStreamInternals* internals_ = nullptr; -}; - -} // namespace at diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 9a9125ccedfe0..969c0dcc6c037 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -79,58 +79,6 @@ class AT_API Context { return thc_state.get(); } - CUDAStream createCUDAStream() const { - return detail::CUDAStream_createAndRetainWithOptions( - CUDAStream::DEFAULT_FLAGS - , CUDAStream::DEFAULT_PRIORITY - ); - } - - CUDAStream createCUDAStreamWithOptions(int32_t flags, int32_t priority) const { - return detail::CUDAStream_createAndRetainWithOptions(flags, priority); - } - - CUDAStream getDefaultCUDAStream() const { - return detail::CUDAStream_getDefaultStream(); - } - - CUDAStream getDefaultCUDAStreamOnDevice(int64_t device) const { - return detail::CUDAStream_getDefaultStreamOnDevice(device); - } - - CUDAStream getCurrentCUDAStream() const { - return detail::CUDAStream_getAndRetainCurrentStream(); - } - - CUDAStream getCurrentCUDAStreamOnDevice(int64_t device) const { - return detail::CUDAStream_getAndRetainCurrentStreamOnDevice(device); - } - - void setCurrentCUDAStream(CUDAStream stream) const { - return detail::CUDAStream_setStream(stream.internals()); - } - - void setCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream) const { - return detail::CUDAStream_setStreamOnDevice(device, stream.internals()); - } - - void uncheckedSetCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream) - const { - return detail::CUDAStream_uncheckedSetStreamOnDevice( - device, stream.internals()); - } - -#ifndef __HIP_PLATFORM_HCC__ - cusparseHandle_t getCurrentCUDASparseHandle() const { - return detail::getCUDAHooks().getCurrentCUDASparseHandle(thc_state.get()); - } -#endif - cudaDeviceProp* getCurrentDeviceProperties() const { - return detail::getCUDAHooks().getCurrentDeviceProperties(thc_state.get()); - } - cudaDeviceProp* getDeviceProperties(int device) const { - return detail::getCUDAHooks().getDeviceProperties(thc_state.get(), device); - } int getNumGPUs() const { return detail::getCUDAHooks().getNumGPUs(); } diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh index e34cd14548451..a30bed575d3d3 100644 --- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -3,6 +3,7 @@ #include "detail/IndexUtils.cuh" #include "ATen/TensorUtils.h" #include "THC/THCAtomics.cuh" +#include "ATen/cuda/CUDAContext.h" // // This file contains pointwise operation functions and kernels that @@ -239,7 +240,7 @@ __host__ __device__ __forceinline__ T ATenCeilDiv(T a, T b) { inline bool getApplyGrid(uint64_t totalElements, dim3& grid, int64_t curDevice) { if (curDevice == -1) return false; uint64_t numBlocks = ATenCeilDiv(totalElements, static_cast(AT_APPLY_THREADS_PER_BLOCK)); - uint64_t maxGridX = at::globalContext().getDeviceProperties(curDevice)->maxGridSize[0]; + uint64_t 
maxGridX = at::cuda::getDeviceProperties(curDevice)->maxGridSize[0]; if (numBlocks > maxGridX) numBlocks = maxGridX; grid = dim3(numBlocks); @@ -322,7 +323,7 @@ bool CUDA_tensor_apply2(at::Tensor a, scalar1, \ scalar2, \ TYPE, A, B> \ - <<>>( \ + <<>>( \ aInfo, bInfo, (TYPE) totalElements, op); #define HANDLE_B_CASE(TYPE, A, B) { \ @@ -365,7 +366,7 @@ bool CUDA_tensor_apply2(at::Tensor a, bInfo.collapseDims(); #if CUDA_VERSION < 9000 if (!(aInfo.isContiguous() && bInfo.isContiguous())) - grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); + grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); #endif HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims); @@ -388,17 +389,17 @@ bool CUDA_tensor_apply2(at::Tensor a, scalar1, scalar2, uint64_t, 1, 1> - <<>>( + <<>>( aInfo, bInfo, (uint64_t) totalElements, op); } else { #if CUDA_VERSION < 9000 - grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); + grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); #endif kernelPointwiseApply2 - <<>>( + <<>>( aInfo, bInfo, (uint64_t) totalElements, op); } } @@ -502,7 +503,7 @@ bool CUDA_tensor_apply3(at::Tensor a, scalar2, \ scalar3, \ TYPE, A, B, C> \ - <<>>( \ + <<>>( \ aInfo, bInfo, cInfo, (TYPE) totalElements, op); #define HANDLE_C_CASE(TYPE, A, B, C) { \ @@ -566,7 +567,7 @@ bool CUDA_tensor_apply3(at::Tensor a, #if CUDA_VERSION < 9000 if (!(aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous())) - grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); + grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); #endif HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims); } else { @@ -594,11 +595,11 @@ bool CUDA_tensor_apply3(at::Tensor a, scalar2, scalar3, uint64_t, 1, 1, 1> - <<>>( + <<>>( aInfo, bInfo, cInfo, (uint64_t) totalElements, op); } else { #if CUDA_VERSION < 9000 - grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); + grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); #endif kernelPointwiseApply3 - <<>>( + <<>>( aInfo, bInfo, cInfo, (uint64_t) totalElements, op); } } @@ -730,7 +731,7 @@ bool CUDA_tensor_apply4(at::Tensor a, scalar3, \ scalar4, \ TYPE, A, B, C, D> \ - <<>>( \ + <<>>( \ aInfo, bInfo, cInfo, dInfo, (TYPE) totalElements, op); #define HANDLE_D_CASE(TYPE, A, B, C, D) { \ @@ -813,7 +814,7 @@ bool CUDA_tensor_apply4(at::Tensor a, #if CUDA_VERSION < 9000 if (!(aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous() && dInfo.isContiguous())) - grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); + grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); #endif HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims, dInfo.dims); } else { @@ -846,11 +847,11 @@ bool CUDA_tensor_apply4(at::Tensor a, scalar3, scalar4, uint64_t, 1, 1, 1, 1> - <<>>( + <<>>( aInfo, bInfo, 
cInfo, dInfo, (uint64_t) totalElements, op); } else { #if CUDA_VERSION < 9000 - grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); + grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); #endif kernelPointwiseApply4 - <<>>( + <<>>( aInfo, bInfo, cInfo, dInfo, (uint64_t) totalElements, op); } } diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp new file mode 100644 index 0000000000000..794cbb44ddd81 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAContext.cpp @@ -0,0 +1,76 @@ +#include "ATen/cuda/CUDAContext.h" +#include "THC/THCGeneral.h" + +namespace at { namespace cuda { + +/* Device info */ +int64_t getNumGPUs() { + int count; + AT_CUDA_CHECK(cudaGetDeviceCount(&count)); + return count; +} + +int64_t current_device() { + int cur_device; + AT_CUDA_CHECK(cudaGetDevice(&cur_device)); + return cur_device; +} + +cudaDeviceProp* getCurrentDeviceProperties() { + return THCState_getCurrentDeviceProperties(at::globalContext().getTHCState()); +} + +cudaDeviceProp* getDeviceProperties(int64_t device) { + return THCState_getDeviceProperties(at::globalContext().getTHCState(), (int)device); +} + +/* Streams */ +CUDAStream createCUDAStream() { + return detail::CUDAStream_createAndRetainWithOptions( + CUDAStream::DEFAULT_FLAGS + , CUDAStream::DEFAULT_PRIORITY + ); +} + +CUDAStream createCUDAStreamWithOptions(int32_t flags, int32_t priority) { + return detail::CUDAStream_createAndRetainWithOptions(flags, priority); +} + +CUDAStream getDefaultCUDAStream() { + return detail::CUDAStream_getDefaultStream(); +} + +CUDAStream getDefaultCUDAStreamOnDevice(int64_t device) { + return detail::CUDAStream_getDefaultStreamOnDevice(device); +} + +CUDAStream getCurrentCUDAStream() { + return detail::CUDAStream_getAndRetainCurrentStream(); +} + +CUDAStream getCurrentCUDAStreamOnDevice(int64_t device) { + return detail::CUDAStream_getAndRetainCurrentStreamOnDevice(device); +} + +void setCurrentCUDAStream(CUDAStream stream) { + return detail::CUDAStream_setStream(stream.internals()); +} + +void setCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream) { + return detail::CUDAStream_setStreamOnDevice(device, stream.internals()); +} + +void uncheckedSetCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream) { + return detail::CUDAStream_uncheckedSetStreamOnDevice(device, stream.internals()); +} + +/* Handles */ +#ifndef __HIP_PLATFORM_HCC__ + cusparseHandle_t getCurrentCUDASparseHandle() { + return THCState_getCurrentSparseHandle(at::globalContext().getTHCState()); + } +#endif + +} // namespace cuda + +} // namespace at \ No newline at end of file diff --git a/aten/src/ATen/cuda/CUDAContext.h b/aten/src/ATen/cuda/CUDAContext.h new file mode 100644 index 0000000000000..b8fbe67b1ce20 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAContext.h @@ -0,0 +1,70 @@ +#pragma once + +#include "ATen/Context.h" +#include "ATen/cuda/CUDAStream.h" +#include "ATen/cuda/Exceptions.h" +#include "ATen/ATenGeneral.h" + +#include + +#include "cuda_runtime_api.h" +#include "cusparse.h" + +namespace at { namespace cuda { + +/* +A common CUDA interface for ATen. + +This interface is distinct from CUDAHooks, which defines an interface that links +to both CPU-only and CUDA builds. That interface is intended for runtime dispatch +and should be used from files that are included in both CPU-only and CUDA builds. 
+ +CUDAContext, on the other hand, should be preferred by files only included in +CUDA builds. It is intended to expose CUDA functionality in a consistent +manner. + +This means there is some overlap between the CUDAContext and CUDAHooks, but +the choice of which to use is simple: use CUDAContext when in a CUDA-only file, +use CUDAHooks otherwise. + +Note that CUDAContext simply defines an interface with no associated class. +It is expected that the modules whose functions compose this interface will +manage their own state. There is only a single CUDA context/state. +*/ + +/* Device info */ +AT_API int64_t getNumGPUs(); + +AT_API int64_t current_device(); + +AT_API cudaDeviceProp* getCurrentDeviceProperties(); + +AT_API cudaDeviceProp* getDeviceProperties(int64_t device); + +/* Streams */ +AT_API CUDAStream createCUDAStream(); + +AT_API CUDAStream createCUDAStreamWithOptions(int32_t flags, int32_t priority); + +AT_API CUDAStream getDefaultCUDAStream(); + +AT_API CUDAStream getDefaultCUDAStreamOnDevice(int64_t device); + +AT_API CUDAStream getCurrentCUDAStream(); + +AT_API CUDAStream getCurrentCUDAStreamOnDevice(int64_t device); + +AT_API void setCurrentCUDAStream(CUDAStream stream); + +AT_API void setCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream); + +AT_API void uncheckedSetCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream); + +/* Handles */ +#ifndef __HIP_PLATFORM_HCC__ + AT_API cusparseHandle_t getCurrentCUDASparseHandle(); +#endif + + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/CUDAGuard.h b/aten/src/ATen/cuda/CUDAGuard.h new file mode 100644 index 0000000000000..db006625221cf --- /dev/null +++ b/aten/src/ATen/cuda/CUDAGuard.h @@ -0,0 +1,108 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace at { namespace cuda { + +/// A variant of `DeviceGuard` that augments it with an understanding of CUDA +/// streams. This guard can not only set and reset the current CUDA device, but +/// also set and reset the current CUDA stream. It is important to note that +/// because a CUDA stream is intrinsically associated with the CUDA device to +/// which it is bound, setting the CUDA stream *also* sets the current CUDA +/// device to that of the stream. +struct CUDAGuard { + /// Default constructor, does nothing and causes no change in the current + /// stream or device until `set_stream` or `set_device` is called. + CUDAGuard() = default; + + /// Sets the CUDA stream and its associated device as the current one (calls + /// `set_stream`). + explicit CUDAGuard(const CUDAStream& stream) { + set_stream(stream); + } + + /// Calls `set_device` with the given index. + explicit CUDAGuard(int32_t device) { + set_device(device); + } + + CUDAGuard(const CUDAGuard&) = delete; + CUDAGuard& operator=(const CUDAGuard&) = delete; + + /// Move-constructs this `CUDAGuard` from another `CUDAGuard`. The + /// moved-from `CUDAGuard` is modified such that its destruction has no + /// effect (does not reset the stream or device). + CUDAGuard(CUDAGuard&& other) noexcept = default; + + /// Move-assigns this `CUDAGuard` from another `CUDAGuard`. The + /// moved-from `CUDAGuard` is modified such that its destruction has no + /// effect (does not reset the stream or device). 
+ CUDAGuard& operator=(CUDAGuard&& other) { + device_guard_ = std::move(other.device_guard_); + original_streams_ = std::move(other.original_streams_); + other.original_streams_.clear(); + return *this; + } + + /// Resets the CUDA stream on each device to the one that was active upon + /// construction. + ~CUDAGuard() { + if (!original_streams_.empty()) { + for (size_t device = 0; device < original_streams_.size(); ++device) { + uncheckedSetCurrentCUDAStreamOnDevice(device, original_streams_[device]); + } + } + } + + /// Sets the current CUDA device to the device associated with the given + /// stream, and then sets the current stream on that device to the one given. + void set_stream(const CUDAStream& stream) { + device_guard_.set_index(stream.device()); + // If we haven't stored the current stream yet, store it now. + if (original_streams_.empty()) { + const size_t device_count = getNumGPUs(); + original_streams_.reserve(device_count); + for (size_t device = 0; device < device_count; ++device) { + original_streams_.push_back(getCurrentCUDAStreamOnDevice(device)); + } + } + setCurrentCUDAStreamOnDevice(device_guard_.last_index(), stream); + } + + /// Sets the CUDA device to the given one. + void set_device(int32_t device) { + device_guard_.set_index(device); + } + + /// Returns the CUDA streams that were active in the first call to + /// `set_stream`. If there was no such call, the returned container is + /// empty. + ArrayRef original_streams() const noexcept { + return original_streams_; + } + + /// Returns the device that was set upon construction of the guard. + int32_t original_device() const noexcept { + return device_guard_.original_index(); + } + + /// Returns the last device that was set via `set_device`, if any. + int32_t last_device() const noexcept { + return device_guard_.last_index(); + } + + private: + /// The guard for the current device. + at::DeviceGuard device_guard_; + /// The original streams that were active on all devices. + std::vector original_streams_; +}; + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/CUDAStream.cpp b/aten/src/ATen/cuda/CUDAStream.cpp new file mode 100644 index 0000000000000..4fb945a9920f8 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAStream.cpp @@ -0,0 +1,198 @@ +#include "ATen/cuda/CUDAStream.h" +#include "ATen/cuda/CUDAContext.h" +#include "ATen/cuda/Exceptions.h" +#include "ATen/Error.h" + +#include +#include + +// Internal implementation is entirely hidden +struct CUDAStreamInternals { + bool is_destructible; + std::atomic refcount; + int64_t device; // Note: cudaGetDevice works with int32_t, not int64_t + cudaStream_t stream; +}; + +namespace at { +namespace cuda { + +namespace detail { + + /* + * Stream state + */ + static constexpr cudaStream_t DEFAULT_STREAM = 0; + + static std::once_flag init_flag; + static int64_t num_gpus; + static CUDAStreamInternals* default_streams; + static thread_local CUDAStreamInternals** current_streams = nullptr; + + // Creates a(n indestructible) default stream for each device + // Note: the default stream on each device is signified by a zero + // value for the pointer, and so is not actually created as usual. + // In particular, we don't need to switch devices when creating the + // streams. 
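// ----------------------------------------------------------------------------
// [Editor's sketch -- not part of the patch] A minimal illustration of how the
// CUDAGuard defined above and the new at::cuda:: free functions are meant to be
// used together; these free functions replace the former
// at::globalContext().getCurrentCUDAStream()-style accessors seen elsewhere in
// this diff. The function name and the "work" inside the guarded scope are
// hypothetical placeholders.
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/CUDAGuard.h"

static void run_on_side_stream_sketch() {
  // A new, non-default stream bound to the current device.
  at::cuda::CUDAStream side_stream = at::cuda::createCUDAStream();
  {
    // Entering the guard makes side_stream's device current and installs
    // side_stream as the current stream on that device.
    at::cuda::CUDAGuard guard(side_stream);
    // Work issued via at::cuda::getCurrentCUDAStream() now runs on side_stream;
    // device properties can be queried the same way, e.g.:
    cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
    (void)prop;  // e.g. inspect prop->multiProcessorCount before launching work
  }
  // Leaving the scope restores the streams (and the device) that were current
  // when the guard first changed a stream.
}
// ----------------------------------------------------------------------------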
+ static void initDefaultCUDAStreams() { + num_gpus = getNumGPUs(); + default_streams = (CUDAStreamInternals*) malloc(num_gpus * sizeof(CUDAStreamInternals)); + for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { + default_streams[i].is_destructible = false; + default_streams[i].refcount = 0; + default_streams[i].device = i; + default_streams[i].stream = DEFAULT_STREAM; + } + } + + // Init front-end to ensure initialization only occurs once + static void initCUDAStreamsOnce() { + // Inits default streams (once, globally) + std::call_once(init_flag, initDefaultCUDAStreams); + + // Inits current streams (thread local) to default streams + if (current_streams) return; + current_streams = (CUDAStreamInternals**) malloc(num_gpus * sizeof(CUDAStreamInternals*)); + for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { + current_streams[i] = &default_streams[i]; + } + } + + /* + * Pointer-based stream API + */ + + // Helper to verify the GPU index is valid + static inline void check_gpu(int64_t device) { + AT_ASSERT(device >= 0 && device < num_gpus); + } + + CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device) { + initCUDAStreamsOnce(); + check_gpu(device); + return &default_streams[device]; + } + CUDAStreamInternals* CUDAStream_getDefaultStream() { + return CUDAStream_getDefaultStreamOnDevice(current_device()); + } + + // Creates (and retains) and new cuda stream + CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority) { + CUDAStreamInternals* internals = (CUDAStreamInternals*) malloc(sizeof(CUDAStreamInternals)); + internals->is_destructible = true; + internals->refcount = 1; + internals->device = current_device(); + #ifndef __HIP_PLATFORM_HCC__ + AT_CUDA_CHECK(cudaStreamCreateWithPriority(&internals->stream, flags, priority)); + #else + AT_CUDA_CHECK(cudaStreamCreateWithFlags(&internals->stream, flags)); + #endif // __HIP_PLATFORM_HCC__ + return internals; + } + + // Note: despite not being "unsafe," is using these methods in a multithreaded + // environment then the caller must be sure that streams are valid + // when they're requested. These methods will throw an error if an + // invalid stream is requested. + CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device) { + initCUDAStreamsOnce(); + check_gpu(device); + auto cur = current_streams[device]; + AT_ASSERT(CUDAStream_retain(cur)); + return cur; + } + CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream() { + return CUDAStream_getAndRetainCurrentStreamOnDevice(current_device()); + } + + // Note: these unsafe methods do not retain the stream before returning it. + // This is unsafe behavior and these methods SHOULD NOT BE USED. + // They are here only for legacy compatibility. 
+ CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device) { + initCUDAStreamsOnce(); + check_gpu(device); + return current_streams[device]; + } + CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe() { + return CUDAStream_getCurrentStreamOnDeviceUnsafe(current_device()); + } + + void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { + initCUDAStreamsOnce(); + check_gpu(device); + AT_ASSERT(ptr); + AT_ASSERT(ptr->device == device); + AT_ASSERT(CUDAStream_retain(ptr)); + + CUDAStream_free(current_streams[device]); + current_streams[device] = ptr; + } + + void CUDAStream_uncheckedSetStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { + initCUDAStreamsOnce(); + CUDAStream_uncheckedFree(current_streams[device]); + current_streams[device] = ptr; + } + + void CUDAStream_setStream(CUDAStreamInternals* ptr) { + CUDAStream_setStreamOnDevice(current_device(), ptr); + } + + // Getters + cudaStream_t CUDAStream_stream(CUDAStreamInternals* ptr) { + AT_ASSERT(ptr); + return ptr->stream; + } + + int64_t CUDAStream_device(CUDAStreamInternals* ptr) { + AT_ASSERT(ptr); + return ptr->device; + } + + // Memory management + // Note: only destructible (non-default) streams are ref counted + bool CUDAStream_retain(CUDAStreamInternals* ptr) { + AT_ASSERT(ptr); + if (ptr->is_destructible) return(++ptr->refcount > 1); + return true; + } + + void CUDAStream_free(CUDAStreamInternals*& ptr) { + if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) { + AT_ASSERT(ptr->refcount == 0); + AT_CUDA_CHECK(cudaStreamDestroy(ptr->stream)); + free(ptr); + ptr = nullptr; + } + } + void CUDAStream_uncheckedFree(CUDAStreamInternals*& ptr) { + if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) { + cudaStreamDestroy(ptr->stream); + free(ptr); + ptr = nullptr; + } + } + +} // namespace detail + + /* + * CUDAStream functions + */ + + // Copy constructor + CUDAStream::CUDAStream(const CUDAStream& other) { + AT_ASSERT(other.internals_); + AT_ASSERT(detail::CUDAStream_retain(other.internals_)); + + internals_ = other.internals_; + } + + // Move constructor + CUDAStream::CUDAStream(CUDAStream&& other) { + AT_ASSERT(other.internals_); + + std::swap(internals_, other.internals_); + } + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/CUDAStream.h b/aten/src/ATen/cuda/CUDAStream.h new file mode 100644 index 0000000000000..72fc752b717a4 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAStream.h @@ -0,0 +1,108 @@ +#pragma once + +#include +#include + +#include "cuda_runtime_api.h" + +/* +* A CUDA stream interface with no CUDA build dependency. +* +* Includes the CUDAStream RAII class and a pointer-based stream API. +* +* The ATen Context interface should be preferred when working with streams. +*/ + +// Forward-declares internals +struct CUDAStreamInternals; + +namespace at { +namespace cuda { + +namespace detail { + +// Pointer-based API (for internal use) +// Note: ATen/Context is preferred to work with streams safely +CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device); +CUDAStreamInternals* CUDAStream_getDefaultStream(); + +CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority); + +CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device); +CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream(); + +// Note: these Unsafe gets should NEVER be used and are only here for legacy +// purposes. Once those uses are gone they should be removed. 
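// ----------------------------------------------------------------------------
// [Editor's sketch -- not part of the patch] The refcounting contract of the
// pointer-based detail API declared above, as also exercised by the stream
// tests later in this diff: a created stream starts with a refcount of one, and
// CUDAStream_free() both releases it and nulls the pointer. The function name
// is hypothetical; the RAII CUDAStream class below is the preferred interface.
#include "ATen/cuda/CUDAStream.h"

inline void stream_refcount_sketch() {
  CUDAStreamInternals* internals =
      at::cuda::detail::CUDAStream_createAndRetainWithOptions(
          /*flags=*/cudaStreamNonBlocking, /*priority=*/0);
  cudaStream_t raw = at::cuda::detail::CUDAStream_stream(internals);
  (void)raw;  // e.g. hand the raw cudaStream_t to legacy THC code
  at::cuda::detail::CUDAStream_free(internals);  // frees and nulls the pointer
}
// ----------------------------------------------------------------------------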
+CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device); +CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe(); + +void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* internals); +void CUDAStream_uncheckedSetStreamOnDevice( + int64_t device, + CUDAStreamInternals* internals); +void CUDAStream_setStream(CUDAStreamInternals* internals); + +cudaStream_t CUDAStream_stream(CUDAStreamInternals*); +int64_t CUDAStream_device(CUDAStreamInternals*); + +bool CUDAStream_retain(CUDAStreamInternals*); +void CUDAStream_free(CUDAStreamInternals*&); +void CUDAStream_uncheckedFree(CUDAStreamInternals*&); + +} // namespace detail + +// RAII for a CUDA stream +// Allows use as a cudaStream_t, copying, moving, and metadata access. +struct CUDAStream { + // Constants + static constexpr int32_t DEFAULT_FLAGS = cudaStreamNonBlocking; + static constexpr int32_t DEFAULT_PRIORITY = 0; + + // Constructors + CUDAStream() = default; + /* implicit */ CUDAStream(CUDAStreamInternals* internals, bool retain = false) + : internals_{internals} { + if (retain) { + detail::CUDAStream_retain(internals_); + } + } + + // Destructor + ~CUDAStream() { detail::CUDAStream_uncheckedFree(internals_); } + + // Copy constructor + CUDAStream(const CUDAStream& other); + + // Move constructor + CUDAStream(CUDAStream&& other); + + // Assignment operator + CUDAStream& operator=(CUDAStream other) noexcept { + std::swap(internals_, other.internals_); + return *this; + } + + // Returns true if the CUDAStream is not null. + explicit operator bool() const noexcept { + return internals_ != nullptr; + } + + // Implicit conversion to cudaStream_t + operator cudaStream_t() const { return detail::CUDAStream_stream(internals_); } + + // Less than operator (to allow use in sets) + friend bool operator<(const CUDAStream& left, const CUDAStream& right) { + return left.internals_ < right.internals_; + } + + // Getters + int64_t device() const { return detail::CUDAStream_device(internals_); } + cudaStream_t stream() const { return detail::CUDAStream_stream(internals_); } + CUDAStreamInternals* internals() const { return internals_; } + +private: + CUDAStreamInternals* internals_ = nullptr; +}; + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/Exceptions.h b/aten/src/ATen/cuda/Exceptions.h new file mode 100644 index 0000000000000..dbacf35079272 --- /dev/null +++ b/aten/src/ATen/cuda/Exceptions.h @@ -0,0 +1,20 @@ +#pragma once + +#include "ATen/Error.h" + +#define AT_CUDNN_CHECK(STATUS) \ + if (STATUS != CUDNN_STATUS_SUCCESS) { \ + if (STATUS == CUDNN_STATUS_NOT_SUPPORTED) { \ + AT_ERROR( \ + "CuDNN error: ", \ + cudnnGetErrorString(STATUS), \ + ". 
This error may appear if you passed in a non-contiguous input."); \ + } else { \ + AT_ERROR("CuDNN error: ", cudnnGetErrorString(STATUS)); \ + } \ + } + +#define AT_CUDA_CHECK(STATUS) \ + if (STATUS != cudaSuccess) { \ + AT_ERROR("CUDA error: ", cudaGetErrorString(STATUS)); \ + } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 5872764a905ce..09da44f342b69 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -49,37 +49,12 @@ void unchecked_set_device(int32_t device) { (void)return_code; } -void cuda_stream_create_with_priority( - cudaStream_t* pStream -, int32_t flags -, int32_t priority) { -#ifndef __HIP_PLATFORM_HCC__ - check_status(cudaStreamCreateWithPriority(pStream, flags, priority)); -#else - check_status(cudaStreamCreateWithFlags(pStream, flags)); -#endif -} - -void cuda_stream_destroy(cudaStream_t stream) { - check_status(cudaStreamDestroy(stream)); -} - -void unchecked_cuda_stream_destroy(cudaStream_t stream) { - const auto return_code = cudaStreamDestroy(stream); - (void)return_code; -} - struct DynamicCUDAInterfaceSetter { DynamicCUDAInterfaceSetter() { using at::detail::DynamicCUDAInterface; DynamicCUDAInterface::set_device = set_device; DynamicCUDAInterface::get_device = get_device; DynamicCUDAInterface::unchecked_set_device = unchecked_set_device; - DynamicCUDAInterface::cuda_stream_create_with_priority = - cuda_stream_create_with_priority; - DynamicCUDAInterface::cuda_stream_destroy = cuda_stream_destroy; - DynamicCUDAInterface::unchecked_cuda_stream_destroy = - unchecked_cuda_stream_destroy; } }; @@ -123,21 +98,6 @@ bool CUDAHooks::hasCuDNN() const { return AT_CUDNN_ENABLED(); } -#ifndef __HIP_PLATFORM_HCC__ -cusparseHandle_t CUDAHooks::getCurrentCUDASparseHandle(THCState* thc_state) const { - return THCState_getCurrentSparseHandle(thc_state); -} -#endif -struct cudaDeviceProp* CUDAHooks::getCurrentDeviceProperties( - THCState* thc_state) const { - return THCState_getCurrentDeviceProperties(thc_state); -} -struct cudaDeviceProp* CUDAHooks::getDeviceProperties( - THCState* thc_state, - int device) const { - return THCState_getDeviceProperties(thc_state, device); -} - int64_t CUDAHooks::current_device() const { int device; cudaError_t err = cudaGetDevice(&device); @@ -162,7 +122,7 @@ bool CUDAHooks::compiledWithCuDNN() const { bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { #if AT_CUDNN_ENABLED() cudaDeviceProp* prop = - getCurrentDeviceProperties(globalContext().getTHCState()); + THCState_getCurrentDeviceProperties(globalContext().getTHCState()); // NOTE: extra parenthesis around numbers disable clang warnings about // dead code return ( diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index d88ac0d334d5d..eae1a802a5cb0 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -14,11 +14,6 @@ struct CUDAHooks : public at::CUDAHooksInterface { std::unique_ptr initCUDAGenerator(Context*) const override; bool hasCUDA() const override; bool hasCuDNN() const override; -#ifndef __HIP_PLATFORM_HCC__ - cusparseHandle_t getCurrentCUDASparseHandle(THCState*) const override; -#endif - struct cudaDeviceProp* getCurrentDeviceProperties(THCState*) const override; - struct cudaDeviceProp* getDeviceProperties(THCState*, int device) const override; int64_t current_device() const override; Allocator* getPinnedMemoryAllocator() const override; void registerCUDATypes(Context*) const override; diff 
--git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 2bf7f0ae9bc42..085f2723bf045 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -1,6 +1,7 @@ #pragma once -#include "Exceptions.h" +#include "ATen/cuda/CUDAContext.h" +#include "ATen/cuda/Exceptions.h" #include "cudnn-wrapper.h" #include @@ -304,7 +305,7 @@ struct AT_CUDA_API RNNDescriptor CUDNN_RNN_ALGO_STANDARD, datatype)); #if CUDNN_VERSION >= 7000 && CUDA_VERSION >= 9000 - cudaDeviceProp* prop = globalContext().getCurrentDeviceProperties(); + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major >= 7) { if (datatype == CUDNN_DATA_HALF) { cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_TENSOR_OP_MATH); diff --git a/aten/src/ATen/cudnn/Exceptions.h b/aten/src/ATen/cudnn/Exceptions.h index b59127e64a690..e69de29bb2d1d 100644 --- a/aten/src/ATen/cudnn/Exceptions.h +++ b/aten/src/ATen/cudnn/Exceptions.h @@ -1,17 +0,0 @@ -#pragma once -#include -#define AT_CUDNN_CHECK(STATUS) \ - if (STATUS != CUDNN_STATUS_SUCCESS) { \ - if (STATUS == CUDNN_STATUS_NOT_SUPPORTED) { \ - AT_ERROR( \ - "CuDNN error: ", \ - cudnnGetErrorString(STATUS), \ - ". This error may appear if you passed in a non-contiguous input."); \ - } else { \ - AT_ERROR("CuDNN error: ", cudnnGetErrorString(STATUS)); \ - } \ - } -#define AT_CUDA_CHECK(STATUS) \ - if (STATUS != cudaSuccess) { \ - AT_ERROR("CUDA error: ", cudaGetErrorString(STATUS)); \ - } diff --git a/aten/src/ATen/cudnn/Handles.cpp b/aten/src/ATen/cudnn/Handles.cpp index 7aae86d18b0b8..4848d2aca2412 100644 --- a/aten/src/ATen/cudnn/Handles.cpp +++ b/aten/src/ATen/cudnn/Handles.cpp @@ -1,6 +1,6 @@ #include "Handles.h" -#include "Exceptions.h" +#include "ATen/cuda/Exceptions.h" #include #include diff --git a/aten/src/ATen/cudnn/Utils.h b/aten/src/ATen/cudnn/Utils.h index c2e5dcbce70e9..264bf580f8a4d 100644 --- a/aten/src/ATen/cudnn/Utils.h +++ b/aten/src/ATen/cudnn/Utils.h @@ -1,6 +1,7 @@ #pragma once #include +#include "ATen/cuda/Exceptions.h" #include "THC/THC.h" #include "cudnn-wrapper.h" #include "Handles.h" diff --git a/aten/src/ATen/detail/CUDAHooksInterface.cpp b/aten/src/ATen/detail/CUDAHooksInterface.cpp index 288b066feafeb..b6c31bed6eda4 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.cpp +++ b/aten/src/ATen/detail/CUDAHooksInterface.cpp @@ -27,37 +27,11 @@ void default_unchecked_set_device(int32_t) { "before CUDA library was loaded"); } -void default_cuda_stream_create_with_priority(cudaStream_t*, int32_t, int32_t) { - AT_ERROR( - "DynamicCUDAInterface::cuda_stream_create_with_priority called " - "before CUDA library was loaded"); -} - -void default_cuda_stream_destroy(cudaStream_t) { - AT_ERROR( - "DynamicCUDAInterface::cuda_stream_destroy called " - "before CUDA library was loaded"); -} - -void default_unchecked_cuda_stream_destroy(cudaStream_t) { - AT_ERROR( - "DynamicCUDAInterface::unchecked_cuda_stream_destroy called " - "before CUDA library was loaded"); -} - // Default the static members of DynamicCUDAInterface. 
void (*DynamicCUDAInterface::set_device)(int32_t) = default_set_device; void (*DynamicCUDAInterface::get_device)(int32_t*) = default_get_device; void (*DynamicCUDAInterface::unchecked_set_device)(int32_t) = default_unchecked_set_device; -void (*DynamicCUDAInterface::cuda_stream_create_with_priority)( - cudaStream_t*, - int32_t, - int32_t) = default_cuda_stream_create_with_priority; -void (*DynamicCUDAInterface::cuda_stream_destroy)(cudaStream_t) = - default_cuda_stream_destroy; -void (*DynamicCUDAInterface::unchecked_cuda_stream_destroy)(cudaStream_t) = - default_unchecked_cuda_stream_destroy; const CUDAHooksInterface& getCUDAHooks() { static std::unique_ptr cuda_hooks; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index f0596d01949d8..401cbc76e5471 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -9,28 +9,8 @@ #include #include -// Forward declare these CUDA types here to avoid including CUDA headers in -// ATen headers, which would make ATen always require CUDA to build. +// Forward-declares THCState struct THCState; -struct cudaDeviceProp; -struct CUstream_st; -typedef struct CUstream_st* cudaStream_t; - -#ifndef __HIP_PLATFORM_HCC__ -// pyHIPIFY rewrites this as: -// -// struct cusparseContext; -// typedef struct cusparseContext *hipsparseHandle_t; -// -// however, this forward declaration is wrong -// the way that the HIP headers define hipsparseHandle_t is -// -// typedef cusparseHandle_t hipsparseHandle_t -// -// so the rewrite is wrong. -struct cusparseContext; -typedef struct cusparseContext *cusparseHandle_t; -#endif namespace at { class Context; @@ -89,21 +69,6 @@ struct AT_API CUDAHooksInterface { return false; } -#ifndef __HIP_PLATFORM_HCC__ - virtual cusparseHandle_t getCurrentCUDASparseHandle(THCState*) const { - AT_ERROR("Cannot getCurrentCUDASparseHandle() without ATen_cuda library. ", CUDA_HELP); - } -#endif - - virtual struct cudaDeviceProp* getCurrentDeviceProperties(THCState*) const { - AT_ERROR("Cannot getCurrentDeviceProperties() without ATen_cuda library. ", CUDA_HELP); - } - - virtual struct cudaDeviceProp* getDeviceProperties(THCState*, int device) - const { - AT_ERROR("Cannot getDeviceProperties() without ATen_cuda library. ", CUDA_HELP); - } - virtual int64_t current_device() const { return -1; } @@ -175,9 +140,6 @@ struct AT_API DynamicCUDAInterface { static void (*set_device)(int32_t); static void (*get_device)(int32_t*); static void (*unchecked_set_device)(int32_t); - static void (*cuda_stream_create_with_priority)(cudaStream_t*, int32_t, int32_t); - static void (*cuda_stream_destroy)(cudaStream_t); - static void (*unchecked_cuda_stream_destroy)(cudaStream_t); }; } // namespace detail } // namespace at diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index 49c56cbf6e02c..fdc95e546005c 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -1,4 +1,5 @@ #include "ATen/ATen.h" +#include "ATen/cuda/CUDAContext.h" #include "ATen/Config.h" #include "ATen/native/cuda/CuFFTUtils.h" #include "ATen/native/utils/ParamsHash.h" @@ -105,7 +106,7 @@ class CuFFTConfig { // contiguous, only need to check real-to-complex case. 
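// ----------------------------------------------------------------------------
// [Editor's sketch -- not part of the patch] The gating pattern used by the
// cuFFT half-precision check below and the cuDNN RNN TensorOp check earlier in
// this diff: query at::cuda::getCurrentDeviceProperties() and branch on the
// device's compute capability. The helper name is hypothetical; SM_53 is the
// half-precision cuFFT requirement cited in the surrounding code.
#include "ATen/cuda/CUDAContext.h"

static inline bool device_supports_half_fft_sketch() {
  cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
  return prop->major > 5 || (prop->major == 5 && prop->minor >= 3);  // >= SM_53
}
// ----------------------------------------------------------------------------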
if (input.type().scalarType() == ScalarType::Half) { // cuFFT on half requires compute capability of at least SM_53 - auto dev_prop = at::globalContext().getCurrentDeviceProperties(); + auto dev_prop = at::cuda::getCurrentDeviceProperties(); if (dev_prop->major < 5 || (dev_prop->major == 5 && dev_prop->minor < 3)) { std::ostringstream ss; ss << "cuFFT doesn't support signals of half type with compute " diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index affe20d71c791..e3625762d6c25 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -1,7 +1,7 @@ #include "ATen/ATen.h" #include "ATen/TensorUtils.h" #include "ATen/Error.h" - +#include "ATen/cuda/CUDAContext.h" #include "ATen/AccumulateType.h" #include @@ -217,7 +217,7 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice auto grad_weight = at::zeros({num_weights, grad_.size(-1)}, grad_.options()); int64_t stride = grad_weight.stride(0); - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); if (num_indices <= 768 && !scale_grad_by_freq) { auto indices_contig = indices.contiguous(); @@ -332,7 +332,7 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, checkDim("embedding_renorm_", self_arg, 2); checkSameGPU("embedding_renorm", self_arg, indices_arg); - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); auto policy = thrust::cuda::par(allocator).on(stream); diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 9169cb0375c55..4f989e3db5c56 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -1,4 +1,5 @@ #include "ATen/ATen.h" +#include "ATen/cuda/CUDAContext.h" #include "ATen/TensorUtils.h" #include "ATen/NativeFunctions.h" @@ -176,7 +177,7 @@ Tensor embedding_bag_backward_cuda_sum_avg( auto grad_weight = at::zeros({num_weights, grad.size(1)}, grad.type()); - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); ptrdiff_t numel = indices.numel(); int64_t stride = grad_weight.stride(0); @@ -283,7 +284,7 @@ Tensor embedding_bag_backward_cuda_max(const Tensor &grad, int64_t numBags = grad.size(0); - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 block = dim3(32, 8); int grid = 1024; @@ -323,7 +324,7 @@ _embedding_bag_cuda(const Tensor &weight, const Tensor &indices, auto offset2bag = at::zeros({indices.size(0)}, indices.options()); // offset2bag = [0 0 0 0 0] - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto output = at::zeros({offsets.size(0), weight.size(1)}, weight.options()); diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu index c31d5572bf8a4..cdf0f2f931a05 100644 --- a/aten/src/ATen/native/cuda/Gesv.cu +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -1,4 +1,5 @@ #include "ATen/Context.h" +#include "ATen/cuda/CUDAContext.h" #include "ATen/Dispatch.h" #include "ATen/NativeFunctions.h" #include "ATen/cuda/PinnedMemoryAllocator.h" @@ -51,7 +52,7 @@ static magma_queue_t createMagmaQueue(const Tensor& tensor) { magma_queue_t magma_queue; 
magma_queue_create_from_cuda( tensor.get_device(), - context.getCurrentCUDAStream(), + at::cuda::getCurrentCUDAStream(), THCState_getCurrentBlasHandle(context.getTHCState()), THCState_getCurrentSparseHandle(context.getTHCState()), &magma_queue); diff --git a/aten/src/ATen/native/cuda/RoiPooling.cu b/aten/src/ATen/native/cuda/RoiPooling.cu index 63f1f26a979f9..ef2fba982ad10 100644 --- a/aten/src/ATen/native/cuda/RoiPooling.cu +++ b/aten/src/ATen/native/cuda/RoiPooling.cu @@ -1,6 +1,7 @@ #include "ATen/ATen.h" #include "ATen/NativeFunctions.h" #include "ATen/Error.h" +#include "ATen/cuda/CUDAContext.h" #include #include @@ -134,7 +135,7 @@ std::tuple RoiPooling2d_forward_cuda( dim3 block(512); dim3 grid((output.numel() + 512 - 1) / 512); - RoiPooling2d_forward_kernel<<>>( + RoiPooling2d_forward_kernel<<>>( output.numel(), input.data(), rois.data(), static_cast(spatialScale), inputChannels, inputHeight, inputWidth, pooledHeight, pooledWidth, output.data(), argmaxes.data()); AT_CHECK(cudaGetLastError() == cudaSuccess, "RoiPooling2d_forward_kernel failed with error code ", cudaGetLastError()); @@ -201,7 +202,7 @@ Tensor RoiPooling2d_backward_cuda( dim3 block(512); dim3 grid((gradInput.numel() + 512 - 1) / 512); - RoiPooling2d_backward_kernel<<>>( + RoiPooling2d_backward_kernel<<>>( gradOutput.numel(), gradOutput.data(), argmaxes.data(), proposals, static_cast(spatialScale), inputChannels, inputHeight, inputWidth, pooledHeight, pooledWidth, gradInput.data(), rois.data()); diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index 0ee5d18d1e2bd..8599d17611bd7 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -1,4 +1,5 @@ #include "ATen/ATen.h" +#include "ATen/cuda/CUDAContext.h" #include "ATen/TensorUtils.h" #include "ATen/NativeFunctions.h" #include "ATen/WrapDimUtils.h" @@ -124,7 +125,7 @@ void SpatialSoftMax_getLaunchSizes( cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, k, block_threads, smem_size); #endif - max_active_blocks *= at::globalContext().getCurrentDeviceProperties()->multiProcessorCount; + max_active_blocks *= at::cuda::getCurrentDeviceProperties()->multiProcessorCount; grid = SpatialSoftMax_getGridSize(block, max_active_blocks, outer_size, dim_size, inner_size); } @@ -483,7 +484,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_){ int64_t outer_size = 1; int64_t dim_size = input.size(dim); int64_t inner_size = 1; - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); for (int64_t i = 0; i < dim; ++i) outer_size *= input.size(i); for (int64_t i = dim + 1; i < input.dim(); ++i) @@ -540,7 +541,7 @@ Tensor host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t for (int64_t i = dim + 1; i < output.dim(); ++i) inner_size *= output.size(i); // See descriptions of kernels above. 
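// ----------------------------------------------------------------------------
// [Editor's sketch -- not part of the patch] The "<<>>" launch sites in this
// copy of the diff appear to have lost their launch configuration between the
// angle brackets. The intended CUDA form is
//   kernel<<<grid, block, sharedMemBytes, stream>>>(args...);
// with the stream now obtained from at::cuda::getCurrentCUDAStream(). Below is
// a hypothetical, self-contained example of that pattern; the kernel, helper
// name, and block size of 512 are illustrative only.
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/Exceptions.h"

__global__ void scale_kernel_sketch(float* data, float alpha, int64_t n) {
  int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
  if (i < n) data[i] *= alpha;
}

static void launch_scale_sketch(float* data, float alpha, int64_t n) {
  dim3 block(512);
  dim3 grid(static_cast<unsigned int>((n + block.x - 1) / block.x));
  // CUDAStream converts implicitly to cudaStream_t in the launch config.
  scale_kernel_sketch<<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
      data, alpha, n);
  AT_CUDA_CHECK(cudaGetLastError());  // surface launch errors via AT_ERROR
}
// ----------------------------------------------------------------------------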
- cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); if (inner_size == 1) { const int ILP = 2; dim3 grid(outer_size); diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index 7266ebdd998d5..c2ad676c56329 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -1,4 +1,5 @@ #include "ATen/ATen.h" +#include "ATen/cuda/CUDAContext.h" #include "ATen/Config.h" #include "ATen/Dispatch.h" #include "ATen/Utils.h" @@ -104,7 +105,7 @@ static void _fft_fill_with_conjugate_symmetry_(Tensor& input, // copy int64_t n = input.numel() / size_last_dim * (size_last_dim - last_dim_start_slice); - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); auto policy = thrust::cuda::par(allocator).on(stream); AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "_fft_fill_with_conjugate_symmetry_", [&] { @@ -180,7 +181,7 @@ static inline Tensor _run_cufft( auto output = input.type().tensor(output_sizes); // set to current stream - CUFFT_CHECK(cufftSetStream(plan, ctx.getCurrentCUDAStream())); + CUFFT_CHECK(cufftSetStream(plan, at::cuda::getCurrentCUDAStream())); auto ws = ctx.getType(at::Backend::CUDA, at::ScalarType::Byte).tensor({ config.workspace_size() }); CUFFT_CHECK(cufftSetWorkArea(plan, ws.data_ptr())); diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu index 46c812cc91617..c55fc0262a178 100644 --- a/aten/src/ATen/native/cuda/SummaryOps.cu +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -1,4 +1,5 @@ #include "ATen/ATen.h" +#include "ATen/cuda/CUDAContext.h" #include "ATen/cuda/CUDAApplyUtils.cuh" namespace at { @@ -116,7 +117,7 @@ __global__ void kernelHistogram1D( <<>>( \ + getCurrentCUDAStream()>>>( \ aInfo, pInfo, bInfo, binsize, totalElements, WEIGHTS_OP); \ AT_ASSERTM(cudaGetLastError() == cudaSuccess, "kernelHistogram1D failed"); @@ -184,8 +185,7 @@ bool CUDA_tensor_histogram( } CUDAHistogramMemoryType memType = CUDAHistogramMemoryType::GLOBAL; - auto maxSharedMem = - at::globalContext().getCurrentDeviceProperties()->sharedMemPerBlock; + auto maxSharedMem = getCurrentDeviceProperties()->sharedMemPerBlock; auto sharedMem = nbins * sizeof(output_t) + 8; // 8 guard bytes auto maxGlobalMem = getFreeGlobalMemory(); auto multiBlockMem = nbins * grid.x * sizeof(output_t) + 8; // 8 guard bytes diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index cc8e78c292dbc..c1c0e943fde7c 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -2,6 +2,7 @@ #include "ATen/cuda/detail/IndexUtils.cuh" #include "ATen/NativeFunctions.h" +#include "ATen/cuda/CUDAContext.h" #include #include @@ -83,7 +84,7 @@ Tensor flip_cuda(const Tensor& self, IntList dims) { int flip_dim = in_tensor_info.collapseDims(dims[0]); out_tensor_info.collapseDims(dims[0]); kernel_pointwise_flip_apply2 - <<>>( + <<>>( in_tensor_info, out_tensor_info, N, flip_dim, total_dims); }); return out_tensor; @@ -113,7 +114,7 @@ Tensor flip_cuda(const Tensor& self, IntList dims) { } AT_DISPATCH_ALL_TYPES_AND_HALF(in_tensor.type(), "flip_cuda", [&] { - flip_cuda_kernel<<>>( + flip_cuda_kernel<<>>( in_tensor.data(), out_tensor.data(), N, flip_dims_t.toType(CUDA(kLong)).data(), flip_dims_size, 
strides_t.toType(CUDA(kLong)).data(), stride_contiguous.toType(CUDA(kLong)).data(), shape_t.toType(CUDA(kLong)).data(), total_dims); }); diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu index 32dc7d35380bd..f2e13b4c708b6 100644 --- a/aten/src/ATen/native/cuda/Unique.cu +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -1,5 +1,5 @@ #include "ATen/ATen.h" - +#include "ATen/cuda/CUDAContext.h" #include #include #include @@ -37,7 +37,7 @@ template const Tensor& self, const bool return_inverse) { - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); auto policy = thrust::cuda::par(allocator).on(stream); diff --git a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp index f73a2ad0dfd90..7f0a0f86524c6 100644 --- a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp +++ b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp @@ -30,6 +30,7 @@ Tensor cudnn_affine_grid_generator_backward( #include #include #include +#include #include diff --git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp index 9b2a25670d4b3..7cd7466a28503 100644 --- a/aten/src/ATen/native/cudnn/BatchNorm.cpp +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -31,6 +31,7 @@ std::tuple cudnn_batch_norm_backward( #include #include #include +#include #include diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp index b3ee01603b24f..a2be3e507cbd0 100644 --- a/aten/src/ATen/native/cudnn/Conv.cpp +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #if !AT_CUDNN_ENABLED() diff --git a/aten/src/ATen/native/cudnn/GridSampler.cpp b/aten/src/ATen/native/cudnn/GridSampler.cpp index c6b7ffc1cb6a0..1ce92cf7e18d2 100644 --- a/aten/src/ATen/native/cudnn/GridSampler.cpp +++ b/aten/src/ATen/native/cudnn/GridSampler.cpp @@ -27,6 +27,7 @@ std::tuple cudnn_grid_sampler_backward( #include #include #include +#include #include diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index aced0a0ff3ab9..63f0d7a29578f 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #if !AT_CUDNN_ENABLED() diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu index 0ed53befb27d0..25d639f8bd828 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu @@ -1,6 +1,7 @@ #include #include #include +#include #include @@ -60,8 +61,8 @@ inline void CUSPARSE_CHECK(cusparseStatus_t status) } inline cusparseHandle_t setCUDASparseStream() { - cusparseHandle_t handle = globalContext().getCurrentCUDASparseHandle(); - cusparseSetStream(handle, globalContext().getCurrentCUDAStream()); + cusparseHandle_t handle = at::cuda::getCurrentCUDASparseHandle(); + cusparseSetStream(handle, at::cuda::getCurrentCUDAStream()); return handle; } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index a12edc9a77423..b4617f23eb6cc 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -33,7 +34,7 @@ SparseTensor 
coalesce_sparse_cuda(const SparseTensor& self) { return self; } - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); auto policy = thrust::cuda::par(allocator).on(stream); // Replace instances with diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 3521fc340ac00..ddeae3bd5707a 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -197,7 +198,7 @@ SparseTensor& hspmm_out_sparse_cuda(SparseTensor& r_, const SparseTensor& sparse _get_sparse_impl(r_)->raw_resize_(1, 1, {m, n}); - cudaStream_t stream = globalContext().getCurrentCUDAStream(); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); auto policy = thrust::cuda::par(allocator).on(stream); @@ -282,7 +283,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR dim3 grid; int curDevice = -1; cudaGetDevice(&curDevice); - cudaStream_t stream = globalContext().getCurrentCUDAStreamOnDevice(curDevice); + cudaStream_t stream = at::cuda::getCurrentCUDAStreamOnDevice(curDevice); if (sparse._denseDims() == 0) { AT_CHECK(cuda::getApplyGrid(nnz, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); @@ -482,7 +483,7 @@ SparseTensor& s_mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, co dim3 grid; int curDevice = -1; cudaGetDevice(&curDevice); - cudaStream_t stream = globalContext().getCurrentCUDAStreamOnDevice(curDevice); + cudaStream_t stream = at::cuda::getCurrentCUDAStreamOnDevice(curDevice); AT_CHECK(cuda::getApplyGrid(valueSize, grid, curDevice), "mul: Argument #0: tensor too large or too many dimensions"); LongTensor resultNnz = at::empty({1}, CUDA(kLong)); diff --git a/aten/src/ATen/test/stream_test.cpp b/aten/src/ATen/test/stream_test.cpp index 1f877e3e8b598..ba645de1f17ef 100644 --- a/aten/src/ATen/test/stream_test.cpp +++ b/aten/src/ATen/test/stream_test.cpp @@ -1,7 +1,8 @@ #define CATCH_CONFIG_MAIN #include "catch.hpp" -#include "ATen/ATen.h" +#include "ATen/cuda/CUDAContext.h" +#include "ATen/cuda/CUDAGuard.h" #include "cuda_runtime.h" @@ -18,9 +19,9 @@ TEST_CASE( cudaStream_t cuda_stream; // Tests that copying works as expected and preserves the stream - at::CUDAStream copyStream; + at::cuda::CUDAStream copyStream; { - auto s = at::globalContext().createCUDAStream(); + auto s = at::cuda::createCUDAStream(); device = s.device(); cuda_stream = s.stream(); @@ -36,9 +37,9 @@ TEST_CASE( REQUIRE(copyStream.stream() == cuda_stream); // Tests that moving works as expected and preserves the stream - at::CUDAStream moveStream; + at::cuda::CUDAStream moveStream; { - auto s = at::globalContext().createCUDAStream(); + auto s = at::cuda::createCUDAStream(); device = s.device(); cuda_stream = s.stream(); @@ -54,50 +55,50 @@ TEST_CASE( } TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { - at::CUDAStream myStream = at::globalContext().createCUDAStream(); + at::cuda::CUDAStream myStream = at::cuda::createCUDAStream(); // Sets and gets - at::globalContext().setCurrentCUDAStream(myStream); - at::CUDAStream curStream = at::globalContext().getCurrentCUDAStream(); + at::cuda::setCurrentCUDAStream(myStream); + at::cuda::CUDAStream 
curStream = at::cuda::getCurrentCUDAStream(); REQUIRE(myStream == curStream); // Gets, sets, and gets default stream - at::CUDAStream defaultStream = at::globalContext().getDefaultCUDAStream(); - at::globalContext().setCurrentCUDAStream(defaultStream); - curStream = at::globalContext().getCurrentCUDAStream(); + at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream(); + at::cuda::setCurrentCUDAStream(defaultStream); + curStream = at::cuda::getCurrentCUDAStream(); REQUIRE(defaultStream != myStream); REQUIRE(curStream == defaultStream); } TEST_CASE("Stream API retain/free", "Ensures streams are destroyed properly") { - auto ptr = at::detail::CUDAStream_createAndRetainWithOptions( - at::CUDAStream::DEFAULT_FLAGS, at::CUDAStream::DEFAULT_PRIORITY); + auto ptr = at::cuda::detail::CUDAStream_createAndRetainWithOptions( + at::cuda::CUDAStream::DEFAULT_FLAGS, at::cuda::CUDAStream::DEFAULT_PRIORITY); - at::detail::CUDAStream_free(ptr); + at::cuda::detail::CUDAStream_free(ptr); REQUIRE(ptr == nullptr); } -void thread_fun(at::CUDAStream& cur_thread_stream) { - auto new_stream = at::globalContext().createCUDAStream(); - at::globalContext().setCurrentCUDAStream(new_stream); - cur_thread_stream = at::globalContext().getCurrentCUDAStream(); +void thread_fun(at::cuda::CUDAStream& cur_thread_stream) { + auto new_stream = at::cuda::createCUDAStream(); + at::cuda::setCurrentCUDAStream(new_stream); + cur_thread_stream = at::cuda::getCurrentCUDAStream(); REQUIRE(cur_thread_stream == new_stream); } TEST_CASE( "Multithread Getting and Setting", "Ensures streams are thread local") { - at::CUDAStream s0, s1; + at::cuda::CUDAStream s0, s1; std::thread t0{thread_fun, std::ref(s0)}; std::thread t1{thread_fun, std::ref(s1)}; t0.join(); t1.join(); - at::CUDAStream cur_stream = at::globalContext().getCurrentCUDAStream(); - at::CUDAStream default_stream = at::globalContext().getDefaultCUDAStream(); + at::cuda::CUDAStream cur_stream = at::cuda::getCurrentCUDAStream(); + at::cuda::CUDAStream default_stream = at::cuda::getDefaultCUDAStream(); REQUIRE(cur_stream == default_stream); REQUIRE(cur_stream != s0); @@ -106,94 +107,94 @@ TEST_CASE( } TEST_CASE("CUDAGuard") { - if (at::globalContext().getNumGPUs() < 2) { + if (at::cuda::getNumGPUs() < 2) { return; } // -- begin setup - REQUIRE(at::current_device() == 0); - std::vector streams0 = { - at::globalContext().getDefaultCUDAStream(), - at::globalContext().createCUDAStream()}; + REQUIRE(at::cuda::current_device() == 0); + std::vector streams0 = { + at::cuda::getDefaultCUDAStream(), + at::cuda::createCUDAStream()}; REQUIRE(streams0[0].device() == 0); REQUIRE(streams0[1].device() == 0); - at::globalContext().setCurrentCUDAStreamOnDevice(0, streams0[0]); + at::cuda::setCurrentCUDAStreamOnDevice(0, streams0[0]); - std::vector streams1; + std::vector streams1; { at::DeviceGuard device_guard(1); - streams1.push_back(at::globalContext().getDefaultCUDAStream()); - streams1.push_back(at::globalContext().createCUDAStream()); + streams1.push_back(at::cuda::getDefaultCUDAStream()); + streams1.push_back(at::cuda::createCUDAStream()); } REQUIRE(streams1[0].device() == 1); REQUIRE(streams1[1].device() == 1); - at::globalContext().setCurrentCUDAStreamOnDevice(1, streams1[0]); + at::cuda::setCurrentCUDAStreamOnDevice(1, streams1[0]); - REQUIRE(at::current_device() == 0); + REQUIRE(at::cuda::current_device() == 0); // -- end setup // Test that all original streams are recorded. 
{ - at::CUDAGuard guard; + at::cuda::CUDAGuard guard; REQUIRE(guard.original_streams().empty()); guard.set_stream(streams0[0]); REQUIRE( - guard.original_streams().size() == at::globalContext().getNumGPUs()); + guard.original_streams().size() == at::cuda::getNumGPUs()); REQUIRE(guard.original_streams()[0] == streams0[0]); REQUIRE(guard.original_streams()[1] == streams1[0]); } // Setting a stream changes the current device and the stream on that device { - at::CUDAGuard guard(streams1[1]); + at::cuda::CUDAGuard guard(streams1[1]); REQUIRE(guard.last_device() == 1); - REQUIRE(at::current_device() == 1); - REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[1]); + REQUIRE(at::cuda::current_device() == 1); + REQUIRE(at::cuda::getCurrentCUDAStreamOnDevice(1) == streams1[1]); } // Device and stream are now reset - REQUIRE(at::current_device() == 0); - REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[0]); + REQUIRE(at::cuda::current_device() == 0); + REQUIRE(at::cuda::getCurrentCUDAStreamOnDevice(1) == streams1[0]); // Setting only the device changes only the current device and not the stream { - at::CUDAGuard guard(/*device=*/1); + at::cuda::CUDAGuard guard(/*device=*/1); REQUIRE(guard.last_device() == 1); - REQUIRE(at::current_device() == 1); - REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[0]); + REQUIRE(at::cuda::current_device() == 1); + REQUIRE(at::cuda::getCurrentCUDAStreamOnDevice(1) == streams1[0]); } - REQUIRE(at::current_device() == 0); - REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(0) == streams0[0]); + REQUIRE(at::cuda::current_device() == 0); + REQUIRE(at::cuda::getCurrentCUDAStreamOnDevice(0) == streams0[0]); // Setting the stream first, and then the device, first changes the devices // back, and then resets the stream on the initial device. 
{ - at::CUDAGuard guard(streams0[1]); + at::cuda::CUDAGuard guard(streams0[1]); guard.set_device(1); } - REQUIRE(at::current_device() == 0); - REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(0) == streams0[0]); - REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[0]); + REQUIRE(at::cuda::current_device() == 0); + REQUIRE(at::cuda::getCurrentCUDAStreamOnDevice(0) == streams0[0]); + REQUIRE(at::cuda::getCurrentCUDAStreamOnDevice(1) == streams1[0]); } TEST_CASE("CUDAGuardIsMovable") { - if (at::globalContext().getNumGPUs() < 2) { + if (at::cuda::getNumGPUs() < 2) { return; } - const auto stream = at::globalContext().createCUDAStream(); - const auto device_count = at::globalContext().getNumGPUs(); - at::CUDAGuard first(stream); + const auto stream = at::cuda::createCUDAStream(); + const auto device_count = at::cuda::getNumGPUs(); + at::cuda::CUDAGuard first(stream); first.set_device(1); - at::CUDAGuard second(std::move(first)); + at::cuda::CUDAGuard second(std::move(first)); REQUIRE(second.original_streams().size() == device_count); REQUIRE(second.original_device() == 0); REQUIRE(second.last_device() == 1); - at::CUDAGuard third; + at::cuda::CUDAGuard third; third = std::move(second); REQUIRE(third.original_streams().size() == device_count); REQUIRE(third.original_device() == 0); diff --git a/aten/src/THC/THCCachingAllocator.cpp b/aten/src/THC/THCCachingAllocator.cpp index 7d400a296102f..b63e47d86eac7 100644 --- a/aten/src/THC/THCCachingAllocator.cpp +++ b/aten/src/THC/THCCachingAllocator.cpp @@ -1,7 +1,8 @@ #include "THCCachingAllocator.h" #include -#include +#include +#include #include #include @@ -507,7 +508,7 @@ struct CudaCachingAllocator : public at::Allocator { THCudaCheck(cudaGetDevice(&device)); void* r = nullptr; if (size != 0) { - AT_CUDA_CHECK(caching_allocator.malloc(&r, size, at::globalContext().getCurrentCUDAStreamOnDevice(device))); + AT_CUDA_CHECK(caching_allocator.malloc(&r, size, at::cuda::getCurrentCUDAStreamOnDevice(device))); } return {r, r, &CudaCachingDeleter, at::Device(at::kCUDA, device)}; } diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp index bde2c393a6619..7449fcfe2df92 100644 --- a/aten/src/THC/THCGeneral.cpp +++ b/aten/src/THC/THCGeneral.cpp @@ -6,7 +6,7 @@ #include "THCTensorRandom.h" #include "THCGeneral.hpp" -#include "ATen/CUDAStream.h" +#include "ATen/cuda/CUDAStream.h" #include "THCCachingAllocator.h" #include @@ -426,29 +426,29 @@ cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int } THCStream* THCState_getStreamOnDevice(THCState* state, int device) { - return at::detail::CUDAStream_getCurrentStreamOnDeviceUnsafe(device); + return at::cuda::detail::CUDAStream_getCurrentStreamOnDeviceUnsafe(device); } void THCState_setStreamOnDevice(THCState *state, int device, THCStream *stream) { - at::detail::CUDAStream_setStreamOnDevice(device, stream); + at::cuda::detail::CUDAStream_setStreamOnDevice(device, stream); } cudaStream_t THCState_getCurrentStreamOnDevice(THCState *state, int device) { - return at::detail::CUDAStream_stream( - at::detail::CUDAStream_getCurrentStreamOnDeviceUnsafe(device)); + return at::cuda::detail::CUDAStream_stream( + at::cuda::detail::CUDAStream_getCurrentStreamOnDeviceUnsafe(device)); } cudaStream_t THCState_getCurrentStream(THCState *state) { - return at::detail::CUDAStream_stream( - at::detail::CUDAStream_getCurrentStreamUnsafe()); + return at::cuda::detail::CUDAStream_stream( + at::cuda::detail::CUDAStream_getCurrentStreamUnsafe()); } THCStream* 
THCState_getStream(THCState *state) { - return at::detail::CUDAStream_getCurrentStreamUnsafe(); + return at::cuda::detail::CUDAStream_getCurrentStreamUnsafe(); } void THCState_setStream(THCState *state, THCStream *stream) { - at::detail::CUDAStream_setStream(stream); + at::cuda::detail::CUDAStream_setStream(stream); } cublasHandle_t THCState_getCurrentBlasHandle(THCState *state) diff --git a/aten/src/THC/THCStream.cpp b/aten/src/THC/THCStream.cpp index 01fc9b00274bc..b997c380c6606 100644 --- a/aten/src/THC/THCStream.cpp +++ b/aten/src/THC/THCStream.cpp @@ -1,32 +1,32 @@ #include "THCStream.h" -#include "ATen/CUDAStream.h" +#include "ATen/cuda/CUDAStream.h" THC_API THCStream* THCStream_defaultStream(int device) { - return at::detail::CUDAStream_getDefaultStreamOnDevice(device); + return at::cuda::detail::CUDAStream_getDefaultStreamOnDevice(device); } THC_API THCStream* THCStream_new(int flags) { - return THCStream_newWithPriority(flags, at::CUDAStream::DEFAULT_PRIORITY); + return THCStream_newWithPriority(flags, at::cuda::CUDAStream::DEFAULT_PRIORITY); } THC_API THCStream* THCStream_newWithPriority(int flags, int priority) { - return at::detail::CUDAStream_createAndRetainWithOptions(flags, priority); + return at::cuda::detail::CUDAStream_createAndRetainWithOptions(flags, priority); } THC_API cudaStream_t THCStream_stream(THCStream* stream) { - return at::detail::CUDAStream_stream(stream); + return at::cuda::detail::CUDAStream_stream(stream); } THC_API int THCStream_device(THCStream* stream) { - return at::detail::CUDAStream_device(stream); + return at::cuda::detail::CUDAStream_device(stream); } THC_API void THCStream_retain(THCStream* stream) { - at::detail::CUDAStream_retain(stream); + at::cuda::detail::CUDAStream_retain(stream); } THC_API void THCStream_free(THCStream* stream) { - at::detail::CUDAStream_free(stream); + at::cuda::detail::CUDAStream_free(stream); } diff --git a/test/cpp_extensions/cudnn_extension.cpp b/test/cpp_extensions/cudnn_extension.cpp index 0a7bc0f9d8f8a..dbb662425922d 100644 --- a/test/cpp_extensions/cudnn_extension.cpp +++ b/test/cpp_extensions/cudnn_extension.cpp @@ -13,7 +13,7 @@ #include #include // for TensorDescriptor -#include // for CUDNN_CHECK +#include // for CUDNN_CHECK #include // for getCudnnHandle // Name of function in python module and name used for error messages by diff --git a/torch/csrc/autograd/functions/comm.cpp b/torch/csrc/autograd/functions/comm.cpp index 00e140e81b083..052aba90dcc4f 100644 --- a/torch/csrc/autograd/functions/comm.cpp +++ b/torch/csrc/autograd/functions/comm.cpp @@ -19,7 +19,7 @@ Scatter::Scatter( std::vector devices, const at::optional>& chunk_sizes, int64_t dim, - const at::optional>& streams, + const at::optional>& streams, bool unsqueeze_scalars) : devices_(std::move(devices)), chunk_sizes_(chunk_sizes), diff --git a/torch/csrc/autograd/functions/comm.h b/torch/csrc/autograd/functions/comm.h index 7bbd24a169dcb..a7c24748ca01f 100644 --- a/torch/csrc/autograd/functions/comm.h +++ b/torch/csrc/autograd/functions/comm.h @@ -16,7 +16,7 @@ struct Scatter : public Function { std::vector devices, const at::optional>& chunk_sizes = at::nullopt, int64_t dim = 0, - const at::optional>& streams = at::nullopt, + const at::optional>& streams = at::nullopt, bool unsqueeze_scalars = false); variable_list apply(variable_list&& inputs) override; @@ -24,7 +24,7 @@ struct Scatter : public Function { std::vector devices_; at::optional> chunk_sizes_; int64_t dim_; - at::optional> streams_; + at::optional> streams_; bool unsqueeze_scalars_; 
}; diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 3df34c728844b..c3abdfc3b8f2b 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -17,6 +17,7 @@ #include "ATen/ATen.h" #include "torch/csrc/cuda/cuda_check.h" #ifdef USE_CUDA +#include "ATen/cuda/CUDAContext.h" #include #endif @@ -51,7 +52,7 @@ struct Event { if(record_cuda) { TORCH_CUDA_CHECK(cudaGetDevice(&device_)); TORCH_CUDA_CHECK(cudaEventCreate(&event)); - auto stream = at::globalContext().getCurrentCUDAStream(); + auto stream = at::cuda::getCurrentCUDAStream(); cpu_ns_ = getTime(); TORCH_CUDA_CHECK(cudaEventRecord(event, stream)); } else { diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index c4372059f3ae7..30649f7f04bcc 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -7,6 +7,7 @@ #include #include #include +#include "ATen/cuda/CUDAContext.h" #include #ifdef USE_NCCL #include @@ -309,7 +310,7 @@ static void bindCudaDeviceProperties(PyObject* module) { return stream.str(); }); m.def("_get_device_properties", [](int device) -> cudaDeviceProp * { - return at::globalContext().getDeviceProperties(device); + return at::cuda::getDeviceProperties(device); }, py::return_value_policy::reference); } diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index 52a27ea0a4473..9c3b8a305a5ae 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -8,6 +8,8 @@ #endif #include +#include +#include #include #include @@ -118,7 +120,7 @@ std::vector scatter( at::IntList devices, const at::optional>& chunk_sizes, int64_t dim, - const at::optional>& streams) { + const at::optional>& streams) { std::vector chunks; if (chunk_sizes) { const int64_t chunk_size_sum = @@ -140,7 +142,7 @@ std::vector scatter( } else { chunks = tensor.chunk(/*chunks=*/devices.size(), /*dim=*/dim); } - at::CUDAGuard cuda_guard; + at::cuda::CUDAGuard cuda_guard; for (size_t chunk = 0; chunk < chunks.size(); ++chunk) { const auto device_index = static_cast(devices[chunk]); if (streams) { @@ -150,7 +152,7 @@ std::vector scatter( chunk, " (was ", (*streams)[chunk].device(), ") ", "to match the device supplied at that index ", "(expected ", device_index, ")"); - cuda_guard.set_stream((*streams)[chunk]); + cuda_guard.set_stream(at::cuda::CUDAStream((*streams)[chunk])); } chunks[chunk] = chunks[chunk].contiguous().to( {at::kCUDA, device_index}, /*non_blocking=*/true); diff --git a/torch/csrc/cuda/comm.h b/torch/csrc/cuda/comm.h index c7009a56e9712..caf7811afd1ae 100644 --- a/torch/csrc/cuda/comm.h +++ b/torch/csrc/cuda/comm.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -19,7 +20,7 @@ std::vector scatter( at::IntList devices, const at::optional>& chunk_sizes = at::nullopt, int64_t dim = 0, - const at::optional>& streams = at::nullopt); + const at::optional>& streams = at::nullopt); at::Tensor gather( at::TensorList tensors, diff --git a/torch/csrc/cuda/python_comm.cpp b/torch/csrc/cuda/python_comm.cpp index 0ec849a749854..c9ed0bb3be434 100644 --- a/torch/csrc/cuda/python_comm.cpp +++ b/torch/csrc/cuda/python_comm.cpp @@ -28,14 +28,14 @@ void initCommMethods(PyObject *module) { at::optional> chunk_sizes, int64_t dim, at::optional py_streams) { - at::optional> streams; + at::optional> streams; if (py_streams) { py::handle handle = *py_streams; streams = fmap( THPUtils_PySequence_to_THCStreamList(handle.ptr()), [](THCStream* stream) { - at::detail::CUDAStream_retain(stream); - return at::CUDAStream(stream); + 
at::cuda::detail::CUDAStream_retain(stream); + return at::cuda::CUDAStream(stream); }); } // Note: We're holding the GIL up to here. diff --git a/torch/csrc/jit/fusion_compiler.cpp b/torch/csrc/jit/fusion_compiler.cpp index 889a8c3fcfacc..3e04369987eaf 100644 --- a/torch/csrc/jit/fusion_compiler.cpp +++ b/torch/csrc/jit/fusion_compiler.cpp @@ -9,6 +9,7 @@ #include "ATen/ATen.h" #ifdef USE_CUDA +#include "ATen/cuda/CUDAContext.h" #include "THC/THC.h" #include "torch/csrc/cuda/cuda_check.h" #include @@ -651,7 +652,7 @@ struct CUDAFusionFunction : public CompiledFusionFunction { *(THCCachingAllocator_getCudaFreeMutex())); cudaFree(0); } - CUstream stream = at::globalContext().getCurrentCUDAStream(); + CUstream stream = at::cuda::getCurrentCUDAStream(); TORCH_CU_CHECK(cuLaunchKernel( function, numBlocks, 1, 1, From 01581037dca1176508f4544b377a4c0f2d0dd165 Mon Sep 17 00:00:00 2001 From: Kittipat Virochsiri Date: Fri, 20 Jul 2018 14:49:16 -0700 Subject: [PATCH 11/23] Add workspace.RunPlanInBackground (#9637) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9637 Adding a method to run plan in background. The intended use is to run BlueWhale's data reading & preprocessing net in background while the GPU is training. Reviewed By: MisterTea Differential Revision: D8906439 fbshipit-source-id: b1c73ca7327e2d87a8f873924e05ab3d161a3f1e --- caffe2/python/pybind_state.cc | 46 +++++++++++++++++++++++++++++++++ caffe2/python/workspace.py | 8 ++++++ caffe2/python/workspace_test.py | 9 +++++++ 3 files changed, 63 insertions(+) diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 7c421ff2a870e..dca1cdf0fbf2f 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -1,5 +1,8 @@ #include "pybind_state.h" +#include +#include + #include #include @@ -245,6 +248,33 @@ OPERATOR_SCHEMA(PythonDLPackGradient).AllowInplace([](int, int) { }); REGISTER_GRADIENT(PythonDLPack, GetPythonGradient); +class BackgroundPlan { + public: + BackgroundPlan(Workspace* ws, PlanDef def) : ws_(ws), def_(def) {} + + void run() { + fut_ = + std::async(std::launch::async, [this]() { return ws_->RunPlan(def_); }); + } + + bool isDone() { + CAFFE_ENFORCE(fut_.valid()); + auto status = fut_.wait_for(std::chrono::milliseconds(0)); + return status == std::future_status::ready; + } + + bool isSucceeded() { + CAFFE_ENFORCE(isDone()); + return fut_.get(); + } + + private: + Workspace* ws_; + PlanDef def_; + + std::future fut_; +}; + void addObjectMethods(py::module& m) { py::class_(m, "Net").def("run", [](NetBase* net) { py::gil_scoped_release g; @@ -505,6 +535,11 @@ void addObjectMethods(py::module& m) { return py::cast(ws->second.get(), py::return_value_policy::reference); }); + py::class_>( + m, "BackgroundPlan") + .def("is_done", &BackgroundPlan::isDone) + .def("is_succeeded", &BackgroundPlan::isSucceeded); + // Gradients py::class_(m, "GradientWrapper") .def(py::init<>()) @@ -1183,6 +1218,17 @@ void addGlobalMethods(py::module& m) { CAFFE_ENFORCE(gWorkspace->RunPlan(def)); return true; }); + m.def("run_plan_in_background", [](const py::bytes& plan_def) { + CAFFE_ENFORCE(gWorkspace); + PlanDef def; + CAFFE_ENFORCE( + ParseProtoFromLargeString(plan_def.cast(), &def)); + py::gil_scoped_release g; + + auto background_plan = std::make_shared(gWorkspace, def); + background_plan->run(); + return background_plan; + }); m.def( "apply_transform", [](const string& transform_key, const py::bytes& net_def) { diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index 
1c618ac9efabd..8c3c1729db232 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -228,6 +228,14 @@ def RunPlan(plan_or_step): return C.run_plan(StringifyProto(plan_or_step)) +def RunPlanInBackground(plan_or_step): + # TODO(jiayq): refactor core.py/workspace.py to avoid circular deps + import caffe2.python.core as core + if isinstance(plan_or_step, core.ExecutionStep): + plan_or_step = core.Plan(plan_or_step) + return C.run_plan_in_background(StringifyProto(plan_or_step)) + + def InferShapesAndTypes(nets, blob_dimensions=None, nets_proto=False): """Infers the shapes and types for the specified nets. diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 78468ec8548af..5da37c7f22efc 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -83,6 +83,15 @@ def testRunPlan(self): workspace.RunPlan(plan.Proto().SerializeToString()), True) self.assertEqual(workspace.HasBlob("testblob"), True) + def testRunPlanInBackground(self): + plan = core.Plan("test-plan") + plan.AddStep(core.ExecutionStep("test-step", self.net)) + background_plan = workspace.RunPlanInBackground(plan) + while not background_plan.is_done(): + pass + self.assertEqual(background_plan.is_succeeded(), True) + self.assertEqual(workspace.HasBlob("testblob"), True) + def testConstructPlanFromSteps(self): step = core.ExecutionStep("test-step-as-plan", self.net) self.assertEqual(workspace.RunPlan(step), True) From 45e5c17ecfbe00708e2c61488d688b3d651ab8ca Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 20 Jul 2018 15:08:02 -0700 Subject: [PATCH 12/23] ONNXIFI transform (#9569) Summary: Cut-off runnable subgraph and off-load to ONNXIFI backend Pull Request resolved: https://github.com/pytorch/pytorch/pull/9569 Reviewed By: Maratyszcza Differential Revision: D8930408 Pulled By: yinghai fbshipit-source-id: 2b494f7f8dc10c00e58cf0fed5c4a9434be6155b --- caffe2/operators/onnxifi_op.cc | 4 + caffe2/operators/onnxifi_op.h | 5 - caffe2/opt/onnxifi_transformer.cc | 375 +++++++++++++++++++++++++++++ caffe2/opt/onnxifi_transformer.h | 63 +++++ caffe2/python/onnx/onnxifi.py | 61 +++++ caffe2/python/onnx/test_onnxifi.py | 156 ++++++++++++ caffe2/python/pybind_state.cc | 22 ++ caffe2/python/trt/transform.py | 13 +- 8 files changed, 689 insertions(+), 10 deletions(-) create mode 100644 caffe2/opt/onnxifi_transformer.cc create mode 100644 caffe2/opt/onnxifi_transformer.h create mode 100644 caffe2/python/onnx/onnxifi.py diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc index f4d0c5eafbd69..36a4ae90c3271 100644 --- a/caffe2/operators/onnxifi_op.cc +++ b/caffe2/operators/onnxifi_op.cc @@ -91,6 +91,10 @@ bool OnnxifiOp::RunOnDevice() { tensor_descriptor.dataType = ONNXIFI_DATATYPE_FLOAT32; tensor_descriptor.memoryType = ONNXIFI_MEMORY_TYPE_CPU; tensor_descriptor.dimensions = tensor_dims.size(); + CAFFE_ENFORCE( + tensor_descriptor.dimensions != 0, + tensor_descriptor.name, + " has 0 dim"); output_shapes_.emplace_back(tensor_dims.cbegin(), tensor_dims.cend()); tensor_descriptor.shape = output_shapes_.back().data(); tensor_descriptor.buffer = diff --git a/caffe2/operators/onnxifi_op.h b/caffe2/operators/onnxifi_op.h index 3c5cd2dbc36e6..d2a72d006018a 100644 --- a/caffe2/operators/onnxifi_op.h +++ b/caffe2/operators/onnxifi_op.h @@ -73,11 +73,6 @@ class OnnxifiOp final : public Operator { auto weight_descs = BuildInitializationList( &mapped_ws, &initializer_set, &weight_names, &weight_shapes); - ::ONNX_NAMESPACE::ModelProto onnx_model; - 
ParseProtoFromLargeString(onnx_model_str, &onnx_model); - onnx_model_str.clear(); - onnx_model.SerializeToString(&onnx_model_str); - // Build the Onnxifi engine // TODO: In spec, backends are hot-pluggable, so two calls to // onnxGetBackendIDs may result in different number of backend. And we diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc new file mode 100644 index 0000000000000..c1c6c310786c5 --- /dev/null +++ b/caffe2/opt/onnxifi_transformer.cc @@ -0,0 +1,375 @@ +#include "caffe2/opt/onnxifi_transformer.h" + +#include +#include + +#include + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" +#include "caffe2/onnx/onnx_exporter.h" +#include "caffe2/opt/backend_cutting.h" + +namespace caffe2 { + +namespace { + +// TODO(yinghai): Remove the awkward conversion between unordered_map and map +std::unordered_map InferShapes( + Workspace* ws, + NetDef* pred_net, + CaffeMap* shape_hints_ordered) { + // Populate shapes from workplace + const std::vector& ws_blobs = ws->Blobs(); + for (const auto& s : ws_blobs) { + auto shape = GetTensorShapeOfBlob(ws->GetBlob(s)); + if (!shape.unknown_shape()) { + shape_hints_ordered->emplace(s, std::move(shape)); + } + } + + std::vector nets; + nets.emplace_back(pred_net); + InferBlobShapesAndTypes(*shape_hints_ordered, nets); + std::unordered_map shape_hints; + for (const auto& kv : *shape_hints_ordered) { + shape_hints.emplace(kv.first, kv.second); + } + + return shape_hints; +} + +void DumpModel( + const ::ONNX_NAMESPACE::ModelProto& model, + const std::string& fname) { + std::ofstream ff(fname); + std::string body; + ::google::protobuf::TextFormat::PrintToString(model.graph(), &body); + ff << body << std::endl; + ff.close(); +} + +std::vector<::ONNX_NAMESPACE::ValueInfoProto> ConvertToValueInfo( + const std::vector& names, + const std::unordered_map& shape_hints) { + std::vector<::ONNX_NAMESPACE::ValueInfoProto> r; + for (const auto& s : names) { + r.emplace_back(); + auto& value_info = r.back(); + value_info.set_name(s); + const auto it = shape_hints.find(s); + if (it == shape_hints.end()) { + LOG(WARNING) << "Cannot get shape of " << s; + } else { + auto* tensor_type = value_info.mutable_type()->mutable_tensor_type(); + tensor_type->set_elem_type( + ::ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT); + auto* shape = tensor_type->mutable_shape(); + for (int i = 0; i < it->second.dims().size(); ++i) { + shape->add_dim()->set_dim_value(it->second.dims(i)); + } + } + } + return r; +} + +void FillModelInfo(::ONNX_NAMESPACE::ModelProto* model) { + model->set_ir_version(::ONNX_NAMESPACE::Version::IR_VERSION); + model->set_producer_name("caffe2"); + auto* opset_id = model->add_opset_import(); + opset_id->set_domain(""); + opset_id->set_version(3); +} +} // namespace + +OnnxifiTransformer::OnnxifiTransformer(bool debug) : debug_(debug) { + lib_ = onnx::initOnnxifiLibrary(); + CAFFE_ENFORCE(lib_, "Cannot initialize ONNXIFI library"); + CAFFE_ENFORCE_EQ( + lib_->onnxGetBackendIDs(nullptr, &num_backends_), + ONNXIFI_STATUS_FALLBACK); + CAFFE_ENFORCE_GT( + num_backends_, 0, "At least 1 onnxifi backend should be available"); + backend_ids_.resize(num_backends_); + CAFFE_ENFORCE_EQ( + lib_->onnxGetBackendIDs(backend_ids_.data(), &num_backends_), + ONNXIFI_STATUS_SUCCESS); +} + +OperatorDef OnnxifiTransformer::BuildOnnxifiOp( + const std::string& onnx_model_str, + const std::unordered_map>& output_size_hints, + const std::unordered_set& 
initialization_list, + const caffe2::NetDef& net) { + OperatorDef op; + op.set_type("Onnxifi"); + auto* onnx_model_arg = op.add_arg(); + onnx_model_arg->set_name("onnx_model"); + onnx_model_arg->set_s(onnx_model_str); + + // Add the names of the initializer blobs that we want to fetch from the + // workspace later + auto* initializers_arg = op.add_arg(); + initializers_arg->set_name("initializers"); + for (const auto& s : initialization_list) { + initializers_arg->add_strings(s); + initializers_arg->add_strings(input_mapping_.at(s)); + } + + // Add the input/output + for (const auto& input : net.external_input()) { + if (!initialization_list.count(input)) { + op.add_input(input); + } + } + for (const auto& output : net.external_output()) { + op.add_output(output); + } + + // Add output size hints + for (int i = 0; i < op.output_size(); ++i) { + const auto& o = op.output(i); + const auto it = output_size_hints.find(o); + if (it != output_size_hints.end()) { + const auto& dims = it->second; + auto* output_size_hint_arg = op.add_arg(); + output_size_hint_arg->set_name(MakeString("output_size_hint_", i)); + for (const auto& d : dims) { + output_size_hint_arg->add_ints(d); + } + + VLOG(2) << "Adding output hint: " << o; + } + } + return op; +} + +NetDef OnnxifiTransformer::SubnetToOnnxifiOp( + const caffe2::NetDef& net, + Workspace* ws, + onnx::OnnxExporter* exporter, + std::unordered_map* shape_hints) { + ::ONNX_NAMESPACE::ModelProto onnx_model; + FillModelInfo(&onnx_model); + + // Convert c2 ops to onnx ops, add const weights if there are any + DeviceOption option; + CPUContext context(option); + context.SwitchToDevice(); + for (const auto& op : net.op()) { + const auto results = exporter->Caffe2OpToOnnxNodes(op, *shape_hints); + for (const auto& n : results.first) { + onnx_model.mutable_graph()->add_node()->CopyFrom(n); + } + for (const auto& t : results.second) { + VLOG(2) << "Adding extra init tensor: " << t.name(); + CAFFE_ENFORCE_EQ( + t.data_type(), + ::ONNX_NAMESPACE::TensorProto::FLOAT, + "Only supports conversion of float type for now"); + TensorShape shape; + shape.mutable_dims()->CopyFrom(t.dims()); + shape_hints->emplace(t.name(), std::move(shape)); + + // Feed into workspace as CPU Tensors + auto* blob = ws->CreateBlob(t.name()); + auto* cpu_tensor = blob->GetMutable(); + std::vector dims; + std::copy(t.dims().begin(), t.dims().end(), dims.begin()); + cpu_tensor->Resize(dims); + context.template CopyBytes( + cpu_tensor->size() * sizeof(float), + static_cast(t.raw_data().data()), + cpu_tensor->raw_mutable_data(TypeMeta::Make())); + context.FinishDeviceComputation(); + + // Add mappings + CAFFE_ENFORCE( + input_mapping_.emplace(t.name(), t.name()).second, + MakeString("Tensor ", t.name(), " already exists in the workspace")); + } + } + + // Convert outputs and compute output shape hints + std::vector io_names; + for (const auto& output : net.external_output()) { + io_names.emplace_back(output); + } + auto io_vec = ConvertToValueInfo(io_names, *shape_hints); + std::unordered_map> output_shape_hints; + for (const auto& i : io_vec) { + onnx_model.mutable_graph()->add_output()->CopyFrom(i); + auto ret = output_shape_hints.emplace(i.name(), std::vector()); + auto& vec = ret.first->second; + const auto it = shape_hints->find(i.name()); + CAFFE_ENFORCE( + it != shape_hints->end(), + "Cannot find shape info for output ", + i.name()); + const auto& shape = it->second; + for (int k = 0; k < shape.dims().size(); ++k) { + vec.push_back(shape.dims(k)); + } + } + + // Convert inputs and figure 
out weights + std::unordered_set weights; + const std::vector& ws_blobs = ws->Blobs(); + for (const auto& s : ws_blobs) { + VLOG(2) << "Add weights: " << s; + weights.emplace(s); + } + + std::unordered_set total_inputs; + std::unordered_set initialization_list; + std::vector total_inputs_vec; + + // Extra intermediate weights created during conversion + for (const auto& extra_weight : onnx_model.graph().initializer()) { + if (total_inputs.emplace(extra_weight.name()).second) { + total_inputs_vec.emplace_back(extra_weight.name()); + } + } + // Boundary inputs, should not be weights + std::unordered_set boundary_inputs; + for (const auto& i : net.external_input()) { + boundary_inputs.emplace(i); + } + + for (const auto& op : net.op()) { + for (const auto& input : op.input()) { + if (total_inputs.emplace(input).second && weights.count(input)) { + // We add weights as inputs too + total_inputs_vec.emplace_back(input); + initialization_list.emplace(input); + VLOG(2) << "Add input weights: " << input; + } else if (boundary_inputs.count(input)) { + VLOG(2) << "Adding boundary input: " << input; + total_inputs_vec.emplace_back(input); + } + } + } + io_vec = ConvertToValueInfo(total_inputs_vec, *shape_hints); + for (const auto& i : io_vec) { + onnx_model.mutable_graph()->add_input()->CopyFrom(i); + } + + // Debugging stuff + if (debug_) { + DumpModel(onnx_model, "debug.onnxtxt"); + } + + // Onnx model is ready. Build ONNXIFI Op + std::string model_str; + onnx_model.SerializeToString(&model_str); + NetDef net_opt; + auto* op = net_opt.add_op(); + *op = BuildOnnxifiOp(model_str, output_shape_hints, initialization_list, net); + for (const auto& i : op->input()) { + net_opt.add_external_input(i); + } + for (const auto& i : op->output()) { + net_opt.add_external_output(i); + } + + return net_opt; +} + +CaffeMap OnnxifiTransformer::SsaRewriteAndMapNames( + Workspace* ws, + NetDef* pred_net, + const std::unordered_map& input_shape_hints) { + input_mapping_ = onnx::SsaRewrite(nullptr, pred_net); + std::unordered_map input_reverse_mapping; + std::vector external_inputs; + for (const auto kv : input_mapping_) { + input_reverse_mapping.emplace(kv.second, kv.first); + if (!ws->HasBlob(kv.second)) { + external_inputs.emplace_back(kv.first); + } + } + for (const auto& i : external_inputs) { + input_mapping_.erase(i); + } + CaffeMap shape_hints_ordered; + for (const auto& kv : input_shape_hints) { + const auto it = input_reverse_mapping.find(kv.first); + if (it != input_reverse_mapping.end()) { + LOG(INFO) << "Adding input hint: " << it->second; + shape_hints_ordered.emplace(it->second, kv.second); + } else { + shape_hints_ordered.emplace(kv.first, kv.second); + } + } + return shape_hints_ordered; +} + +// Cutting off the runnable part and replace with ONNXIFI ops. 
Asssume the nets +// were topologically sorted +void OnnxifiTransformer::Transform( + Workspace* ws, + NetDef* pred_net, + const std::unordered_map& input_shape_hints) { + CAFFE_ENFORCE(ws); + auto shape_hints_ordered = + SsaRewriteAndMapNames(ws, pred_net, input_shape_hints); + Workspace mapped_ws(ws, input_mapping_); + auto shape_hints = InferShapes(&mapped_ws, pred_net, &shape_hints_ordered); + + CAFFE_ENFORCE(pred_net, "Predict net cannot be nullptr"); + onnx::OnnxExporter exporter(nullptr, true); + + // function to tell whether the ONNXIFI backend supports a given C2 op or not + // TODO: choose backend id + auto supports = + [&exporter, &shape_hints, backend = lib_, backend_id = backend_ids_[0]]( + const caffe2::OperatorDef& op) { + const OpSchema* schema = OpSchemaRegistry::Schema(op.type()); + // NB: this might not be a hard constraint as we can just export C2 + // domain specific ops to ONNX + if (!schema || schema->onnx_schema().empty()) { + LOG(INFO) << "Cannot export c2 op " << op.type() + << " to onnx as there is no corresponding ONNX schema."; + return false; + } + + ::ONNX_NAMESPACE::ModelProto onnx_model; + FillModelInfo(&onnx_model); + auto results = exporter.Caffe2OpToOnnxNodes(op, shape_hints); + for (const auto& n : results.first) { + onnx_model.mutable_graph()->add_node()->CopyFrom(n); + } + std::string onnx_model_str; + onnx_model.SerializeToString(&onnx_model_str); + auto ret = backend->onnxGetBackendCompatibility( + backend_id, onnx_model_str.size(), onnx_model_str.c_str()); + if (ret != ONNXIFI_STATUS_SUCCESS) { + LOG(INFO) << "Don't support onnx for " << op.type() << " c2 op (" + << ret << ")"; + return false; + } else { + return true; + } + }; + + // function to convert runnbale subgraph into a trt op. Note that to keep the + // interface clean, we do the double conversion from C2 op to Onnx ops here + // but it should be OK as the cost is really small. 
We also need to keep the + // same exporter throughout the process to avoid duplicated dummy name + // generation + onnx::OnnxExporter exporter2(nullptr, true); + auto trt_converter = [this, &mapped_ws, &shape_hints, &exporter2]( + const caffe2::NetDef& net) mutable { + return SubnetToOnnxifiOp(net, &mapped_ws, &exporter2, &shape_hints); + }; + + NetDef net_opt = opt::OptimizeForBackend(*pred_net, supports, trt_converter); + + // Need to figure out a proper place to handle device option + net_opt.mutable_device_option()->CopyFrom(pred_net->device_option()); + pred_net->Swap(&net_opt); +} + +} // namespace caffe2 diff --git a/caffe2/opt/onnxifi_transformer.h b/caffe2/opt/onnxifi_transformer.h new file mode 100644 index 0000000000000..197a026e3a8b9 --- /dev/null +++ b/caffe2/opt/onnxifi_transformer.h @@ -0,0 +1,63 @@ +#pragma once + +#include +#include +#include +#include + +#include "onnx/onnx_pb.h" + +#include "caffe2/core/common.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/workspace.h" +#include "caffe2/onnx/onnxifi_init.h" +#include "caffe2/proto/caffe2.pb.h" + +namespace caffe2 { +namespace onnx { +class OnnxExporter; +} + +class OnnxifiTransformer { + public: + explicit OnnxifiTransformer(bool debug); + + void Transform( + Workspace* ws, + NetDef* pred_net, + const std::unordered_map& shape_hints); + + private: + caffe2::NetDef SubnetToOnnxifiOp( + const caffe2::NetDef& net, + Workspace* ws, + onnx::OnnxExporter* exporter, + std::unordered_map* shape_hints); + + OperatorDef BuildOnnxifiOp( + const std::string& onnx_model_str, + const std::unordered_map>& + output_size_hints, + const std::unordered_set& initialization_list, + const caffe2::NetDef& net); + + CaffeMap SsaRewriteAndMapNames( + Workspace* ws, + NetDef* pred_net, + const std::unordered_map& input_shape_hints); + + // Dump onnx model for debugging + bool debug_{false}; + + // Pointer to loaded onnxifi library + onnxifi_library* lib_{nullptr}; + + // Number of backends + size_t num_backends_{0}; + + // Backned IDs + std::vector backend_ids_; + // Input mapping + std::unordered_map input_mapping_; +}; +} // namespace caffe2 diff --git a/caffe2/python/onnx/onnxifi.py b/caffe2/python/onnx/onnxifi.py new file mode 100644 index 0000000000000..8a132698e76de --- /dev/null +++ b/caffe2/python/onnx/onnxifi.py @@ -0,0 +1,61 @@ +## @package onnx +#Module caffe2.python.onnx.onnxifi + +""" +ONNXIFI a Caffe2 net +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from caffe2.proto import caffe2_pb2 +from caffe2.python import core, workspace +import caffe2.python._import_c_extension as C +import numpy as np + + +def _infer_shapes(pred_net, inputs): + workspace.RunNetOnce(pred_net) + hints = {} + for op in pred_net.op: + for o in op.output: + if o not in hints: + blob = workspace.FetchBlob(o) + if hasattr(blob, 'shape'): + hints[o] = blob.shape + for i in op.input: + if i not in hints: + blob = workspace.FetchBlob(i) + if hasattr(blob, 'shape'): + hints[i] = blob.shape + + return hints + + +def onnxifi_caffe2_net( + pred_net, + input_shapes, + populate_shapes=False, + debug=False): + """ + Transfrom the caffe2_net by collapsing ONNXIFI-runnable nodes into Onnxifi c2 ops + """ + # Hacky way to infer shapes as not all our operators have shape inference function. 
+ # Normally this is not needed + shape_hints = {} + if populate_shapes: + input_data = {} + for k, v in input_shapes.items(): + input_data[k] = np.random.randn(*v).astype(np.float32) + shape_hints = _infer_shapes(pred_net, input_data) + + for k, v in input_shapes.items(): + shape_hints[k] = v + pred_net_str = C.onnxifi(pred_net.SerializeToString(), + shape_hints, + debug) + pred_net_cut = caffe2_pb2.NetDef() + pred_net_cut.ParseFromString(pred_net_str) + return pred_net_cut diff --git a/caffe2/python/onnx/test_onnxifi.py b/caffe2/python/onnx/test_onnxifi.py index 002287cf3b839..596dcde044d5f 100644 --- a/caffe2/python/onnx/test_onnxifi.py +++ b/caffe2/python/onnx/test_onnxifi.py @@ -3,16 +3,36 @@ from __future__ import print_function from __future__ import unicode_literals +import json import numpy as np +import os +import time import unittest import onnx import onnx.defs +from onnx.backend.base import namedtupledict from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace +from caffe2.python.models.download import downloadFromURLToFile, getURLFromName, deleteDirectory +from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net from caffe2.python.onnx.tests.test_utils import TestCase + +def _print_net(net): + for i in net.external_input: + print("Input: {}".format(i)) + for i in net.external_output: + print("Output: {}".format(i)) + for op in net.op: + print("Op {}".format(op.type)) + for x in op.input: + print(" input: {}".format(x)) + for y in op.output: + print(" output: {}".format(y)) + + class OnnxifiTest(TestCase): @unittest.skip("Need ONNXIFI backend support") def test_relu_graph(self): @@ -80,3 +100,139 @@ def test_conv_graph(self): np.testing.assert_almost_equal(Y, Y_without_padding) +class OnnxifiTransformTest(TestCase): + def _model_dir(self, model): + caffe2_home = os.path.expanduser(os.getenv('CAFFE2_HOME', '~/.caffe2')) + models_dir = os.getenv('CAFFE2_MODELS', os.path.join(caffe2_home, 'models')) + return os.path.join(models_dir, model) + + def _download(self, model): + model_dir = self._model_dir(model) + assert not os.path.exists(model_dir) + os.makedirs(model_dir) + for f in ['predict_net.pb', 'init_net.pb', 'value_info.json']: + url = getURLFromName(model, f) + dest = os.path.join(model_dir, f) + try: + try: + downloadFromURLToFile(url, dest, + show_progress=False) + except TypeError: + # show_progress not supported prior to + # Caffe2 78c014e752a374d905ecfb465d44fa16e02a28f1 + # (Sep 17, 2017) + downloadFromURLToFile(url, dest) + except Exception as e: + print("Abort: {reason}".format(reason=e)) + print("Cleaning up...") + deleteDirectory(model_dir) + exit(1) + + # TODO: we need to modulize this function + def _get_c2_model(self, model_name): + model_dir = self._model_dir(model_name) + if not os.path.exists(model_dir): + self._download(model_name) + c2_predict_pb = os.path.join(model_dir, 'predict_net.pb') + c2_predict_net = caffe2_pb2.NetDef() + with open(c2_predict_pb, 'rb') as f: + c2_predict_net.ParseFromString(f.read()) + c2_predict_net.name = model_name + + c2_init_pb = os.path.join(model_dir, 'init_net.pb') + c2_init_net = caffe2_pb2.NetDef() + with open(c2_init_pb, 'rb') as f: + c2_init_net.ParseFromString(f.read()) + c2_init_net.name = model_name + '_init' + + value_info = json.load(open(os.path.join(model_dir, 'value_info.json'))) + return c2_init_net, c2_predict_net, value_info + + def _add_head_tail(self, pred_net, new_head, new_tail): + 
orig_head = pred_net.external_input[0] + orig_tail = pred_net.external_output[0] + + # Add head + head = caffe2_pb2.OperatorDef() + head.type = "Copy" + head.input.append(new_head) + head.output.append(orig_head) + dummy = caffe2_pb2.NetDef() + dummy.op.extend(pred_net.op) + del pred_net.op[:] + pred_net.op.extend([head]) + pred_net.op.extend(dummy.op) + pred_net.external_input[0] = new_head + + # Add tail + tail = caffe2_pb2.OperatorDef() + tail.type = "Copy" + tail.input.append(orig_tail) + tail.output.append(new_tail) + pred_net.op.extend([tail]) + pred_net.external_output[0] = new_tail + + @unittest.skip("Need ONNXIFI backend support") + def test_resnet50_core(self): + N = 1 + repeat = 1 + print("Batch size: {}, repeat inference {} times".format(N, repeat)) + init_net, pred_net, _ = self._get_c2_model('resnet50') + self._add_head_tail(pred_net, 'real_data', 'real_softmax') + input_blob_dims = (N, 3, 224, 224) + input_name = "real_data" + + device_option = core.DeviceOption(caffe2_pb2.CPU, 0) + init_net.device_option.CopyFrom(device_option) + pred_net.device_option.CopyFrom(device_option) + for op in pred_net.op: + op.device_option.CopyFrom(device_option) + net_outputs = pred_net.external_output + Y_c2 = None + data = np.random.randn(*input_blob_dims).astype(np.float32) + c2_time = 1 + workspace.SwitchWorkspace("onnxifi_test", True) + with core.DeviceScope(device_option): + workspace.FeedBlob(input_name, data) + workspace.RunNetOnce(init_net) + workspace.CreateNet(pred_net) + start = time.time() + for _ in range(repeat): + workspace.RunNet(pred_net.name) + end = time.time() + c2_time = end - start + output_values = [workspace.FetchBlob(name) for name in net_outputs] + Y_c2 = namedtupledict('Outputs', net_outputs)(*output_values) + workspace.ResetWorkspace() + + # Fill the workspace with the weights + with core.DeviceScope(device_option): + workspace.RunNetOnce(init_net) + + # Cut the graph + start = time.time() + pred_net_cut = onnxifi_caffe2_net(pred_net, + {input_name: input_blob_dims}) + del init_net, pred_net + #_print_net(pred_net_cut) + + Y_trt = None + input_name = pred_net_cut.external_input[0] + print("C2 runtime: {}s".format(c2_time)) + with core.DeviceScope(device_option): + workspace.FeedBlob(input_name, data) + workspace.CreateNet(pred_net_cut) + end = time.time() + print("Conversion time: {:.2f}s".format(end - start)) + + start = time.time() + for _ in range(repeat): + workspace.RunNet(pred_net_cut.name) + end = time.time() + trt_time = end - start + print("Onnxifi runtime: {}s, improvement: {}%".format(trt_time, (c2_time - trt_time) / c2_time * 100)) + output_values = [workspace.FetchBlob(name) for name in net_outputs] + Y_trt = namedtupledict('Outputs', net_outputs)(*output_values) + np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3) + + diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index dca1cdf0fbf2f..48b316c771d85 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -24,6 +24,7 @@ #include "caffe2/opt/converter.h" #include "caffe2/opt/fusion.h" #include "caffe2/opt/mobile.h" +#include "caffe2/opt/onnxifi_transformer.h" #include "caffe2/opt/optimize_ideep.h" #include "caffe2/opt/passes.h" #include "caffe2/opt/sink.h" @@ -1555,6 +1556,27 @@ void addGlobalMethods(py::module& m) { new_proto.SerializeToString(&out); return py::bytes(out); }); + m.def( + "onnxifi", + [](const py::bytes& pred_net_str, + const std::unordered_map>& shapes, + bool debug_builder) -> py::bytes { + caffe2::NetDef pred_net; + CAFFE_ENFORCE( 
+ ParseProtoFromLargeString( + pred_net_str.cast(), &pred_net), + "broken pred_net protobuf"); + std::unordered_map tensor_shapes; + for (const auto& it : shapes) { + tensor_shapes.emplace( + it.first, CreateTensorShape(it.second, TensorProto::FLOAT)); + } + OnnxifiTransformer ts(debug_builder); + ts.Transform(GetCurrentWorkspace(), &pred_net, tensor_shapes); + std::string pred_net_str2; + pred_net.SerializeToString(&pred_net_str2); + return py::bytes(pred_net_str2); + }); m.def( "run_workspace_transform", [](const std::string& transform_name, py::bytes def) { diff --git a/caffe2/python/trt/transform.py b/caffe2/python/trt/transform.py index 128d303a8f44a..fbc6c11026fba 100644 --- a/caffe2/python/trt/transform.py +++ b/caffe2/python/trt/transform.py @@ -53,23 +53,26 @@ def convert_onnx_model_to_trt_op(onnx_model, op.ParseFromString(trt_str) return op -def _infer_shapes(init_net, pred_net, inputs): - ws, outputs = c2_native_run_net(init_net, pred_net, inputs) + +# Assume the workspace is already filled with init weights +def _infer_shapes(pred_net, inputs): + workspace.RunNetOnce(pred_net) hints = {} for op in pred_net.op: for o in op.output: if o not in hints: - blob = ws.FetchBlob(o) + blob = workspace.FetchBlob(o) if hasattr(blob, 'shape'): hints[o] = blob.shape for i in op.input: if i not in hints: - blob = ws.FetchBlob(i) + blob = workspace.FetchBlob(i) if hasattr(blob, 'shape'): hints[i] = blob.shape return hints + def transform_caffe2_net( pred_net, input_shapes, @@ -91,7 +94,7 @@ def transform_caffe2_net( input_data = {} for k,v in input_shapes.items(): input_data[k] = np.random.randn(*v).astype(np.float32) - shape_hints = _infer_shapes(init_net, pred_net, input_data) + shape_hints = _infer_shapes(pred_net, input_data) for k,v in input_shapes.items(): shape_hints[k] = v From 3efdece9daade24630c72ebb7b17502134995196 Mon Sep 17 00:00:00 2001 From: Gregory Chanan Date: Fri, 20 Jul 2018 15:37:22 -0700 Subject: [PATCH 13/23] Support n-dimensional empty tensors in take/put. 
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9635 Differential Revision: D8935119 Pulled By: gchanan fbshipit-source-id: 5035583e7322b1a1720d961945dd0eefb4cb28ef --- aten/src/THC/THCTensorTypeUtils.cuh | 2 +- aten/src/THC/generic/THCTensorIndex.cu | 20 ++++++++------------ test/test_torch.py | 22 ++++++++++++++++++++++ 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/aten/src/THC/THCTensorTypeUtils.cuh b/aten/src/THC/THCTensorTypeUtils.cuh index 6ff6d680edbde..99266b3b4b931 100644 --- a/aten/src/THC/THCTensorTypeUtils.cuh +++ b/aten/src/THC/THCTensorTypeUtils.cuh @@ -60,7 +60,7 @@ getTensorInfo(THCState* state, TensorType* t) { IndexType sz[MAX_CUTORCH_DIMS]; IndexType st[MAX_CUTORCH_DIMS]; - int dims = THCTensor__nDimension(state, t); + int dims = THCTensor_nDimension(state, t); for (int i = 0; i < dims; ++i) { sz[i] = THCTensor_size(state, t, i); st[i] = THCTensor_stride(state, t, i); diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu index f93ad4dfe14dd..1d934595aabd2 100644 --- a/aten/src/THC/generic/THCTensorIndex.cu +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -218,18 +218,14 @@ void THCTensor_(take)(THCState *state, THCTensor *dst, THCTensor *src, THCudaLon THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src)); THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); - THArgCheck(THCTensor_(_nDimension)(state, src) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(THCTensor_(_nDimension)(state, dst) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(THCudaLongTensor__nDimension(state, index) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(!(THCTensor_(_nDimension)(state, src) == 0 && THCudaLongTensor__nDimension(state, index) != 0), 2, + THArgCheck(THCTensor_(nDimension)(state, src) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCTensor_(nDimension)(state, dst) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCudaLongTensor_nDimension(state, index) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(!(THCTensor_(numel)(state, src) == 0 && THCudaLongTensor_numel(state, index) != 0), 2, "tried to take from an empty tensor"); THCTensor_(resizeNd)(state, dst, index->dim(), THTensor_getSizePtr(index), NULL); - - // dispatchTakePut only handles non-empty tensors; - if (index->_dim() > 0) { - dispatchTakePut(state, src, dst, index); - } + dispatchTakePut(state, src, dst, index); } static void THCTensor_(sort_indices)(THCState *state, THCudaLongTensor *index, THCTensor *src) { @@ -255,9 +251,9 @@ void THCTensor_(put)(THCState *state, THCTensor *dst, THCudaLongTensor *index, T THArgCheck(THCTensor_(nElement)(state, src) == numIndices, 3, "src should have the same number of elements as index"); - THArgCheck(THCTensor_(_nDimension)(state, dst) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(THCTensor_(_nDimension)(state, src) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(THCudaLongTensor__nDimension(state, index) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCTensor_(nDimension)(state, dst) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCTensor_(nDimension)(state, src) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); + THArgCheck(THCudaLongTensor_nDimension(state, index) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); if (numIndices == 0) { return; diff --git a/test/test_torch.py b/test/test_torch.py index d1370e30cb826..7148ab69f82ec 100644 --- a/test/test_torch.py +++ b/test/test_torch.py 
@@ -5661,6 +5661,16 @@ def check(src, idx): check(src, idx) check(src.transpose(1, 2), idx) + @skipIfNoZeroSize + def test_take_empty(self): + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] + for device in devices: + for input_shape in [(0,), (0, 1, 2, 0), (1, 2, 3)]: + for indices_shape in [(0,), (0, 1, 2, 0)]: + input = torch.empty(input_shape, device=device) + indices = torch.empty(indices_shape, dtype=torch.int64, device=device) + self.assertEqual(indices, torch.take(input, indices)) + def test_put_(self): def check(dst, idx, value): expected = dst.clone().view(-1).index_copy_( @@ -5682,6 +5692,18 @@ def test_put_accumulate(self): dst.put_(idx, src, accumulate=True) self.assertEqual(dst.tolist(), [[5, 7], [1, 1]]) + @skipIfNoZeroSize + def test_put_empty(self): + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] + for device in devices: + for dst_shape in [(0,), (0, 1, 2, 0), (1, 2, 3)]: + for indices_shape in [(0,), (0, 1, 2, 0)]: + for accumulate in [False, True]: + dst = torch.randn(dst_shape, device=device) + indices = torch.empty(indices_shape, dtype=torch.int64, device=device) + src = torch.randn(indices_shape, device=device) + self.assertEqual(dst, dst.put_(indices, src, accumulate=accumulate)) + # Fill idx with valid indices. @staticmethod def _fill_indices(self, idx, dim, dim_size, elems_per_row, m, n, o): From 5e84403d5fc92ad2ed594b86a436c571dac55d30 Mon Sep 17 00:00:00 2001 From: Ashish Date: Fri, 20 Jul 2018 17:10:20 -0700 Subject: [PATCH 14/23] Fix for half conversion for ROCm 1.8.2 (#9663) Summary: This PR contains the change for explicit conversion between ushort and __half required for ROCm 1.8.2 support bddppq Pull Request resolved: https://github.com/pytorch/pytorch/pull/9663 Differential Revision: D8943937 Pulled By: bddppq fbshipit-source-id: 16102f9dbc68ed4ece2e8fc244825c3992c24901 --- caffe2/utils/conversions.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/caffe2/utils/conversions.h b/caffe2/utils/conversions.h index fbe30fbadac90..0b63b47ade393 100644 --- a/caffe2/utils/conversions.h +++ b/caffe2/utils/conversions.h @@ -187,7 +187,7 @@ CONVERSIONS_DECL float16 To(const float in) { #endif // CUDA_VERSION >= 9000 #elif __HIP_DEVICE_COMPILE__ float16 ret; - ret.x = __float2half(in); + ret.x = __half_as_ushort(__float2half(in)); return ret; #else return cpu_float2half_rn(in); @@ -206,7 +206,7 @@ CONVERSIONS_DECL float To(const float16 in) { return __half2float(tmp); #elif __HIP_DEVICE_COMPILE__ __half tmp; - tmp = in.x; + tmp = __ushort_as_half(in.x); return __half2float(tmp); #else return cpu_half2float(in); From 23ed26a0c399c121fc1bcb5824dff33d8eb463b4 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 20 Jul 2018 19:26:17 -0700 Subject: [PATCH 15/23] Guard include of cuda-only header comm.h (#9656) Summary: Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/9656 Reviewed By: colesbury Differential Revision: D8941361 Pulled By: ezyang fbshipit-source-id: c18cb0e606ae0608e5892040192b8792ae542b74 --- test/cpp/api/parallel.cpp | 4 ++++ torch/csrc/api/include/torch/nn/parallel/data_parallel.h | 4 ++++ torch/csrc/autograd/functions/comm.cpp | 4 ++++ torch/csrc/autograd/functions/comm.h | 4 ++++ torch/csrc/cuda/comm.cpp | 4 ++++ torch/csrc/cuda/comm.h | 4 ++++ 6 files changed, 24 insertions(+) diff --git a/test/cpp/api/parallel.cpp b/test/cpp/api/parallel.cpp index 2d8f413c05300..1c11e87a533ef 100644 --- a/test/cpp/api/parallel.cpp +++ b/test/cpp/api/parallel.cpp @@ -17,6 +17,8 @@ using Catch::StartsWith; using namespace torch::autograd; using namespace torch::nn; +#ifdef USE_CUDA + TEST_CASE("Parallel/DifferentiableScatter", "[multi-cuda]") { Scatter scatter( {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)}); @@ -228,3 +230,5 @@ TEST_CASE("Parallel/DataParallelUsesAllAvailableCUDADevices", "[cuda]") { REQUIRE(output[i].toCInt() == i); } } + +#endif diff --git a/torch/csrc/api/include/torch/nn/parallel/data_parallel.h b/torch/csrc/api/include/torch/nn/parallel/data_parallel.h index 82150658dcffe..09df4b2b1e441 100644 --- a/torch/csrc/api/include/torch/nn/parallel/data_parallel.h +++ b/torch/csrc/api/include/torch/nn/parallel/data_parallel.h @@ -161,6 +161,7 @@ Tensor data_parallel( return module->forward(std::move(input)).to(*output_device); } +#ifdef USE_CUDA autograd::Scatter scatter(*devices, /*chunk_sizes=*/at::nullopt, dim); auto scattered_inputs = scatter.apply({std::move(input)}); @@ -169,6 +170,9 @@ Tensor data_parallel( return autograd::Gather(*output_device, dim) .apply(std::move(outputs)) .front(); +#else + AT_ERROR("data_parallel not supported without CUDA"); +#endif } } // namespace parallel diff --git a/torch/csrc/autograd/functions/comm.cpp b/torch/csrc/autograd/functions/comm.cpp index 052aba90dcc4f..291a4205df16f 100644 --- a/torch/csrc/autograd/functions/comm.cpp +++ b/torch/csrc/autograd/functions/comm.cpp @@ -1,3 +1,5 @@ +#ifdef USE_CUDA + #include #include @@ -129,3 +131,5 @@ variable_list Gather::apply(variable_list&& inputs) { } // namespace autograd } // namespace torch + +#endif diff --git a/torch/csrc/autograd/functions/comm.h b/torch/csrc/autograd/functions/comm.h index a7c24748ca01f..b1a4b2bc3d031 100644 --- a/torch/csrc/autograd/functions/comm.h +++ b/torch/csrc/autograd/functions/comm.h @@ -1,5 +1,7 @@ #pragma once +#ifdef USE_CUDA + #include #include @@ -39,3 +41,5 @@ struct Gather : public Function { } // namespace autograd } // namespace torch + +#endif diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index 9c3b8a305a5ae..0e869876e8e1f 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -1,5 +1,7 @@ #include +#ifdef USE_CUDA + #include #include @@ -199,3 +201,5 @@ at::Tensor gather( return result; } }} // namespace torch::cuda + +#endif diff --git a/torch/csrc/cuda/comm.h b/torch/csrc/cuda/comm.h index caf7811afd1ae..b6d5c9b66fd75 100644 --- a/torch/csrc/cuda/comm.h +++ b/torch/csrc/cuda/comm.h @@ -1,5 +1,7 @@ #pragma once +#ifdef USE_CUDA + #include #include #include @@ -27,3 +29,5 @@ at::Tensor gather( int64_t dim, at::optional destination_index); }} + +#endif From b5c8d59451be8ae2e892afb28d0dc39661c666f1 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 20 Jul 2018 20:35:20 -0700 Subject: [PATCH 16/23] Add a CUDAContext header include Summary: Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/9662 Differential Revision: D8945581 Pulled By: ezyang fbshipit-source-id: 2fe0adc96456788579f7d6f1c4513fe45360c030 --- torch/csrc/autograd/functions/comm.cpp | 1 + torch/csrc/autograd/functions/comm.h | 1 + 2 files changed, 2 insertions(+) diff --git a/torch/csrc/autograd/functions/comm.cpp b/torch/csrc/autograd/functions/comm.cpp index 291a4205df16f..0af03cbfa39f5 100644 --- a/torch/csrc/autograd/functions/comm.cpp +++ b/torch/csrc/autograd/functions/comm.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include diff --git a/torch/csrc/autograd/functions/comm.h b/torch/csrc/autograd/functions/comm.h index b1a4b2bc3d031..df75799a5ef84 100644 --- a/torch/csrc/autograd/functions/comm.h +++ b/torch/csrc/autograd/functions/comm.h @@ -6,6 +6,7 @@ #include #include +#include #include #include From 3bb8c5eab1029a7de971b7368e43be06ef97182d Mon Sep 17 00:00:00 2001 From: Owen Anderson Date: Fri, 20 Jul 2018 22:22:08 -0700 Subject: [PATCH 17/23] Allow MKLDNN on macOS, and any other OS where CMake is able to detect it. Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9638 Reviewed By: soumith Differential Revision: D8946130 Pulled By: resistor fbshipit-source-id: 87bd9cb12608467b05bd4998fdb00bfdbd038ca2 --- tools/setup_helpers/mkldnn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/setup_helpers/mkldnn.py b/tools/setup_helpers/mkldnn.py index cc230e5550b45..47a10d04785c1 100644 --- a/tools/setup_helpers/mkldnn.py +++ b/tools/setup_helpers/mkldnn.py @@ -16,7 +16,7 @@ def gather_paths(env_vars): MKLDNN_LIB_DIR = None MKLDNN_INCLUDE_DIR = None MKLDNN_LIBRARY = None -if (IS_LINUX or IS_WINDOWS) and not check_env_flag('NO_MKLDNN'): +if not check_env_flag('NO_MKLDNN'): lib_paths = list(filter(bool, [ os.getenv('MKLDNN_LIB_DIR'), os.path.join(MKLDNN_HOME, 'lib'), From a01d6f01b5547a396d6e28f95538cd061e14d29a Mon Sep 17 00:00:00 2001 From: Xiaomeng Yang Date: Sat, 21 Jul 2018 12:40:40 -0700 Subject: [PATCH 18/23] Update channel_shuffle_op and transpose 2d to speed up ShuffleNet (#9525) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9525 Update channel_shuffle_op and transpose 2d to speed up ShuffleNet Reviewed By: houseroad Differential Revision: D8889361 fbshipit-source-id: 60196e819b6842becc53b4859b62d4419a0e2c6e --- .../operators/operator_fallback_ideep.cc | 2 +- caffe2/mkl/operators/operator_fallback_mkl.cc | 2 +- caffe2/operators/channel_shuffle_op.cc | 194 ++++++++++++++++-- caffe2/operators/channel_shuffle_op.h | 156 +++++--------- caffe2/operators/channel_shuffle_op_gpu.cu | 13 +- .../operator_test/channel_shuffle_test.py | 71 ++++--- .../operator_test/group_norm_op_test.py | 2 +- caffe2/utils/math_cpu.cc | 112 ++++++++-- 8 files changed, 379 insertions(+), 173 deletions(-) diff --git a/caffe2/ideep/operators/operator_fallback_ideep.cc b/caffe2/ideep/operators/operator_fallback_ideep.cc index d222c6958067c..0d8b6fd55b205 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.cc +++ b/caffe2/ideep/operators/operator_fallback_ideep.cc @@ -43,7 +43,7 @@ struct SigmoidCPUFunctor { REGISTER_IDEEP_OPERATOR(Softmax, IDEEPFallbackOp>); REGISTER_IDEEP_OPERATOR( ChannelShuffle, - IDEEPFallbackOp>); + IDEEPFallbackOp>); REGISTER_IDEEP_OPERATOR( LabelCrossEntropy, IDEEPFallbackOp>); diff --git a/caffe2/mkl/operators/operator_fallback_mkl.cc b/caffe2/mkl/operators/operator_fallback_mkl.cc index 106fa05dec70a..56e2bc687bb68 100644 --- a/caffe2/mkl/operators/operator_fallback_mkl.cc +++ 
b/caffe2/mkl/operators/operator_fallback_mkl.cc @@ -76,7 +76,7 @@ REGISTER_MKL_OPERATOR( mkl::MKLFallbackOp>); REGISTER_MKL_OPERATOR( ChannelShuffle, - mkl::MKLFallbackOp>); + mkl::MKLFallbackOp>); REGISTER_MKL_OPERATOR( NCHW2NHWC, mkl::MKLFallbackOp>); diff --git a/caffe2/operators/channel_shuffle_op.cc b/caffe2/operators/channel_shuffle_op.cc index 67bdbfb3f888b..31d34b01fe2e7 100644 --- a/caffe2/operators/channel_shuffle_op.cc +++ b/caffe2/operators/channel_shuffle_op.cc @@ -1,23 +1,175 @@ #include "channel_shuffle_op.h" +#include +#include +#include + +#ifdef CAFFE2_USE_MKL +#include +#endif // CAFFE2_USE_MKL + +#include "caffe2/utils/eigen_utils.h" +#include "caffe2/utils/math.h" + namespace caffe2 { -class GetChannelShuffleGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - def_.type() + "Gradient", - "", - vector{GO(0)}, - vector{GI(0)}); +namespace { + +template +void RunChannelShuffleNCHW( + const int N, + const int G, + const int K, + const int HxW, + const T* X, + T* Y) { + const int stride = G * K * HxW; + for (int i = 0; i < N; ++i) { + if (G < K) { + EigenMatrixMap Y_mat(Y, G * HxW, K); + for (int j = 0; j < G; ++j) { + Y_mat.block(j * HxW, 0, HxW, K) = + ConstEigenMatrixMap(X + j * K * HxW, HxW, K); + } + } else { + ConstEigenMatrixMap X_mat(X, K * HxW, G); + for (int j = 0; j < K; ++j) { + EigenMatrixMap(Y + j * G * HxW, HxW, G) = + X_mat.block(j * HxW, 0, HxW, G); + } + } + X += stride; + Y += stride; } -}; +} + +template +void RunChannelShuffleNHWC( + const int N, + const int G, + const int K, + const int HxW, + const T* X, + T* Y, + CPUContext* context) { + const std::array dims = {G, K}; + const std::array axes = {1, 0}; + const int M = N * HxW; + const int C = G * K; + for (int i = 0; i < M; ++i) { + math::Transpose(2, dims.data(), axes.data(), X, Y, context); + X += C; + Y += C; + } +} + +#ifdef CAFFE2_USE_MKL + +template <> +void RunChannelShuffleNCHW( + const int N, + const int G, + const int K, + const int HxW, + const float* X, + float* Y) { + const int stride = G * K * HxW; + for (int i = 0; i < N; ++i) { + if (G < K) { + for (int j = 0; j < G; ++j) { + mkl_somatcopy( + 'R', 'N', K, HxW, 1.0f, X + j * K * HxW, HxW, Y + j * HxW, G * HxW); + } + } else { + for (int j = 0; j < K; ++j) { + mkl_somatcopy( + 'R', 'N', G, HxW, 1.0f, X + j * HxW, K * HxW, Y + j * G * HxW, HxW); + } + } + X += stride; + Y += stride; + } +} + +#endif // CAFFE2_USE_MKL + +} // namespace + +template <> +bool ChannelShuffleOp::RunOnDeviceWithOrderNCHW() { + const auto& X = Input(0); + auto* Y = Output(0); + Y->ResizeLike(X); + const int N = X.dim32(0); + const int C = X.dim32(1); + const int G = group_; + CAFFE_ENFORCE_EQ(C % G, 0); + const int K = C / G; + const int HxW = X.size() / (N * C); + const float* X_data = X.data(); + float* Y_data = Y->mutable_data(); + RunChannelShuffleNCHW(N, G, K, HxW, X_data, Y_data); + return true; +} // namespace caffe2 -REGISTER_CPU_OPERATOR(ChannelShuffle, ChannelShuffleOp); +template <> +bool ChannelShuffleOp::RunOnDeviceWithOrderNHWC() { + const auto& X = Input(0); + auto* Y = Output(0); + Y->ResizeLike(X); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = X.dim32(ndim - 1); + const int G = group_; + CAFFE_ENFORCE_EQ(C % G, 0); + const int K = C / G; + const int HxW = X.size() / (N * C); + const float* X_data = X.data(); + float* Y_data = Y->mutable_data(); + RunChannelShuffleNHWC(N, G, K, HxW, X_data, Y_data, &context_); + return 
true; +} + +template <> +bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNCHW() { + const auto& dY = Input(0); + auto* dX = Output(0); + dX->ResizeLike(dY); + const int N = dY.dim32(0); + const int C = dY.dim32(1); + const int G = group_; + CAFFE_ENFORCE_EQ(C % G, 0); + const int K = C / G; + const int HxW = dY.size() / (N * C); + const float* dY_data = dY.data(); + float* dX_data = dX->mutable_data(); + RunChannelShuffleNCHW(N, K, G, HxW, dY_data, dX_data); + return true; +} + +template <> +bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNHWC() { + const auto& dY = Input(0); + auto* dX = Output(0); + dX->ResizeLike(dY); + const int ndim = dY.ndim(); + const int N = dY.dim32(0); + const int C = dY.dim32(ndim - 1); + const int G = group_; + CAFFE_ENFORCE_EQ(C % G, 0); + const int K = C / G; + const int HxW = dY.size() / (N * C); + const float* dY_data = dY.data(); + float* dX_data = dX->mutable_data(); + RunChannelShuffleNHWC(N, K, G, HxW, dY_data, dX_data, &context_); + return true; +} + +REGISTER_CPU_OPERATOR(ChannelShuffle, ChannelShuffleOp); REGISTER_CPU_OPERATOR( ChannelShuffleGradient, - ChannelShuffleGradientOp); -REGISTER_GRADIENT(ChannelShuffle, GetChannelShuffleGradient); + ChannelShuffleGradientOp); + OPERATOR_SCHEMA(ChannelShuffle) .IdenticalTypeAndShape() .NumInputs(1) @@ -27,4 +179,22 @@ OPERATOR_SCHEMA(ChannelShuffleGradient) .IdenticalTypeAndShape() .NumInputs(1) .NumOutputs(1); + +namespace { + +class GetChannelShuffleGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + std::vector GetGradientDefs() override { + return SingleGradientDef( + "ChannelShuffleGradient", + "", + std::vector{GO(0)}, + std::vector{GI(0)}); + } +}; + +} // namespace + +REGISTER_GRADIENT(ChannelShuffle, GetChannelShuffleGradient); + } // namespace caffe2 diff --git a/caffe2/operators/channel_shuffle_op.h b/caffe2/operators/channel_shuffle_op.h index 05e66bf5f78c6..87b4e49530345 100644 --- a/caffe2/operators/channel_shuffle_op.h +++ b/caffe2/operators/channel_shuffle_op.h @@ -1,124 +1,66 @@ -#pragma once -#include "caffe2/utils/math.h" -#include "caffe2/operators/conv_pool_op_base.h" +#ifndef CAFFE2_OPERATORS_CHANNEL_SHUFFLE_OP_H_ +#define CAFFE2_OPERATORS_CHANNEL_SHUFFLE_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" namespace caffe2 { -template -class ChannelShuffleOp final : public ConvPoolOpBase { +template +class ChannelShuffleOp final : public Operator { public: - USE_OPERATOR_FUNCTIONS(Context); + USE_OPERATOR_CONTEXT_FUNCTIONS; + ChannelShuffleOp(const OperatorDef& operator_def, Workspace* ws) - : ConvPoolOpBase(operator_def, ws) {} - - bool RunOnDeviceWithOrderNCHW() override { - const auto& X = Input(0); - auto* Y = Output(0); - Y->ResizeLike(X); - const auto C = X.dim32(1); - const auto G = this->group_; - CAFFE_ENFORCE(C % G == 0, ""); - const auto K = C / G; - const auto S = X.dim32(2) * X.dim32(3); - for (auto n = 0; n < X.dim32(0); ++n) { - for (auto g = 0; g < G; ++g) { - // Scatter the group g block (of size KxS) to output channels - // g + 0 * G, g + 1 * G, g + 2 * G, g + G * (K - 1) etc. 
- math::CopyMatrix( - X.itemsize(), - K, - S, - X.template data() + g * K * S + n * C * S, - S, - Y->template mutable_data() + g * S + n * C * S, - G * S, - &context_, - X.meta().copy()); - } - } - return true; + : Operator(operator_def, ws), + order_(StringToStorageOrder( + OperatorBase::GetSingleArgument("order", "NCHW"))), + OP_SINGLE_ARG(int, "group", group_, 1) { + CAFFE_ENFORCE_NE(order_, StorageOrder::UNKNOWN); } - bool RunOnDeviceWithOrderNHWC() override { - const auto& X = Input(0); - auto* Y = Output(0); - Y->ResizeLike(X); - const auto C = X.dim32(3); - const auto G = this->group_; - CAFFE_ENFORCE(C % G == 0, ""); - const auto K = C / G; - std::array dims = {G, K}; - std::array axes = {1, 0}; - for (auto i = 0; i < X.size(); i += C) { - // Transpose each C = GxK matrix - math::Transpose( - 2, - dims.data(), - axes.data(), - X.template data() + i, - Y->template mutable_data() + i, - &context_); - } - return true; + bool RunOnDevice() override { + return order_ == StorageOrder::NCHW ? RunOnDeviceWithOrderNCHW() + : RunOnDeviceWithOrderNHWC(); } + + bool RunOnDeviceWithOrderNCHW(); + + bool RunOnDeviceWithOrderNHWC(); + + private: + const StorageOrder order_; + const int group_; }; -template -class ChannelShuffleGradientOp final : public ConvPoolOpBase { +template +class ChannelShuffleGradientOp final : public Operator { public: - USE_OPERATOR_FUNCTIONS(Context); + USE_OPERATOR_CONTEXT_FUNCTIONS; + ChannelShuffleGradientOp(const OperatorDef& operator_def, Workspace* ws) - : ConvPoolOpBase(operator_def, ws) {} - - bool RunOnDeviceWithOrderNCHW() override { - const auto& dY = Input(0); - auto* dX = Output(0); - dX->ResizeLike(dY); - const auto C = dY.dim32(1); - const auto G = this->group_; - CAFFE_ENFORCE(C % G == 0, ""); - const auto K = C / G; - const auto S = dY.dim32(2) * dY.dim32(3); - for (auto n = 0; n < dY.dim32(0); ++n) { - for (auto g = 0; g < G; ++g) { - // Gather the group g block (of size KxS) from output channels - // g + 0 * G, g + 1 * G, g + 2 * G, g + G * (K - 1) etc. - math::CopyMatrix( - dY.itemsize(), - K, - S, - dY.template data() + g * S + n * C * S, - G * S, - dX->template mutable_data() + g * K * S + n * C * S, - S, - &context_, - dY.meta().copy()); - } - } - return true; + : Operator(operator_def, ws), + order_(StringToStorageOrder( + OperatorBase::GetSingleArgument("order", "NCHW"))), + OP_SINGLE_ARG(int, "group", group_, 1) { + CAFFE_ENFORCE_NE(order_, StorageOrder::UNKNOWN); } - bool RunOnDeviceWithOrderNHWC() override { - const auto& dY = Input(0); - auto* dX = Output(0); - dX->ResizeLike(dY); - const auto C = dY.dim32(3); - const auto G = this->group_; - CAFFE_ENFORCE(C % G == 0, ""); - const auto K = C / G; - std::array dims = {K, G}; - std::array axes = {1, 0}; - for (auto i = 0; i < dY.size(); i += C) { - // Transpose each C = KxG matrix - math::Transpose( - 2, - dims.data(), - axes.data(), - dY.template data() + i, - dX->template mutable_data() + i, - &context_); - } - return true; + bool RunOnDevice() override { + return order_ == StorageOrder::NCHW ? 
RunOnDeviceWithOrderNCHW() + : RunOnDeviceWithOrderNHWC(); } + + bool RunOnDeviceWithOrderNCHW(); + + bool RunOnDeviceWithOrderNHWC(); + + private: + const StorageOrder order_; + const int group_; }; + } // namespace caffe2 + +#endif // CAFFE2_OPERATORS_CHANNEL_SHUFFLE_OP_H_ diff --git a/caffe2/operators/channel_shuffle_op_gpu.cu b/caffe2/operators/channel_shuffle_op_gpu.cu index 447ba55d3436f..ad9a0ab4e79d7 100644 --- a/caffe2/operators/channel_shuffle_op_gpu.cu +++ b/caffe2/operators/channel_shuffle_op_gpu.cu @@ -42,7 +42,7 @@ __global__ void ChannelShuffleNHWCKernel( } template <> -bool ChannelShuffleOp::RunOnDeviceWithOrderNCHW() { +bool ChannelShuffleOp::RunOnDeviceWithOrderNCHW() { const auto& X = Input(0); auto* Y = Output(0); Y->ResizeLike(X); @@ -61,7 +61,7 @@ bool ChannelShuffleOp::RunOnDeviceWithOrderNCHW() { } template <> -bool ChannelShuffleOp::RunOnDeviceWithOrderNHWC() { +bool ChannelShuffleOp::RunOnDeviceWithOrderNHWC() { const auto& X = Input(0); auto* Y = Output(0); Y->ResizeLike(X); @@ -79,7 +79,7 @@ bool ChannelShuffleOp::RunOnDeviceWithOrderNHWC() { } template <> -bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNCHW() { +bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNCHW() { const auto& dY = Input(0); auto* dX = Output(0); dX->ResizeLike(dY); @@ -98,7 +98,7 @@ bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNCHW() { } template <> -bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNHWC() { +bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNHWC() { const auto& dY = Input(0); auto* dX = Output(0); dX->ResizeLike(dY); @@ -115,8 +115,9 @@ bool ChannelShuffleGradientOp::RunOnDeviceWithOrderNHWC() { return true; } -REGISTER_CUDA_OPERATOR(ChannelShuffle, ChannelShuffleOp); +REGISTER_CUDA_OPERATOR(ChannelShuffle, ChannelShuffleOp); REGISTER_CUDA_OPERATOR( ChannelShuffleGradient, - ChannelShuffleGradientOp); + ChannelShuffleGradientOp); + } // namespace caffe2 diff --git a/caffe2/python/operator_test/channel_shuffle_test.py b/caffe2/python/operator_test/channel_shuffle_test.py index 93b9328950f34..e17d7a736063c 100644 --- a/caffe2/python/operator_test/channel_shuffle_test.py +++ b/caffe2/python/operator_test/channel_shuffle_test.py @@ -11,38 +11,49 @@ class ChannelShuffleOpsTest(hu.HypothesisTestCase): - @given( - channels_per_group=st.integers(min_value=1, max_value=5), - groups=st.integers(min_value=1, max_value=5), - n=st.integers(min_value=1, max_value=2), - order=st.sampled_from(["NCHW", "NHWC"]), - **hu.gcs) - def test_channel_shuffle(self, channels_per_group, groups, n, order, gc, dc): - X = np.random.randn( - n, channels_per_group * groups, 5, 6).astype(np.float32) - if order == "NHWC": - # NCHW -> NHWC - X = X.transpose((0, 2, 3, 1)) - - op = core.CreateOperator("ChannelShuffle", ["X"], ["Y"], - group=groups, kernel=1, order=order, - device_option=gc) + def _channel_shuffle_nchw_ref(self, X, group): + dims = X.shape + N = dims[0] + C = dims[1] + G = group + K = int(C / G) + X = X.reshape(N, G, K, -1) + Y = np.transpose(X, axes=(0, 2, 1, 3)) + return [Y.reshape(dims)] + + def _channel_shuffle_nhwc_ref(self, X, group): + dims = X.shape + N = dims[0] + C = dims[-1] + G = group + K = int(C / G) + X = X.reshape(N, -1, G, K) + Y = np.transpose(X, axes=(0, 1, 3, 2)) + return [Y.reshape(dims)] + + @given(N=st.integers(1, 5), G=st.integers(1, 5), K=st.integers(1, 5), + H=st.integers(1, 5), W=st.integers(1, 5), + order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) + def test_channel_shuffle(self, N, G, K, H, W, order, gc, dc): + C = G * K + if order == "NCHW": + X = 
np.random.randn(N, C, H, W).astype(np.float32) + else: + X = np.random.randn(N, H, W, C).astype(np.float32) + + op = core.CreateOperator( + "ChannelShuffle", + ["X"], + ["Y"], + group=G, + order=order, + ) def channel_shuffle_ref(X): - if order == "NHWC": - # NHWC -> NCHW - X = X.transpose((0, 3, 1, 2)) - Y_r = X.reshape(X.shape[0], - groups, - X.shape[1] // groups, - X.shape[2], - X.shape[3]) - Y_trns = Y_r.transpose((0, 2, 1, 3, 4)) - Y_reshaped = Y_trns.reshape(X.shape) - if order == "NHWC": - # NCHW -> NHWC - Y_reshaped = Y_reshaped.transpose((0, 2, 3, 1)) - return (Y_reshaped,) + if order == "NCHW": + return self._channel_shuffle_nchw_ref(X, G) + else: + return self._channel_shuffle_nhwc_ref(X, G) self.assertReferenceChecks(gc, op, [X], channel_shuffle_ref) self.assertGradientChecks(gc, op, [X], 0, [0]) diff --git a/caffe2/python/operator_test/group_norm_op_test.py b/caffe2/python/operator_test/group_norm_op_test.py index 3461261c2221c..caa9121e924a0 100644 --- a/caffe2/python/operator_test/group_norm_op_test.py +++ b/caffe2/python/operator_test/group_norm_op_test.py @@ -40,7 +40,7 @@ def group_norm_nhwc_ref(self, X, gamma, beta, group, epsilon): Y = gamma * (X - mu) / std + beta return [Y.reshape(dims), mu.reshape(N, G), (1.0 / std).reshape(N, G)] - @given(N=st.integers(1, 5), G=st.integers(1, 5), D=st.integers(2, 2), + @given(N=st.integers(1, 5), G=st.integers(1, 5), D=st.integers(1, 5), H=st.integers(2, 5), W=st.integers(2, 5), epsilon=st.floats(min_value=1e-5, max_value=1e-4), order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index 6ebf41ab7cba1..6aae82ea6554d 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -11,7 +11,6 @@ // platforms, it allows one to quickly port Caffe2 to different platforms // where BLAS may not be present. 
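The math_cpu.cc hunks below split CPU transpose into an identity-permutation fast path, a dedicated 2D kernel (MKL/HPTT/Eigen where available), and a generic N-D fallback. A minimal standalone sketch of that dispatch, with naive loops standing in for the MKL/HPTT/Eigen kernels (illustrative only, not the Caffe2 implementation):

```cpp
#include <cstdint>
#include <cstring>
#include <functional>
#include <numeric>
#include <vector>

// Naive row-major 2D transpose: Y[j][i] = X[i][j].
template <typename T>
void Transpose2D(int rows, int cols, const T* X, T* Y) {
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j)
      Y[j * rows + i] = X[i * cols + j];
}

// Dispatch in the spirit of TransposeCPUImpl: plain copy when the permutation
// is the identity, 2D kernel when ndim == 2, otherwise a general N-D routine
// (omitted here).
template <typename T>
void TransposeSketch(const std::vector<int>& dims, const std::vector<int>& axes,
                     const T* X, T* Y) {
  bool identity = true;
  for (size_t i = 0; i < axes.size(); ++i)
    identity &= (axes[i] == static_cast<int>(i));
  if (identity) {
    const int64_t size = std::accumulate(
        dims.begin(), dims.end(), int64_t{1}, std::multiplies<int64_t>());
    std::memcpy(Y, X, size * sizeof(T));
  } else if (dims.size() == 2) {
    Transpose2D(dims[0], dims[1], X, Y);
  }
  // else: general strided N-D transpose, not shown.
}
```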
-#include "caffe2/utils/eigen_utils.h" #include "caffe2/utils/math.h" #include @@ -30,6 +29,7 @@ #include "caffe2/core/context.h" #include "caffe2/utils/cpu_neon.h" +#include "caffe2/utils/eigen_utils.h" #include "Eigen/Core" #include "Eigen/Dense" @@ -2476,7 +2476,7 @@ namespace { #ifdef CAFFE2_USE_HPTT -bool TryTransposeWithHPTT( +bool TransposeWithHPTT( const int ndim, const int* dims, const int* axes, @@ -2511,6 +2511,62 @@ bool TryTransposeWithHPTT( #endif // CAFFE2_USE_HPTT +template +void Tranpose2D(const int rows, const int cols, const T* X, T* Y); + +#ifdef CAFFE2_USE_MKL + +#define DELEGATE_TRANSPOSE_2D_FUNCTION(T, Func) \ + template <> \ + void Tranpose2D(const int rows, const int cols, const T* X, T* Y) { \ + Func('R', 'T', rows, cols, T(1), X, cols, Y, rows); \ + } +DELEGATE_TRANSPOSE_2D_FUNCTION(float, mkl_somatcopy); +DELEGATE_TRANSPOSE_2D_FUNCTION(double, mkl_domatcopy); +#undef DELEGATE_TRANSPOSE_2D_FUNCTION + +#endif // CAFFE2_USE_MKL + +#define CAFFE2_SPECIALIZED_TRANSPOSE_2D(T) \ + template <> \ + void Tranpose2D(const int rows, const int cols, const T* X, T* Y) { \ + EigenMatrixMap(Y, rows, cols) = \ + ConstEigenMatrixMap(X, cols, rows).transpose(); \ + } + +#ifndef CAFFE2_USE_MKL + +template <> +void Tranpose2D( + const int rows, + const int cols, + const float* X, + float* Y) { +#ifdef CAFFE2_USE_HPTT + const std::array dims = {rows, cols}; + const std::array axes = {1, 0}; + if (TransposeWithHPTT(2, dims.data(), axes.data(), X, Y)) { + return; + } +#endif // CAFFE2_USE_HPTT + EigenMatrixMap(Y, rows, cols) = + ConstEigenMatrixMap(X, cols, rows).transpose(); +} + +CAFFE2_SPECIALIZED_TRANSPOSE_2D(double) + +#endif // CAFFE2_USE_MKL + +CAFFE2_SPECIALIZED_TRANSPOSE_2D(int) +CAFFE2_SPECIALIZED_TRANSPOSE_2D(TIndex) +#ifdef CAFFE2_UNIQUE_LONG_TYPEMETA +CAFFE2_SPECIALIZED_TRANSPOSE_2D(long) +#endif +CAFFE2_SPECIALIZED_TRANSPOSE_2D(std::uint8_t) +CAFFE2_SPECIALIZED_TRANSPOSE_2D(std::uint16_t) + +#undef CAFFE2_SPECIALIZED_TRANSPOSE_2D + std::vector ComputeXStrides(const int ndim, const int* dims, const int* axes) { std::vector x_strides(ndim); @@ -2527,7 +2583,7 @@ ComputeXStrides(const int ndim, const int* dims, const int* axes) { } template -void TransposeCPUImpl( +void TransposeND( const int ndim, const int* dims, const int* axes, @@ -2547,10 +2603,6 @@ void TransposeCPUImpl( const int itr_axes = ndim - num_shared_idx; const int num_blocks = std::accumulate( Y_dims.cbegin(), Y_dims.cbegin() + itr_axes, 1, std::multiplies()); - if (ndim < 2 || itr_axes == 0) { - std::memcpy(Y, X, num_blocks * block_size * sizeof(T)); - return; - } const std::vector X_strides = ComputeXStrides(itr_axes, dims, axes); std::vector index(itr_axes, 0); for (int Y_index = 0; Y_index < num_blocks; ++Y_index) { @@ -2568,24 +2620,53 @@ void TransposeCPUImpl( } } -} // namespace +template +void TransposeCPUImpl( + const int ndim, + const int* dims, + const int* axes, + const T* X, + T* Y) { + if (utils::IsIdentityPermutation(ndim, axes)) { + const int size = + std::accumulate(dims, dims + ndim, 1, std::multiplies()); + std::memcpy(Y, X, size * sizeof(T)); + return; + } + if (ndim == 2) { + Tranpose2D(dims[0], dims[1], X, Y); + } else { + TransposeND(ndim, dims, axes, X, Y); + } +} template <> -void Transpose( +void TransposeCPUImpl( const int ndim, const int* dims, const int* axes, const float* X, - float* Y, - CPUContext* /* context */) { -#ifdef CAFFE2_USE_HPTT - if (TryTransposeWithHPTT(ndim, dims, axes, X, Y)) { + float* Y) { + if (utils::IsIdentityPermutation(ndim, axes)) { + const int size = + 
std::accumulate(dims, dims + ndim, 1, std::multiplies()); + std::memcpy(Y, X, size * sizeof(float)); return; } -#endif // CAFFE2_USE_HPTT - TransposeCPUImpl(ndim, dims, axes, X, Y); + if (ndim == 2) { + Tranpose2D(dims[0], dims[1], X, Y); + } else { +#ifdef CAFFE2_USE_HPTT + if (TransposeWithHPTT(ndim, dims, axes, X, Y)) { + return; + } +#endif + TransposeND(ndim, dims, axes, X, Y); + } } +} // namespace + #define CAFFE2_SPECIALIZED_TRANSPOSE(T) \ template <> \ void Transpose( \ @@ -2597,6 +2678,7 @@ void Transpose( CPUContext* /* context */) { \ TransposeCPUImpl(ndim, dims, axes, X, Y); \ } +CAFFE2_SPECIALIZED_TRANSPOSE(float) CAFFE2_SPECIALIZED_TRANSPOSE(double) CAFFE2_SPECIALIZED_TRANSPOSE(int) CAFFE2_SPECIALIZED_TRANSPOSE(TIndex) From 769cb5a6405b39a0678e6bc4f2d6fea62e0d3f12 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Sat, 21 Jul 2018 21:37:30 -0700 Subject: [PATCH 19/23] Add new ways of matching nodes with schemas in the JIT (#9567) Summary: **REVIEW LAST COMMIT ONLY** As discussed in our yesterday's meeting. Nodes can be now matched to particular overloads using the `matches(...)` function: ```cpp n->matches("aten::type_as(Tensor self, Tensor other) -> Tensor") ``` This also changes the shape prop and peephole passes to use those functions for matching. This fixes a few bugs, makes them much more robust, and prepares us for removal of attributes. zdevito Pull Request resolved: https://github.com/pytorch/pytorch/pull/9567 Reviewed By: zdevito Differential Revision: D8938482 Pulled By: apaszke fbshipit-source-id: eb2382eeeae99692aada2d78d5d0c87c8ef1545e --- aten/src/ATen/ExpandUtils.cpp | 12 +- aten/src/ATen/ExpandUtils.h | 3 +- aten/src/ATen/native/TensorShape.cpp | 2 +- torch/csrc/jit/autodiff.cpp | 8 +- torch/csrc/jit/function_schema.h | 33 +- torch/csrc/jit/ir.cpp | 54 ++- torch/csrc/jit/ir.h | 15 +- torch/csrc/jit/operator.cpp | 86 +++- torch/csrc/jit/operator.h | 9 +- torch/csrc/jit/passes/peephole.cpp | 94 ++--- torch/csrc/jit/passes/shape_analysis.cpp | 505 ++++++++++++----------- 11 files changed, 476 insertions(+), 345 deletions(-) diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp index f4a0ce4e53bad..f4114c6a3715c 100644 --- a/aten/src/ATen/ExpandUtils.cpp +++ b/aten/src/ATen/ExpandUtils.cpp @@ -29,11 +29,13 @@ std::vector infer_size(IntList a, IntList b) { } std::tuple, std::vector> inferExpandGeometry( - const Tensor& tensor, + IntList tensor_sizes, + IntList tensor_strides, IntList sizes) { int64_t ndim = sizes.size(); + int64_t tensor_dim = tensor_sizes.size(); - if (tensor.dim() == 0) { + if (tensor_dim == 0) { std::vector expandedStrides(ndim, 0); return std::tuple, std::vector>( sizes.vec(), expandedStrides); @@ -44,9 +46,9 @@ std::tuple, std::vector> inferExpandGeometry( // create a new geometry for the tensors for (int64_t i = ndim - 1; i >= 0; --i) { int64_t offset = ndim - 1 - i; - int64_t dim = tensor.dim() - 1 - offset; - int64_t size = (dim >= 0) ? tensor.sizes()[dim] : 1; - int64_t stride = (dim >= 0) ? tensor.strides()[dim] + int64_t dim = tensor_dim - 1 - offset; + int64_t size = (dim >= 0) ? tensor_sizes[dim] : 1; + int64_t stride = (dim >= 0) ? 
tensor_strides[dim] : expandedSizes[i + 1] * expandedStrides[i + 1]; int64_t targetSize = sizes[i]; if (targetSize == -1) { diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 2080e56a4e78b..4ed5d6254320f 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -10,7 +10,8 @@ namespace at { AT_API std::vector infer_size(IntList a, IntList b); -std::tuple, std::vector > inferExpandGeometry(const Tensor &tensor, IntList sizes); +std::tuple, std::vector > inferExpandGeometry( + IntList tensor_sizes, IntList tensor_strides, IntList sizes); // avoid copy-construction of Tensor by using a reference_wrapper. inline void check_defined(std::initializer_list> tensors, const char *api_name) { diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 13887a52633bc..db6a78cbc1652 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -126,7 +126,7 @@ Tensor expand(const Tensor& self, IntList size, bool implicit) { std::vector expandedSizes; std::vector expandedStrides; - std::tie(expandedSizes, expandedStrides) = inferExpandGeometry(self, size); + std::tie(expandedSizes, expandedStrides) = inferExpandGeometry(self.sizes(), self.strides(), size); return self.as_strided(expandedSizes, expandedStrides); } diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index 1b7784916910e..a91532f5af15d 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -90,7 +90,7 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val } else if (node->hasAttribute(attr::alpha)) { return {grads.at(0), grads.at(0) * at::Scalar(node->t(attr::alpha))}; } else { - return {grads.at(0), nullptr, grads.at(0) * node->getValue(attr::alpha)}; + return {grads.at(0), nullptr, grads.at(0) * node->input(attr::alpha)}; } case aten::sub: // o = self - alpha*other @@ -99,7 +99,7 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val } else if (node->hasAttribute(attr::alpha)) { return {grads.at(0), -grads.at(0) * at::Scalar(node->t(attr::alpha))}; } else { - return {grads.at(0), nullptr, grads.at(0) * node->getValue(attr::alpha)}; + return {grads.at(0), nullptr, grads.at(0) * node->input(attr::alpha)}; } case aten::mul: // o = self * other @@ -119,7 +119,7 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val return {grads.at(0) * (outputs.at(0))}; case aten::chunk: case aten::split: - return {SymbolicVariable::cat(grads, node->getValue(attr::dim))}; + return {SymbolicVariable::cat(grads, node->input(attr::dim))}; case aten::t: return {grads.at(0).t()}; case aten::neg: @@ -130,7 +130,7 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val case aten::type_as: return {grads.at(0).type_as(inputs.at(0))}; case aten::unsqueeze: - return {grads.at(0).squeeze(node->getValue(attr::dim))}; + return {grads.at(0).squeeze(node->input(attr::dim))}; case aten::mm: { SymbolicVariable dmat1, dmat2; if (auto type = inputs.at(0).value()->type()->cast()) { diff --git a/torch/csrc/jit/function_schema.h b/torch/csrc/jit/function_schema.h index 99222e88af25f..ec56f6144bfca 100644 --- a/torch/csrc/jit/function_schema.h +++ b/torch/csrc/jit/function_schema.h @@ -85,21 +85,30 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) { inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { // eventually this should look almost identical to python arg parser, but // it is simpler for now to work directly on this 
schema - auto emitList = [&](const std::vector& args) { + + out << schema.name; + out << "("; + + bool seen_kwarg_only = false; + for(size_t i = 0; i < schema.arguments.size(); ++i) { + if (i > 0) out << ", "; + if (schema.arguments[i].kwarg_only && !seen_kwarg_only) { + out << "*, "; + seen_kwarg_only = true; + } + out << schema.arguments[i]; + } + + out << ") -> "; + if (schema.returns.size() == 1) { + out << schema.returns.at(0).type->str(); + } else if (schema.returns.size() > 1) { out << "("; - for(size_t i = 0; i < args.size(); ++i) { - if(i > 0) - out << ", "; - out << args[i]; + for (size_t i = 0; i < schema.returns.size(); ++i) { + if (i > 0) out << ", "; + out << schema.returns[i].type->str(); } out << ")"; - }; - - out << schema.name; - emitList(schema.arguments); - if(schema.returns.size() > 1) { - out << " -> "; - emitList(schema.returns); } return out; } diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 39de66680b9ca..b68cec65cbf20 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -642,6 +642,10 @@ at::optional Node::get(Symbol name) { return getattr()(this, name); } auto inp = findInput(name); + const Argument & arg = inp.second; + if (!inp.first) { + return tensor_as(arg.default_value.value()); + } Node *producer = inp.first->node(); if (producer->kind() != prim::Constant) return at::nullopt; auto value = producer->t(attr::value); @@ -655,25 +659,30 @@ template at::optional> Node::get(Symbol name); at::optional Node::get(Symbol name) { // TODO (apaszke): remove once tracer and compiler stop emitting attributes - if (hasAttributes()) { - throw std::runtime_error("IValue Node::get() not implemented for the attribute case"); + if (hasAttribute(name)) { + switch (kindOf(name)) { + case AttributeKind::i: + return IValue{as_tensor(i(name))}; + case AttributeKind::t: + return IValue{as_tensor(t(name))}; + case AttributeKind::is: + return IValue{as_tensor(is(name))}; + default: + throw std::runtime_error("get() NYI"); + } } auto inp = findInput(name); + const Argument & arg = inp.second; + if (!inp.first) { + return IValue{arg.default_value.value()}; + } Node * producer = inp.first->node(); if (producer->kind() != prim::Constant) return at::nullopt; auto value = producer->t(attr::value); - const Argument & arg = inp.second; - if (arg.type->isSubtypeOf(*DynamicType::get())) { - return IValue{std::move(value)}; - } else if (arg.type->isSubtypeOf(*IntType::get())) { - return IValue{tensor_as(std::move(value))}; - } else if (arg.type->isSubtypeOf(*FloatType::get())) { - return IValue{tensor_as(std::move(value))}; - } - throw std::runtime_error("Unsupported case in Node::get! 
File a bug report."); + return IValue{std::move(value)}; } -Value* Node::getValue(Symbol name) { +Value* Node::input(Symbol name) { // TODO (apaszke): remove once tracer and compiler stop emitting attributes if (hasAttribute(name)) { switch (kindOf(name)) { @@ -687,9 +696,13 @@ Value* Node::getValue(Symbol name) { throw std::runtime_error("getValue() NYI"); } } - return findInput(name).first; + auto inp = findInput(name); + if (inp.first) return inp.first; + return owningGraph()->insertConstant(inp.second.default_value.value()); } +// XXX: the first coordinate can be a nullptr, which means that you should use +// the default value for this arg, because it's optional and missing std::pair Node::findInput(Symbol name) { if (!schema_) { findSchema(); @@ -700,13 +713,26 @@ std::pair Node::findInput(Symbol name) { const auto & arg = schema_->arguments[i]; if (hasAttributeS(arg.name)) continue; if (arg.name == name_str) { - return std::pair(input(input_i), arg); + if (input_i < inputs().size()) { + return std::pair(input(input_i), arg); + } else { + JIT_ASSERT(arg.default_value); + return std::pair(nullptr, arg); + } } input_i++; } throw std::runtime_error(std::string("Couldn't find an argument called ") + name.toQualString()); } +bool Node::matches(const char *signature_literal, at::ArrayRef const_inputs) { + if (!sig(signature_literal).matches(this)) return false; + for (Symbol s : const_inputs) { + if (!is_constant(s)) return false; + } + return true; +} + void Node::findSchema() { schema_ = &getOperatorFor(this).schema; } diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 3595b83aefda6..153828497d1bf 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -398,7 +398,12 @@ struct Node : public Attributes { template at::optional get(Symbol name); at::optional get(Symbol name); - Value* getValue(Symbol name); + Value* input(Symbol name); + + // Returns true if the value of input name is statically known + bool is_constant(Symbol name) { + return static_cast(get(name)); + } // Graphs @@ -657,6 +662,14 @@ struct Node : public Attributes { return static_cast(this); } + // XXX: this function is meant to be used with string literals only! 
+ bool matches(const char *signature_literal, at::ArrayRef const_inputs={}); + + const FunctionSchema& schema() { + if (!schema_) findSchema(); + return *schema_; + } + virtual ~Node() {} private: std::pair findInput(Symbol name); diff --git a/torch/csrc/jit/operator.cpp b/torch/csrc/jit/operator.cpp index 90c43fe7145dd..652eb90bf3797 100644 --- a/torch/csrc/jit/operator.cpp +++ b/torch/csrc/jit/operator.cpp @@ -223,15 +223,80 @@ struct SchemaParser { namespace { +std::string canonicalSchemaString(const FunctionSchema& schema) { + std::ostringstream out; + + out << schema.name; + out << "("; + + bool seen_kwarg_only = false; + for(size_t i = 0; i < schema.arguments.size(); ++i) { + if (i > 0) out << ", "; + if (schema.arguments[i].kwarg_only && !seen_kwarg_only) { + out << "*, "; + seen_kwarg_only = true; + } + const auto & arg = schema.arguments[i]; + out << arg.type->str() << " " << arg.name; + } + + out << ") -> "; + if (schema.returns.size() == 1) { + out << schema.returns.at(0).type->str(); + } else if (schema.returns.size() > 1) { + out << "("; + for (size_t i = 0; i < schema.returns.size(); ++i) { + if (i > 0) out << ", "; + out << schema.returns[i].type->str(); + } + out << ")"; + } + return out.str(); +} + using OperatorMap = std::unordered_map>>; struct OperatorRegistry { OperatorMap operators; std::mutex lock; + // Those two maps are used to implement lookupByLiteral, which is needed for the n->match(...) calls. + // Basically, every function schema is assigned a unique string you can use to match it. However, + // parsing those strings or comparing and hashing them character by character would be very slow, so + // we use a trick here! Every string literal in your program is guaranteed to have static storage + // duration and so its address won't change at runtime. This allows us to memoize answerts for every + // pointer, which is done by the operators_by_sig_literal map. Still, this map is initially + // empty, and so we still need to do the complete string matching at the first time, which is implemented + // by performing a lookup in the operators_by_sig map. 
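A self-contained sketch of the literal-address memoization described in that comment, with toy types in place of the real Operator registry (assumed names, not the PyTorch API):

```cpp
#include <cassert>
#include <string>
#include <unordered_map>

// Toy record standing in for std::shared_ptr<Operator>.
struct Op { std::string canonical; };

std::unordered_map<std::string, Op*> by_signature;         // slow, string-keyed
std::unordered_map<const char*, Op*> by_signature_literal; // fast, pointer-keyed

// Meant to be called with string literals only: a literal's address is stable
// for the lifetime of the program, so the pointer itself can be the cache key.
Op* lookupByLiteral(const char* sig) {
  auto it = by_signature_literal.find(sig);
  if (it == by_signature_literal.end()) {
    // First time we see this pointer: do the full string comparison once.
    auto slow = by_signature.find(sig);
    assert(slow != by_signature.end());
    it = by_signature_literal.emplace_hint(it, sig, slow->second);
  }
  return it->second;
}
```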
+ std::unordered_map> operators_by_sig; + std::unordered_map> operators_by_sig_literal; void registerOperator(Operator&& op){ std::lock_guard guard(lock); + Symbol sym = Symbol::fromQualString(op.schema.name); - operators[sym].push_back(std::make_shared(std::move(op))); + auto op_ptr = std::make_shared(std::move(op)); + + operators[sym].push_back(op_ptr); + + operators_by_sig[canonicalSchemaString(op.schema)] = op_ptr; } + + Operator& lookupByLiteral(const char * name) { + auto it = operators_by_sig_literal.find(name); + if (it == operators_by_sig_literal.end()) { + auto op_ptr_it = operators_by_sig.find(name); + // Handy debugging code that dumps all operators we know about on mismatch +#if 0 + if (op_ptr_it == operators_by_sig.end()) { + for (auto & entry : operators_by_sig) { + std::cout << entry.first << std::endl; + } + } +#endif + JIT_ASSERTM(op_ptr_it != operators_by_sig.end(), "Couldn't find an operator for %s", name); + it = operators_by_sig_literal.emplace_hint(it, name, op_ptr_it->second); + } + return *it->second; + } + const std::vector>& getOperators(Symbol name) { std::lock_guard guard(lock); static std::vector> empty; @@ -242,19 +307,23 @@ struct OperatorRegistry { } }; -OperatorRegistry& getRegsitry() { +OperatorRegistry& getRegistry() { static OperatorRegistry r; return r; } -} +} // anonymous namespace void registerOperator(Operator&& op) { - getRegsitry().registerOperator(std::move(op)); + getRegistry().registerOperator(std::move(op)); } const std::vector>& getAllOperatorsFor(Symbol name) { - return getRegsitry().getOperators(name); + return getRegistry().getOperators(name); +} + +Operator& sig(const char *signature) { + return getRegistry().lookupByLiteral(signature); } FunctionSchema parseSchema(const std::string& schema) { @@ -293,7 +362,10 @@ bool typeMatches(TypePtr actual, TypePtr formal) { return false; } -bool Operator::matchesNode(Node* node) const { +bool Operator::matches(Node* node) const { + if (node->kind().toQualString() != schema.name) { + return false; + } size_t attributes_size = node->numAttributes(); size_t attributes_seen = 0; auto inputs_size = node->inputs().size(); @@ -351,7 +423,7 @@ bool Operator::matchesNode(Node* node) const { std::shared_ptr findOperatorFor(Node* node) { const auto& candidates = getAllOperatorsFor(node->kind()); for(const auto& candidate : candidates) { - if(candidate->matchesNode(node)) { + if(candidate->matches(node)) { return candidate; } } diff --git a/torch/csrc/jit/operator.h b/torch/csrc/jit/operator.h index 9db66cd4c1f7d..50ec147090be3 100644 --- a/torch/csrc/jit/operator.h +++ b/torch/csrc/jit/operator.h @@ -32,11 +32,11 @@ struct Operator { FunctionSchema schema; - bool matchesNode(Node* n) const; + bool matches(Node* n) const; // Operators have different versions depending on if some inputs are encoded // as attributes or inputs. This function returns the right Operation function, // given a node encoded for one variant. - // Behavior is undefined if matchesNode(n) == false + // Behavior is undefined if matches(n) == false Operation selectVariant(Node* n) const { if(n->hasAttributes()) { JIT_ASSERT(op_const_attributes != nullptr); @@ -55,13 +55,16 @@ std::shared_ptr findOperatorFor(Node* node); const Operator& getOperatorFor(Node* node); inline Operation getOperation(Node* node) { - // note: getOperatorFor ensures that getOperatorFor(node).matchesNode(node) == true + // note: getOperatorFor ensures that getOperatorFor(node).matches(node) == true // so the call to selectVariant is always valid. 
return getOperatorFor(node).selectVariant(node); } void registerOperator(Operator&& op); +// XXX: this function is meant to be used with string literals only! +Operator& sig(const char *signature_literal); + struct RegisterOperators { RegisterOperators(std::vector operators) { for(Operator& o : operators) { diff --git a/torch/csrc/jit/passes/peephole.cpp b/torch/csrc/jit/passes/peephole.cpp index 5fd438280a35e..feebbcf2fd505 100644 --- a/torch/csrc/jit/passes/peephole.cpp +++ b/torch/csrc/jit/passes/peephole.cpp @@ -16,67 +16,59 @@ namespace torch { namespace jit { // TODO: Decide what kind of fixed point strategy we will have void PeepholeOptimize(Block * block) { for (auto it = block->nodes().begin(); it != block->nodes().end(); ++it) { - auto* n = *it; + auto* node = *it; - for (Block * sub_block : n->blocks()) { + for (Block * sub_block : node->blocks()) { PeepholeOptimize(sub_block); } // XXX: remember that if you want to simplify an expression by combining multiple nodes // into a different one, then you need to check that they all belong to the given block - switch (n->kind()) { - case aten::expand: { - // Eliminate redundant expand - if (!n->input()->isTensor()) break; - // the sizes are dynamic - if(n->inputs().size() != 1) break; - if (n->get>(attr::size) == n->input()->type()->expect()->sizes()) { - n->output()->replaceAllUsesWith(n->input()); - // Let DCE clean up any unused nodes at this point + if (node->matches("aten::expand(Tensor self, int[] size, *, int implicit) -> Tensor", + /*with_const=*/attr::size)) { + // x.expand(x.size()) == x + if (auto input_type = node->input(attr::self)->type()->cast()) { + auto expanded_sizes = node->get>(attr::size); + if (expanded_sizes == input_type->sizes()) { + node->output()->replaceAllUsesWith(node->input()); } - } break; - case aten::t: { - // x.t().t() == x - auto input_node = n->input()->node(); - if (input_node->kind() == aten::t) { - n->output()->replaceAllUsesWith(input_node->input()); - // Let DCE clean up any unused nodes at this point - } - } break; - case aten::type_as: { - JIT_ASSERT(n->inputs().size() == 2); - Value *lhs = n->input(0); - Value *rhs = n->input(1); - // If LHS and RHS have the same static type, remove the type_as operator. - if (lhs->type()->kind() == TypeKind::TensorType && - rhs->type()->kind() == TypeKind::TensorType) { - auto ltype = (*lhs->type()).cast(); - auto rtype = (*rhs->type()).cast(); - if(ltype->device() == rtype->device() && - ltype->scalarType() == rtype->scalarType()) { - n->output()->replaceAllUsesWith(lhs); - } - } - } break; - case aten::add: { - // mm + add == addmm - if (n->inputs().size() == 2 && - n->get(attr::alpha) && - tensor_as(*n->get(attr::alpha)) == 1. 
&& - n->input(1)->node()->kind() == aten::mm) { - WithInsertPoint guard(n); + } + } else if (node->matches("aten::t(Tensor self) -> Tensor")) { + // x.t().t() == x + Node *input_node = node->input()->node(); + if (input_node->matches("aten::t(Tensor self) -> Tensor")) { + node->output()->replaceAllUsesWith(input_node->input()); + } + } else if (node->matches("aten::type_as(Tensor self, Tensor other) -> Tensor")) { + // x.type_as(y) == x iff x.type() == y.type() + auto self_type = node->input(0)->type()->cast(); + auto other_type = node->input(1)->type()->cast(); + if (self_type && other_type && + self_type->scalarType() == other_type->scalarType() && + self_type->device() == other_type->device()) { + node->output()->replaceAllUsesWith(node->input(0)); + } + } else if (node->matches("aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", + /*with_const=*/attr::alpha)) { + // z + x.mm(y) == z.addmm(x, y) == x.mm(y) + z + if (tensor_as(node->get(attr::alpha).value()) == 1.) { + // Look for mm from both sides of the add + for (size_t mm_side = 0; mm_side < 2; mm_side++) { + if (node->input(mm_side)->node()->matches("aten::mm(Tensor self, Tensor mat2) -> Tensor")) { + WithInsertPoint guard(node); - auto input_node = n->input(1)->node(); - SymbolicVariable mat(n->input(0)); - SymbolicVariable mat1(input_node->input(0)); - SymbolicVariable mat2(input_node->input(1)); - SymbolicVariable addmm_value = mat.addmm(mat1, mat2); + auto mm_node = node->input(mm_side)->node(); + SymbolicVariable add_mat(node->input(1 - mm_side)); + SymbolicVariable mat1(mm_node->input(0)); + SymbolicVariable mat2(mm_node->input(1)); + SymbolicVariable addmm_value = add_mat.addmm(mat1, mat2); - // Copy shape information from output node - ((Value*)addmm_value)->copyMetadata(n->output()); - n->output()->replaceAllUsesWith(addmm_value); + // Copy shape information from output node + ((Value*)addmm_value)->copyMetadata(node->output()); + node->output()->replaceAllUsesWith(addmm_value); + } } - } break; + } } } } diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 5fe1358a0010b..1775a57326fc5 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -20,12 +20,20 @@ struct propagation_error : std::exception {}; #define SHAPE_ASSERT(cond) if (!(cond)) throw propagation_error() namespace { + void setDynamicType(Node * node) { for(auto o : node->outputs()) { o->setType(DynamicType::get()); } } +int64_t wrapDim(int64_t dim, at::IntList sizes) { + if (dim < 0) { + dim += sizes.size(); + } + return dim; +} + at::Tensor representativeTensor(const TensorType * type) { auto backend = type->device() == -1 ? at::kCPU : at::kCUDA; at::DeviceGuard device_guard(type->device()); @@ -35,16 +43,39 @@ at::Tensor representativeTensor(const TensorType * type) { void PropagateShapeOnBlock(Block * block, bool insert_expands=true); -std::pair, bool> gatherTypes(at::ArrayRef values) { - std::vector types; - bool present = true; - for(auto v : values) { - TensorType* type = v->type()->cast(); - if(!type) - present = false; - types.push_back(type); +at::optional> gatherTensorTypes(Node *node) { + std::vector tensor_types; + tensor_types.reserve(node->inputs().size()); + // TODO (apaszke): Remove once we stop using attributes + // XXX: we also make the exception for cat, because we need shape prop to work for it + // (we have tests). We'll have to remove the special case once we stop flattening lists into inputs. 
+ if (node->hasAttributes() || node->kind() == aten::cat) { + std::vector inputs = node->inputs(); + if (node->kind() == aten::cat && inputs.back()->type()->isSubtypeOf(*IntType::get())) { + inputs.pop_back(); + } + for (Value *v : inputs) { + TensorType* type = v->type()->cast(); + if(!type) return at::nullopt; + tensor_types.push_back(type); + } + } else { + auto & schema = node->schema(); + auto & args = schema.arguments; + // XXX: This gets triggered for nodes that have Tensor[] as arguments. + // Those are currently very annoying to handle, because the lists are simply + // inlined into the node inputs, so we bail out from shape propagation for now. + if (schema.is_vararg || args.size() != node->inputs().size()) { + return at::nullopt; + } + for (size_t i = 0; i < node->inputs().size(); ++i) { + if (!args[i].type->isSubtypeOf(*DynamicType::get())) continue; + TensorType *type = node->input(i)->type()->cast(); + if (!type) return at::nullopt; + tensor_types.push_back(type); + } } - return std::make_pair(std::move(types), present); + return tensor_types; } bool mergeTypes(ArrayRef lhs, ArrayRef rhs, ArrayRef outputs) { @@ -65,9 +96,8 @@ bool mergeTypes(ArrayRef lhs, ArrayRef rhs, ArrayRef out void PropagateShapeOnNode(Node * node, bool insert_expands=true); -void broadcastPointwise(Node *node, std::vector& types) { - JIT_ASSERT(types.size() == 2); - auto expected_size = at::infer_size(types[0]->sizes(), types[1]->sizes()); +void broadcastBinary(Node *node, std::vector& types, size_t idx1, size_t idx2) { + auto expected_size = at::infer_size(types[idx1]->sizes(), types[idx2]->sizes()); auto broadcast = [&](size_t input_idx) { TensorType* input_type = types.at(input_idx); if (input_type->sizes() == expected_size) @@ -80,18 +110,42 @@ void broadcastPointwise(Node *node, std::vector& types) { PropagateShapeOnNode(expand); node->replaceInput(input_idx, expand->output()); }; - broadcast(0); - broadcast(1); - types[0] = node->inputs().at(0)->type()->expect(); - types[1] = node->inputs().at(1)->type()->expect(); + broadcast(idx1); + broadcast(idx2); + types[0] = node->inputs().at(idx1)->type()->expect(); + types[1] = node->inputs().at(idx2)->type()->expect(); } void PropagateShapeOnNodeByRunningIt(Node* node, const std::vector& types) { auto op = getOperation(node); Stack stack; - for(auto & type : types) { - stack.push_back(representativeTensor(type)); + size_t types_i = 0; + // TODO (apaszke): remove once we stop using attributes + if (node->hasAttributes()) { + for (auto & type : types) { + stack.push_back(representativeTensor(type)); + } + // TODO (apaszke): remove once aten::cat is saner (see first XXX in gatherTensorTypes) + } else if (node->kind() == aten::cat) { + for (auto & type : types) { + stack.push_back(representativeTensor(type)); + } + stack.push_back(node->get(attr::dim).value()); + } else { + JIT_ASSERT(node->schema().arguments.size() == node->inputs().size()); + for (const auto & arg : node->schema().arguments) { + if (arg.type->isSubtypeOf(*DynamicType::get())) { + stack.emplace_back(representativeTensor(types[types_i++])); + } else { + auto maybe_val = node->get(Symbol::attr(arg.name)); + if (!maybe_val) { + setDynamicType(node); + return; + } + stack.push_back(std::move(*maybe_val)); + } + } } // XXX: we're not catching any exceptions from the op for now. 
This @@ -101,15 +155,13 @@ void PropagateShapeOnNodeByRunningIt(Node* node, const std::vector& op(stack); JIT_ASSERT(stack.size() == node->outputs().size()); - for(size_t i = 0; i < stack.size(); ++i) { + for (size_t i = 0; i < stack.size(); ++i) { node->outputs()[i]->inferTypeFrom(stack[i].toTensor()); } } void PropagateShapeOnNode(Node * node, bool insert_expands) { - using AKind = AttributeKind; - // These don't require the types and present flag. Return early after we - // process them + // These don't require the types, and have complicated schema. Return early after we process them. switch(node->kind()) { case prim::If: { auto then_block = node->blocks().at(0); @@ -146,241 +198,202 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { } return; } - default: ; // fall-through + case prim::NumToTensor: + case prim::TensorToNum: { + node->output()->setType(node->inputs()[0]->type()); + return; + } + case prim::Constant: { + node->output()->inferTypeFrom(node->t(attr::value)); + return; + } + case prim::PythonOp: + case prim::Print: + case prim::Undefined: { + setDynamicType(node); + return; + } + default: + break; // fall-through } - std::vector types; - bool present; - // TODO (apaszke): This currently assumes that inputs are pretty much tensor-only. Need to fix that. - std::tie(types, present) = gatherTypes(node->inputs()); - if(!present) { + + auto maybe_tensor_types = gatherTensorTypes(node); + if (!maybe_tensor_types) { return setDynamicType(node); } + auto & tensor_types = *maybe_tensor_types; - bool handled = false; - // XXX: real attributes of node can be a superset of attrs - // XXX: if this returns true then you are obliged to set the types - auto check_overload = [&](size_t num_inputs, size_t num_outputs, - std::vector> attrs) { - JIT_ASSERT(!handled); - if (node->inputs().size() != num_inputs) return false; - if (node->outputs().size() != num_outputs) return false; - for (auto & attr : attrs) { - if (!node->hasAttribute(attr.second)) return false; - if (node->kindOf(attr.second) != attr.first) return false; - } - handled = true; - return true; - }; - - // TODO (apaszke): Those currently assume that a lot of stuff is constant, which might - // not be the case if they are produced by script. - switch(node->kind()) { - // For expensive ops we can directly encode their shape propagation - // here, otherwise we fallback to running a fake version of the op - // to get a quick and dirty propagation. - case aten::add: - case aten::sub: - case aten::mul: - case aten::div: - case aten::pow: - case aten::min: - case aten::max: - case aten::lt: - case aten::le: - case aten::gt: - case aten::ge: - case aten::eq: - case aten::ne: { - if (node->inputs().size() == 2 && insert_expands) { - broadcastPointwise(node, types); - } - // NB: we don't handle the nodes in any other way, because the type casting - // logic in scalar cases is non-trivial. It's better to just run them. 
- } break; - case aten::neg: { - if (!check_overload(/*num_inputs=*/1, /*num_outputs=*/1, {})) break; - node->output()->setType(types.at(0)->contiguous()); - } break; - case aten::mm: { - if (!check_overload(/*num_inputs=*/2, /*num_outputs=*/1, {})) break; - auto lhs_type = types.at(0); - auto rhs_type = types.at(1); - SHAPE_ASSERT(lhs_type->sizes().size() == 2 && rhs_type->sizes().size() == 2); - node->output()->setType(std::make_shared( - lhs_type->scalarType(), lhs_type->device(), - at::IntList{lhs_type->sizes().at(0), rhs_type->sizes().at(1)})); - } break; - case aten::t: { - if (!check_overload(/*num_inputs=*/1, /*num_outputs=*/1, {})) break; - auto tp = types.at(0); - auto sizes = tp->sizes(); - auto strides = tp->strides(); - SHAPE_ASSERT(sizes.size() == 2); - std::swap(sizes.at(0), sizes.at(1)); - std::swap(strides.at(0), strides.at(1)); - node->output()->setType(tp->withSizesStrides(sizes, strides)); - } break; - case aten::narrow: { - if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, - {{AKind::i, attr::dim}, - {AKind::i, attr::length}})) { - auto tp = types.at(0); - auto sizes = tp->sizes(); - int64_t dim = node->i(attr::dim); - int64_t length = node->i(attr::length); - SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); - sizes.at(dim) = length; - node->output()->setType(tp->withSizesStrides(sizes, tp->strides())); - } - } break; - case aten::sum: { - if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, - {{AKind::is, attr::dim}, - {AKind::i, attr::keepdim}})) { - auto tp = types.at(0); - auto sizes = tp->sizes(); - auto dims = node->get>(attr::dim).value(); - bool keepdim = node->get(attr::keepdim).value(); - std::reverse(dims.begin(), dims.end()); - for (int64_t dim : dims) { - SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); - if (keepdim) { - sizes.at(dim) = 1; - } else { - sizes.erase(sizes.begin() + dim); - } - } - node->output()->setType(tp->withSizes(sizes)); - } else if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, {})) { - node->output()->setType(types.at(0)->withSizes({})); - } - } break; - case aten::squeeze: { - if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, - {{AKind::i, attr::dim}})) { - auto tp = types.at(0); - auto sizes = tp->sizes(); - auto strides = tp->strides(); - int64_t dim = node->get(attr::dim).value(); - SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); - if (sizes.at(dim) == 1) { - sizes.erase(sizes.begin() + dim); - strides.erase(strides.begin() + dim); - } - node->output()->setType(tp->withSizesStrides(sizes, strides)); + // For expensive ops we can directly encode their shape propagation + // here, otherwise we fallback to running a fake version of the op + // to get a quick and dirty propagation. 
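Among the analytic rules that follow, aten::view infers a single -1 entry in the requested size from the input's element count. That inference in isolation, as a hypothetical helper rather than the actual JIT code:

```cpp
#include <cstdint>
#include <stdexcept>
#include <vector>

// Fill in at most one -1 entry so that the product of sizes equals numel.
std::vector<int64_t> inferViewSizes(int64_t numel, std::vector<int64_t> sizes) {
  int64_t known_product = 1;
  int64_t inferred_idx = -1;
  for (size_t i = 0; i < sizes.size(); ++i) {
    if (sizes[i] == -1) {
      if (inferred_idx != -1) throw std::runtime_error("only one -1 allowed");
      inferred_idx = static_cast<int64_t>(i);
    } else {
      known_product *= sizes[i];
    }
  }
  if (inferred_idx != -1) {
    if (known_product == 0 || numel % known_product != 0)
      throw std::runtime_error("shape is incompatible with input");
    sizes[inferred_idx] = numel / known_product;
  } else if (known_product != numel) {
    throw std::runtime_error("shape is incompatible with input");
  }
  return sizes;
}
```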
+ if (insert_expands && ( + node->matches("aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor") || + node->matches("aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor") || + node->matches("aten::mul(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::div(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::pow(Tensor self, Tensor exponent) -> Tensor") || + node->matches("aten::min(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::max(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::lt(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::gt(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::ge(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::eq(Tensor self, Tensor other) -> Tensor") || + node->matches("aten::ne(Tensor self, Tensor other) -> Tensor"))) { + // Binary broadcasting ops + // NB: we don't handle the nodes in any other way (note the lack of return!), + // because the type casting logic in scalar cases is non-trivial. + // It's better to just run them. + broadcastBinary(node, tensor_types, 0, 1); + } else if (node->matches("aten::neg(Tensor self) -> Tensor")) { + node->output()->setType(tensor_types.at(0)->contiguous()); + return; + } else if (node->matches("aten::mm(Tensor self, Tensor mat2) -> Tensor")) { + auto lhs_type = tensor_types.at(0); + auto rhs_type = tensor_types.at(1); + SHAPE_ASSERT(lhs_type->sizes().size() == 2 && rhs_type->sizes().size() == 2); + node->output()->setType(std::make_shared( + lhs_type->scalarType(), lhs_type->device(), + at::IntList{lhs_type->sizes().at(0), rhs_type->sizes().at(1)})); + return; + } else if (node->matches("aten::t(Tensor self) -> Tensor")) { + auto tp = tensor_types.at(0); + auto sizes = tp->sizes(); + auto strides = tp->strides(); + SHAPE_ASSERT(sizes.size() == 2); + std::swap(sizes.at(0), sizes.at(1)); + std::swap(strides.at(0), strides.at(1)); + node->output()->setType(tp->withSizesStrides(sizes, strides)); + return; + } else if (node->matches("aten::narrow(Tensor self, int dim, int start, int length) -> Tensor", + /*with_const=*/{attr::dim, attr::length})) { + auto tp = tensor_types.at(0); + auto sizes = tp->sizes(); + int64_t dim = node->get(attr::dim).value(); + int64_t length = node->get(attr::length).value(); + SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); + sizes.at(dim) = length; + node->output()->setType(tp->withSizesStrides(sizes, tp->strides())); + return; + } else if (node->matches("aten::sum(Tensor self) -> Tensor")) { + node->output()->setType(tensor_types.at(0)->withSizes({})); + return; + } else if (node->matches("aten::sum(Tensor self, int[] dim, int keepdim) -> Tensor", + /*with_const=*/{attr::dim, attr::keepdim})) { + auto & tp = tensor_types.at(0); + auto sizes = tp->sizes(); + auto dims = node->get>(attr::dim).value(); + bool keepdim = node->get(attr::keepdim).value(); + std::reverse(dims.begin(), dims.end()); + for (int64_t dim : dims) { + SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); + if (keepdim) { + sizes.at(dim) = 1; + } else { + sizes.erase(sizes.begin() + dim); } - } break; - case aten::unsqueeze: { - if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, - {{AKind::i, attr::dim}})) { - auto tp = types.at(0); - auto sizes = tp->sizes(); - auto strides = tp->strides(); - int64_t dim = node->get(attr::dim).value(); - SHAPE_ASSERT(dim >= 0 && static_cast(dim) <= sizes.size()); - sizes.insert(sizes.begin() + dim, 1); - strides.insert(strides.begin() + dim, 1); - 
node->output()->setType(tp->withSizesStrides(sizes, strides)); + } + node->output()->setType(tp->withSizes(sizes)); + return; + } else if (node->matches("aten::squeeze(Tensor self, int dim) -> Tensor", /*with_const=*/attr::dim)) { + auto & tp = tensor_types.at(0); + auto sizes = tp->sizes(); + auto strides = tp->strides(); + int64_t dim = wrapDim(node->get(attr::dim).value(), sizes); + SHAPE_ASSERT(dim >= 0 && static_cast(dim) < sizes.size()); + if (sizes.at(dim) == 1) { + sizes.erase(sizes.begin() + dim); + strides.erase(strides.begin() + dim); + } + node->output()->setType(tp->withSizesStrides(sizes, strides)); + return; + } else if (node->matches("aten::unsqueeze(Tensor self, int dim) -> Tensor", /*with_const=*/attr::dim)) { + auto & tp = tensor_types.at(0); + auto sizes = tp->sizes(); + auto strides = tp->strides(); + int64_t dim = wrapDim(node->get(attr::dim).value(), sizes); + SHAPE_ASSERT(dim >= 0 && static_cast(dim) <= sizes.size()); + int64_t new_stride = dim >= static_cast(sizes.size()) ? 1 : sizes.at(dim) * strides.at(dim); + sizes.insert(sizes.begin() + dim, 1); + strides.insert(strides.begin() + dim, new_stride); + node->output()->setType(tp->withSizesStrides(sizes, strides)); + return; + } else if (node->matches("aten::view(Tensor self, int[] size) -> Tensor", /*with_const=*/attr::size)) { + auto sizes = node->get>(attr::size).value(); + bool inferred = false; + size_t inferred_idx; + int64_t size_product = 1; + for (size_t i = 0; i < sizes.size(); ++i) { + if (sizes[i] == -1) { + if (inferred) throw propagation_error(); + inferred = true; + inferred_idx = i; + } else { + size_product *= sizes[i]; } - } break; - case aten::view: { - if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, - {{AKind::is, attr::size}})) { - auto sizes = node->get>(attr::size).value(); - bool inferred = false; - size_t inferred_idx; - int64_t size_product = 1; - for (size_t i=0; ioutput()->setType(types.at(0)->withSizes(sizes)); - } - } break; - case aten::expand: { - if(check_overload(/*num_inputs=*/1, /*num_outputs=*/1, - {{AKind::is, attr::size}})) { - // it is safe to run this, even if we have an integer input tensor - PropagateShapeOnNodeByRunningIt(node, types); - } - } break; - case aten::index_select: { - if(check_overload(/*num_inputs=*/2, /*num_outputs=*/1, - {{AKind::i, attr::dim}})) { - auto ten = types.at(0); - auto index = types.at(1); - int64_t dim = node->get(attr::dim).value(); - SHAPE_ASSERT(index->sizes().size() == 1); - SHAPE_ASSERT(dim >= 0 && static_cast(dim) < ten->sizes().size()); - std::vector sizes = ten->sizes(); - sizes[dim] = index->sizes()[0]; - node->output()->setType(ten->withSizes(sizes)); - } - } break; - case prim::Constant: { - node->output()->inferTypeFrom(node->t(attr::value)); - handled = true; - } break; - case prim::TensorToNum: - case prim::NumToTensor: { - node->output()->setType(node->inputs()[0]->type()); - handled = true; - } break; - case prim::Undefined: { - node->output()->setType(DynamicType::get()); - handled = true; - } break; - case prim::PythonOp: { - setDynamicType(node); - handled = true; - } break; - case prim::Print: { - setDynamicType(node); - handled = true; - } break; - case onnx::Shape: { - if (check_overload(/*num_inputs=*/1, /*num_outputs=*/1, {})) { - std::vector dim_vec = {(int64_t)types.at(0)->sizes().size()}; - at::IntList dims(dim_vec); - node->output()->setType( - std::make_shared(at::kLong, -1, dims)); - } - } break; - case onnx::Reshape: { - setDynamicType(node); - handled = true; + if (inferred) { + SHAPE_ASSERT(size_product 
!= 0); + size_t numel = 1; + for (int64_t s : tensor_types.at(0)->sizes()) + numel *= s; + int64_t inferred_size = numel / size_product; + sizes[inferred_idx] = inferred_size; + } + node->output()->setType(tensor_types.at(0)->withSizes(sizes)); + return; + } else if (node->matches("aten::type_as(Tensor self, Tensor other) -> Tensor")) { + if (tensor_types.at(0)->scalarType() == tensor_types.at(1)->scalarType()) { + node->output()->setType(node->input(attr::self)->type()); + } else { + // This will be a copy, so the result will be contiguous + node->output()->setType(tensor_types.at(1)->withSizes(tensor_types.at(0)->sizes())); } - default: { - } break; + return; + } else if (node->matches("aten::expand(Tensor self, int[] size, *, int implicit) -> Tensor", + /*with_const=*/attr::size)) { + auto tp = tensor_types.at(0); + std::vector sizes, strides; + std::tie(sizes, strides) = at::inferExpandGeometry( + tp->sizes(), tp->strides(), node->get>(attr::size).value()); + node->output()->setType(tp->withSizesStrides(sizes, strides)); + return; + } else if (node->matches("aten::index_select(Tensor self, int dim, Tensor index) -> Tensor", + /*with_const=*/attr::dim)) { + auto ten = tensor_types.at(0); + auto index = tensor_types.at(1); + int64_t dim = node->get(attr::dim).value(); + SHAPE_ASSERT(index->sizes().size() == 1); + SHAPE_ASSERT(dim >= 0 && static_cast(dim) < ten->sizes().size()); + std::vector sizes = ten->sizes(); + sizes[dim] = index->sizes()[0]; + node->output()->setType(ten->withSizes(sizes)); + return; + } else if (node->kind() == onnx::Shape) { + SHAPE_ASSERT(node->inputs().size() == 1 && node->outputs().size() == 1); + std::vector dim_vec = {(int64_t)tensor_types.at(0)->sizes().size()}; + at::IntList dims(dim_vec); + node->output()->setType( + std::make_shared(at::kLong, -1, dims)); + return; + } else if (node->kind() == onnx::Reshape) { + setDynamicType(node); + return; } - // If we haven't manage to handle the op so far, we fall back to inferring the + // If we haven't managed to handle the op so far, we fall back to inferring the // shapes by doing an example run of the op (if we can). - if (!handled) { - // Integral typed inputs are often an indicator that we're indexing into - // a tensor, so we should special-case these ops in the shape propagation. - // Additionally, passing in a zero representative tensor into an integer - // division op causes divide-by-zero errors - bool shape_inferenceable = !std::any_of(types.begin(), types.end(), [](TensorType* t){ - return at::isIntegralType(t->scalarType()); - }); - if (node->kind() == aten::type_as || shape_inferenceable ) { - PropagateShapeOnNodeByRunningIt(node, types); - } else { - setDynamicType(node); - } + // Integral typed inputs are often an indicator that we're indexing into + // a tensor, so we should special-case these ops in the shape propagation. 
+ // Additionally, passing in a zero representative tensor into an integer + // division op causes divide-by-zero errors + bool shape_inferenceable = !std::any_of(tensor_types.begin(), tensor_types.end(), [](TensorType* t){ + return at::isIntegralType(t->scalarType()); + }); + if (shape_inferenceable) { + PropagateShapeOnNodeByRunningIt(node, tensor_types); + } else { + setDynamicType(node); } } From f3d72b2101d89fe9a9ab06e67e4c4af12d2e730f Mon Sep 17 00:00:00 2001 From: Di Yu Date: Sun, 22 Jul 2018 00:08:34 -0700 Subject: [PATCH 20/23] Modify barrier net to allow better control over its initialization and execution in DPM (#9665) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9665 In data_parallel_model, we isolate synchronizing barrier init net into its own from the param_init_net, so that we could have finer granularity of control over the barrier net. Reviewed By: andrewwdye Differential Revision: D8375389 fbshipit-source-id: ce0c8c1c8e4bd82b7078a1b07abaced3f149d578 --- caffe2/python/data_parallel_model.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index b49a7952710f3..60e5c39bed131 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -687,15 +687,18 @@ def _InitializeParamUpdate(gpu_id): ] _AddBarrierToModelNets(model_helper_obj, barrier_net_timeout_sec) +def CreateNet(model, overwrite=False): + for net_iters in model._data_parallel_model_nets: + if isinstance(net_iters, tuple): + workspace.CreateNet(net_iters[0], overwrite=overwrite) + else: + workspace.CreateNet(net_iters, overwrite=overwrite) + def RunInitNet(model): for init_net in model._data_parallel_model_init_nets: workspace.RunNetOnce(init_net) - for net_iters in model._data_parallel_model_nets: - if isinstance(net_iters, tuple): - workspace.CreateNet(net_iters[0]) - else: - workspace.CreateNet(net_iters) + CreateNet(model) def RunWarmup(model): @@ -720,8 +723,14 @@ def _AddBarrierToModelNets(model, barrier_net_timeout_sec): # (_DEFAULT_TIMEOUT_SEC). # We pass in model.param_init_net so that the barrier net can be run as # part of the param_init_net. - model._barrier_net = _CreateBarrierNet(model, model.param_init_net, - "pre_training", barrier_net_timeout_sec) + + model._barrier_init_net = core.Net("barrier_init_net") + + model._barrier_net = _CreateBarrierNet(model, model._barrier_init_net, + "pre_training", barrier_net_timeout_sec) + + model._data_parallel_model_init_nets.insert(0, model._barrier_init_net) + model._data_parallel_model_nets.insert(0, model._barrier_net) From 1afdc57ed89984b704d0dc5357851cd9f8c50553 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Sun, 22 Jul 2018 09:02:58 -0700 Subject: [PATCH 21/23] Hide all other fields in THTensor (#9683) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9683 This pops off `refcount_`, `storage_`, `storage_offset_`; there are now no more direct accesses to these fields and we can make them private (with appropriate friending). 
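A rough sketch of the accessor pattern involved, with simplified stand-ins rather than the real THTensor/THStorage types:

```cpp
#include <atomic>
#include <cstddef>

struct Storage;  // opaque stand-in

// Fields become private; call sites use member helpers or friended free
// functions, so the representation can change without touching callers.
struct Tensor {
  void retain() { ++refcount_; }
  void release() { if (--refcount_ == 0) delete this; }
  std::ptrdiff_t storage_offset() const { return storage_offset_; }

 private:
  friend Storage* Tensor_getStoragePtr(const Tensor* t);
  std::atomic<int> refcount_{1};
  Storage* storage_ = nullptr;
  std::ptrdiff_t storage_offset_ = 0;
};

// Non-retaining accessor in the spirit of THTensor_getStoragePtr.
inline Storage* Tensor_getStoragePtr(const Tensor* t) {
  return t->storage_;
}
```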
Stacked on #9561 Pull Request resolved: https://github.com/pytorch/pytorch/pull/9591 Reviewed By: SsnL Differential Revision: D8922246 Pulled By: ezyang fbshipit-source-id: dfae023d790e29ce652e2eab9a1628bbe97b318d --- aten/src/ATen/templates/TensorDense.cpp | 2 +- aten/src/ATen/templates/TensorDerived.cpp | 10 ++- aten/src/TH/THTensor.cpp | 9 +-- aten/src/TH/THTensor.h | 5 ++ aten/src/TH/THTensor.hpp | 48 +++++++++--- aten/src/TH/THTensorApply.h | 14 ++-- aten/src/TH/THTensorDimApply.h | 12 +-- aten/src/TH/generic/THTensor.cpp | 75 +++++++++---------- aten/src/TH/generic/THTensorFastGetSet.hpp | 20 ++--- aten/src/TH/generic/THTensorLapack.cpp | 8 +- aten/src/TH/generic/THTensorMath.cpp | 4 +- aten/src/TH/generic/THTensorRandom.cpp | 34 ++++----- aten/src/THC/THCTensor.cpp | 43 ++++++----- aten/src/THC/generic/THCTensor.cpp | 39 +++++----- aten/src/THC/generic/THCTensorCopy.cpp | 4 +- aten/src/THC/generic/THCTensorCopy.cu | 2 +- aten/src/THC/generic/THCTensorMath.cu | 4 +- aten/src/THC/generic/THCTensorMathMagma.cu | 10 +-- .../THCUNN/generic/SpatialConvolutionLocal.cu | 16 ++-- .../THCUNN/generic/SpatialConvolutionMM.cu | 6 +- .../src/THCUNN/generic/TemporalConvolution.cu | 48 ++++++------ .../THCUNN/generic/TemporalRowConvolution.cu | 6 +- aten/src/THNN/generic/SparseLinear.c | 6 +- .../THNN/generic/SpatialConvolutionLocal.c | 16 ++-- aten/src/THNN/generic/SpatialConvolutionMM.c | 14 ++-- aten/src/THNN/generic/TemporalConvolution.c | 48 ++++++------ .../src/THNN/generic/TemporalRowConvolution.c | 14 ++-- .../THNN/generic/VolumetricConvolutionMM.c | 14 ++-- 28 files changed, 279 insertions(+), 252 deletions(-) diff --git a/aten/src/ATen/templates/TensorDense.cpp b/aten/src/ATen/templates/TensorDense.cpp index 9f977d50ead2b..1ca2cda09fa7d 100644 --- a/aten/src/ATen/templates/TensorDense.cpp +++ b/aten/src/ATen/templates/TensorDense.cpp @@ -8,7 +8,7 @@ IntList ${Tensor}::strides() const { Scalar ${Tensor}::localScalar() { int64_t numel = ${THTensor}_nElement(${state,}tensor); AT_CHECK(numel == 1,"a Tensor with ", numel, " elements cannot be converted to Scalar"); - return Scalar(${to_at_type}(${THStorage}_get(${state,}tensor->storage, tensor->storageOffset))); + return Scalar(${to_at_type}(${THStorage}_get(${state,} THTensor_getStoragePtr(tensor), tensor->storage_offset()))); } std::unique_ptr ${Tensor}::storage() { auto storage = ${THTensor}_storage(${state,}tensor); diff --git a/aten/src/ATen/templates/TensorDerived.cpp b/aten/src/ATen/templates/TensorDerived.cpp index d89e84ee5e702..70f2cc260b62f 100644 --- a/aten/src/ATen/templates/TensorDerived.cpp +++ b/aten/src/ATen/templates/TensorDerived.cpp @@ -22,8 +22,9 @@ namespace at { : TensorImpl(&context->getType(Backend::${Backend},ScalarType::${ScalarName})), tensor(tensor), context(context) {} + ${Tensor}::~${Tensor}() { - ${THTensor}_free(${state,} tensor); + if (tensor) tensor->release(); } const char * ${Tensor}::toString() const { @@ -46,13 +47,14 @@ const char * ${Tensor}::typeString() { return "${Type}"; } void * ${Tensor}::unsafeGetTH(bool retain) { - if (retain) - ${THTensor}_retain(${state,} tensor); + if (retain) { + tensor->retain(); + } return tensor; } void ${Tensor}::release_resources() { - ${THTensor}_free(${state,} tensor); + tensor->release(); tensor = nullptr; } diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 48ddcd2c57ba1..5c6bdb48bd936 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -40,13 +40,8 @@ void THTensor_free(THTensor *self) { - if(!self) - return; - - 
if(--self->refcount == 0) - { - delete self; - } + if (!self) return; + self->release(); } // On a high level, diff --git a/aten/src/TH/THTensor.h b/aten/src/TH/THTensor.h index 3335a6f5d8cc5..217d1fdce1253 100644 --- a/aten/src/TH/THTensor.h +++ b/aten/src/TH/THTensor.h @@ -6,6 +6,11 @@ #define THTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME) +#ifdef __cplusplus +struct THTensor; +THStorage* THTensor_getStoragePtr(const THTensor* tensor); +#endif + /* basics */ #include "generic/THTensor.h" #include "THGenerateAllTypes.h" diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index bc9f23ee5e3ce..0962803e3821a 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -12,26 +12,26 @@ struct THTensor { THTensor(THStorage* storage) - : refcount(1) - , storage(storage) - , storageOffset(0) + : refcount_(1) + , storage_(storage) + , storage_offset_(0) , sizes_{0} , strides_{1} , dim_(1) {} ~THTensor() { - if (storage) { - THStorage_free(storage); + if (storage_) { + THStorage_free(storage_); } } - std::atomic refcount; + std::atomic refcount_; // Note: storage->size may be greater than the recorded size // of a tensor - THStorage *storage; - ptrdiff_t storageOffset; + THStorage *storage_; + ptrdiff_t storage_offset_; std::vector sizes_; std::vector strides_; @@ -39,12 +39,12 @@ struct THTensor template inline T * data() const { - return storage->data() + storageOffset; + return storage_->data() + storage_offset_; } template inline T * unsafe_data() const { - return storage->unsafe_data() + storageOffset; + return storage_->unsafe_data() + storage_offset_; } // [NOTE: _dim() vs dim()] @@ -58,6 +58,10 @@ struct THTensor return dim_; } + ptrdiff_t storage_offset() const { + return storage_offset_; + } + // represents that numel() == 0. 
inline bool is_empty() const { for (int64_t i = 0; i < dim_; ++i) { @@ -85,6 +89,16 @@ struct THTensor inline at::IntList strides() { return strides_; } + + void retain() { + ++refcount_; + } + + void release() { + if(--refcount_ == 0) { + delete this; + } + } }; #include "generic/THTensorFastGetSet.hpp" @@ -120,6 +134,20 @@ inline void THTensor_setStrideAtDim(THTensor* tensor, int dim, int64_t new_strid tensor->strides_[dim] = new_stride; } +inline void THTensor_setStorageOffset(THTensor* tensor, ptrdiff_t storage_offset) { + tensor->storage_offset_ = storage_offset; +} + +// NB: Non-retaining +inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) { + return tensor->storage_; +} + +// NB: Steals ownership of storage +inline void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) { + tensor->storage_ = storage; +} + TH_API void THTensor_free(THTensor *self); at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList newshape); diff --git a/aten/src/TH/THTensorApply.h b/aten/src/TH/THTensorApply.h index 514a4969df83e..7cf4f41c924ee 100644 --- a/aten/src/TH/THTensorApply.h +++ b/aten/src/TH/THTensorApply.h @@ -43,7 +43,7 @@ TH_TENSOR_APPLY_hasFinished = 1; \ else \ { \ - TENSOR##_data = TENSOR->storage->data()+TENSOR->storageOffset; \ + TENSOR##_data = THTensor_getStoragePtr(TENSOR)->data()+TENSOR->storage_offset(); \ TENSOR##_size = 1; \ TENSOR##_stride = 1; \ for(TENSOR##_i = TENSOR->_dim()-1; TENSOR##_i >= 0; TENSOR##_i--) { \ @@ -321,7 +321,7 @@ ptrdiff_t TENSOR##Size = THTensor_(nElement)(TENSOR); \ if(TENSOR##Contg){ \ ptrdiff_t iter = 0; \ - TYPE *rp = TENSOR->storage->data()+TENSOR->storageOffset; \ + TYPE *rp = THTensor_getStoragePtr(TENSOR)->data()+TENSOR->storage_offset(); \ PRAGMA( omp parallel for if (TENSOR##Size > OMP_THRESHOLD * 10) firstprivate(rp) reduction(OPERATION) ) \ for (iter = 0; iter < TENSOR##Size; iter++) { \ TYPE *TENSOR##_data = rp+iter; \ @@ -365,8 +365,8 @@ { \ /* for advanced searching index*/ \ if( CONTIG1 && CONTIG2 ){ \ - TYPE1 *rp = TENSOR1->storage->data()+TENSOR1->storageOffset; \ - TYPE2 *tp = TENSOR2->storage->data()+TENSOR2->storageOffset; \ + TYPE1 *rp = THTensor_getStoragePtr(TENSOR1)->data()+TENSOR1->storage_offset(); \ + TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data()+TENSOR2->storage_offset(); \ ptrdiff_t iter = 0; \ if(tp != (TYPE2*)rp) { \ PRAGMA(ivdep) \ @@ -444,9 +444,9 @@ { \ /* for adveanced searching index*/ \ if(CONTIG1 && CONTIG2 && CONTIG3){ \ - TYPE1 *rp = TENSOR1->storage->data()+TENSOR1->storageOffset; \ - TYPE2 *tp = TENSOR2->storage->data()+TENSOR2->storageOffset; \ - TYPE3 *srcp = TENSOR3->storage->data()+TENSOR3->storageOffset; \ + TYPE1 *rp = THTensor_getStoragePtr(TENSOR1)->data()+TENSOR1->storage_offset(); \ + TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data()+TENSOR2->storage_offset(); \ + TYPE3 *srcp = THTensor_getStoragePtr(TENSOR3)->data()+TENSOR3->storage_offset(); \ ptrdiff_t iter = 0;\ if(tp != (TYPE2*)rp) { \ PRAGMA(ivdep) \ diff --git a/aten/src/TH/THTensorDimApply.h b/aten/src/TH/THTensorDimApply.h index e85bd0e9137e8..e45db37acf07a 100644 --- a/aten/src/TH/THTensorDimApply.h +++ b/aten/src/TH/THTensorDimApply.h @@ -60,15 +60,15 @@ for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ - TENSOR1##_data = (TENSOR1)->storage->data()+(TENSOR1)->storageOffset; \ + TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data()+(TENSOR1)->storage_offset(); \ 
TENSOR1##_stride = (TENSOR1)->stride(DIMENSION); \ TENSOR1##_size = TENSOR1->size(DIMENSION); \ \ - TENSOR2##_data = (TENSOR2)->storage->data()+(TENSOR2)->storageOffset; \ + TENSOR2##_data = THTensor_getStoragePtr(TENSOR2)->data()+(TENSOR2)->storage_offset(); \ TENSOR2##_stride = (TENSOR2)->stride(DIMENSION); \ TENSOR2##_size = TENSOR2->size(DIMENSION); \ \ - TENSOR3##_data = (TENSOR3)->storage->data()+(TENSOR3)->storageOffset; \ + TENSOR3##_data = THTensor_getStoragePtr(TENSOR3)->data()+(TENSOR3)->storage_offset(); \ TENSOR3##_stride = (TENSOR3)->stride(DIMENSION); \ TENSOR3##_size = TENSOR3->size(DIMENSION); \ \ @@ -167,11 +167,11 @@ for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ - TENSOR1##_data = (TENSOR1)->storage->data()+(TENSOR1)->storageOffset; \ + TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data()+(TENSOR1)->storage_offset(); \ TENSOR1##_stride = (TENSOR1)->stride(DIMENSION); \ TENSOR1##_size = TENSOR1->size(DIMENSION); \ \ - TENSOR2##_data = (TENSOR2)->storage->data()+(TENSOR2)->storageOffset; \ + TENSOR2##_data = THTensor_getStoragePtr(TENSOR2)->data()+(TENSOR2)->storage_offset(); \ TENSOR2##_stride = (TENSOR2)->stride(DIMENSION); \ TENSOR2##_size = TENSOR2->size(DIMENSION); \ \ @@ -269,7 +269,7 @@ if( (DIMENSION < 0) || (DIMENSION >= TENSOR->_dim()) ) \ THError("invalid dimension"); \ \ - TENSOR##_data = (TENSOR)->storage->data()+(TENSOR)->storageOffset; \ + TENSOR##_data = THTensor_getStoragePtr(TENSOR)->data()+(TENSOR)->storage_offset(); \ TENSOR##_stride = (TENSOR)->stride(DIMENSION); \ TENSOR##_size = TENSOR->size(DIMENSION); \ /* Counter stores the indices into the Tensor at any time */ \ diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index 92314de69bae2..c281e916c58f0 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -7,12 +7,12 @@ /**** access methods ****/ THStorage *THTensor_(storage)(const THTensor *self) { - return self->storage; + return THTensor_getStoragePtr(self); } ptrdiff_t THTensor_(storageOffset)(const THTensor *self) { - return self->storageOffset; + return self->storage_offset(); } int THTensor_(nDimension)(const THTensor *self) @@ -53,12 +53,8 @@ THLongStorage *THTensor_(newStrideOf)(THTensor *self) return stride; } -real *THTensor_(data)(const THTensor *self) -{ - if(self->storage) - return (THStorage_(data)(self->storage)+self->storageOffset); - else - return NULL; +real *THTensor_(data)(const THTensor *self) { + return self->data(); } /**** creation methods ****/ @@ -74,8 +70,8 @@ THTensor *THTensor_(newWithTensor)(THTensor *tensor) { THTensor *self = new THTensor(THStorage_(new)()); THTensor_(setStorageNd)(self, - tensor->storage, - tensor->storageOffset, + THTensor_getStoragePtr(tensor), + tensor->storage_offset(), tensor->dim(), THTensor_getSizePtr(tensor), THTensor_getStridePtr(tensor)); @@ -238,7 +234,7 @@ THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size) auto stride_value = *stride; THLongStorage *new_stride = THLongStorage_newWithSize(stride_value.size()); THLongStorage_rawCopy(new_stride, stride_value.data()); - THTensor_(setStorage)(self, tensor->storage, tensor->storageOffset, inferred_size, new_stride); + THTensor_(setStorage)(self, THTensor_getStoragePtr(tensor), tensor->storage_offset(), inferred_size, new_stride); THLongStorage_free(inferred_size); THLongStorage_free(new_stride); return self; @@ -297,8 +293,8 @@ void 
THTensor_(set)(THTensor *self, THTensor *src) { if(self != src) THTensor_(setStorageNd)(self, - src->storage, - src->storageOffset, + THTensor_getStoragePtr(src), + src->storage_offset(), src->dim(), THTensor_getSizePtr(src), THTensor_getStridePtr(src)); @@ -386,8 +382,9 @@ void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, int64_t fir THTensor_(set)(self, src); - if(firstIndex > 0) - self->storageOffset += firstIndex*self->stride(dimension); + if (firstIndex > 0) { + THTensor_setStorageOffset(self, self->storage_offset() + firstIndex*self->stride(dimension)); + } THTensor_setSizeAtDim(self, dimension, size); } @@ -640,10 +637,10 @@ int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor* src) int THTensor_(isSetTo)(const THTensor *self, const THTensor* src) { - if (!self->storage) + if (!THTensor_getStoragePtr(self)) return 0; - if (self->storage == src->storage && - self->storageOffset == src->storageOffset && + if (THTensor_getStoragePtr(self) == THTensor_getStoragePtr(src) && + self->storage_offset() == src->storage_offset() && self->_dim() == src->_dim()) { int d; @@ -673,7 +670,7 @@ ptrdiff_t THTensor_(nElement)(const THTensor *self) void THTensor_(retain)(THTensor *self) { - ++self->refcount; + self->retain(); } void THTensor_(free)(THTensor *self) @@ -694,24 +691,24 @@ void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst) void THTensor_(setStorageNd)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) { /* storage */ - if(self->storage != storage) + if(THTensor_getStoragePtr(self) != storage) { - if(self->storage) - THStorage_(free)(self->storage); + if(THTensor_getStoragePtr(self)) + THStorage_(free)(THTensor_getStoragePtr(self)); if(storage) { - self->storage = storage; - THStorage_(retain)(self->storage); + THTensor_stealAndSetStoragePtr(self, storage); + THStorage_(retain)(THTensor_getStoragePtr(self)); } else - self->storage = THStorage_(new)(); + THTensor_stealAndSetStoragePtr(self, THStorage_(new)()); } /* storageOffset */ if(storageOffset < 0) THError("Tensor: invalid storage offset"); - self->storageOffset = storageOffset; + THTensor_setStorageOffset(self, storageOffset); /* size and stride */ THTensor_(resizeNd)(self, nDimension, size, stride); @@ -780,13 +777,13 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t totalSize += (self->size(d)-1)*self->stride(d); } - if(totalSize+self->storageOffset > 0) + if(totalSize+self->storage_offset() > 0) { - if(!self->storage) { - self->storage = THStorage_(new)(); + if(!THTensor_getStoragePtr(self)) { + THTensor_stealAndSetStoragePtr(self, THStorage_(new)()); } - if(totalSize+self->storageOffset > self->storage->size) { - THStorage_(resize)(self->storage, totalSize+self->storageOffset); + if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->size) { + THStorage_(resize)(THTensor_getStoragePtr(self), totalSize+self->storage_offset()); } } } @@ -795,56 +792,56 @@ void THTensor_(set1d)(THTensor *tensor, int64_t x0, real value) { THArgCheck(tensor->_dim() == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0), value); + THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0), value); } real THTensor_(get1d)(const THTensor *tensor, int64_t x0) { THArgCheck(tensor->_dim() == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 
0) && (x0 < tensor->size(0)), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)); + return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)); } void THTensor_(set2d)(THTensor *tensor, int64_t x0, int64_t x1, real value) { THArgCheck(tensor->_dim() == 2, 1, "tensor must have two dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1), value); + THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1), value); } real THTensor_(get2d)(const THTensor *tensor, int64_t x0, int64_t x1) { THArgCheck(tensor->_dim() == 2, 1, "tensor must have two dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)); + return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)); } void THTensor_(set3d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value) { THArgCheck(tensor->_dim() == 3, 1, "tensor must have three dimensions"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2), value); + THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2), value); } real THTensor_(get3d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2) { THArgCheck(tensor->_dim() == 3, 1, "tensor must have three dimensions"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)); + return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)); } void THTensor_(set4d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) { THArgCheck(tensor->_dim() == 4, 1, "tensor must have four dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3), value); + THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3), value); } real THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { THArgCheck(tensor->_dim() == 4, 1, "tensor must have four dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); - return THStorage_(get)(tensor->storage, 
tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3)); + return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3)); } THDescBuff THTensor_(desc)(const THTensor *tensor) { diff --git a/aten/src/TH/generic/THTensorFastGetSet.hpp b/aten/src/TH/generic/THTensorFastGetSet.hpp index fa989ddafaf40..5ad3e13b237b8 100644 --- a/aten/src/TH/generic/THTensorFastGetSet.hpp +++ b/aten/src/TH/generic/THTensorFastGetSet.hpp @@ -3,43 +3,43 @@ #else static inline real THTensor_(fastGet1d)(THTensor *self, int64_t x0) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)]; + return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)]; } static inline real THTensor_(fastGet2d)(THTensor *self, int64_t x0, int64_t x1) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)]; + return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)]; } static inline real THTensor_(fastGet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)]; + return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)]; } static inline real THTensor_(fastGet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)]; + return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)]; } static inline real THTensor_(fastGet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)]; + return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)]; } static inline void THTensor_(fastSet1d)(THTensor *self, int64_t x0, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)] = value; + (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)] = value; } static inline void THTensor_(fastSet2d)(THTensor *self, int64_t x0, int64_t x1, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)] = value; + (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)] = value; } static inline void THTensor_(fastSet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)] = value; + (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)] = value; } static inline void THTensor_(fastSet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) { 
- (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)] = value; + (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)] = value; } static inline void THTensor_(fastSet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)] = value; + (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)] = value; } #endif diff --git a/aten/src/TH/generic/THTensorLapack.cpp b/aten/src/TH/generic/THTensorLapack.cpp index 4793dec43de2a..1556296262303 100644 --- a/aten/src/TH/generic/THTensorLapack.cpp +++ b/aten/src/TH/generic/THTensorLapack.cpp @@ -118,7 +118,7 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) "rows, B has %ld", a->size(0), b->size(0)); if (b->dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(b), b->storage_offset(), b->size(0), b->stride(0), 1, 0); free_b = 1; } @@ -171,7 +171,7 @@ void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a, "rows, B has %ld", a->size(0), b->size(0)); if (b->_dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(b), b->storage_offset(), b->size(0), b->stride(0), 1, 0); free_b = 1; } @@ -221,7 +221,7 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) "rows, B has %ld", a->size(0), b->size(0)); if (b->_dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(b), b->storage_offset(), b->size(0), b->stride(0), 1, 0); free_b = 1; } @@ -644,7 +644,7 @@ void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo) "rows, B has %ld", a->size(0), b->size(0)); if (b->_dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(b), b->storage_offset(), b->size(0), b->stride(0), 1, 0); free_b = 1; } diff --git a/aten/src/TH/generic/THTensorMath.cpp b/aten/src/TH/generic/THTensorMath.cpp index e4152432a3068..1ea91f7089949 100644 --- a/aten/src/TH/generic/THTensorMath.cpp +++ b/aten/src/TH/generic/THTensorMath.cpp @@ -3674,12 +3674,12 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int // Second path for non-contiguous int64_t offset; if (dimension == 0 && allContiguous) { - real* result_data = THStorage_(data)(result->storage) + result->storageOffset; + real* result_data = THStorage_(data)(THTensor_getStoragePtr(result)) + result->storage_offset(); offset = 0; for (int j = 0; j < numInputs; j++) { if (!should_skip(inputs[j])) { THTensor* input0 = inputs[j]; - real* input0_data = THStorage_(data)(input0->storage) + input0->storageOffset; + real* input0_data = THStorage_(data)(THTensor_getStoragePtr(input0)) + input0->storage_offset(); int64_t input0_size = THTensor_(nElement)(input0); // C standard says you can't pass nullptrs to memcpy, even if the size is 0; ubsan checks this. 
if (input0_size != 0) { diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp index ceb927429573f..d4dc8fb42b344 100644 --- a/aten/src/TH/generic/THTensorRandom.cpp +++ b/aten/src/TH/generic/THTensorRandom.cpp @@ -194,7 +194,7 @@ void THTensor_(normal)(THTensor *self, THGenerator *_generator, double mean, dou std::lock_guard lock(_generator->mutex); const int64_t size = THTensor_(numel)(self); if (size >= 16 && THTensor_(isContiguous)(self)) { - THVector_(normal_fill)(THStorage_(data)(self->storage), size, _generator, mean, stddev); + THVector_(normal_fill)(THStorage_(data)(THTensor_getStoragePtr(self)), size, _generator, mean, stddev); } else { TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_normal(_generator, mean, stddev);); } @@ -398,8 +398,8 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso for (j=0; jstorage, \ - prob_dist->storageOffset+i*prob_dist->stride(0)+j*prob_dist->stride(1) \ + THTensor_getStoragePtr(prob_dist), \ + prob_dist->storage_offset()+i*prob_dist->stride(0)+j*prob_dist->stride(1) \ ); THArgCheckWithCleanup((val >= 0), THCleanup(THDoubleTensor_free(cum_dist); if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), @@ -411,8 +411,8 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); sum += val; THDoubleStorage_set( - cum_dist->storage, \ - cum_dist->storageOffset+j*cum_dist->stride(0), \ + THTensor_getStoragePtr(cum_dist), \ + cum_dist->storage_offset()+j*cum_dist->stride(0), \ sum \ ); } @@ -448,8 +448,8 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso { mid_pointer = left_pointer + (right_pointer - left_pointer) / 2; cum_prob = THDoubleStorage_get( \ - cum_dist->storage, \ - cum_dist->storageOffset+mid_pointer*cum_dist->stride(0) \ + THTensor_getStoragePtr(cum_dist), \ + cum_dist->storage_offset()+mid_pointer*cum_dist->stride(0) \ ); if (cum_prob < uniform_sample) { @@ -464,8 +464,8 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso /* store in result tensor (will be incremented for lua compat by wrapper) */ THLongStorage_set( \ - self->storage, \ - self->storageOffset+i*self->stride(0)+j*self->stride(1), \ + THTensor_getStoragePtr(self), \ + self->storage_offset()+i*self->stride(0)+j*self->stride(1), \ sample_idx \ ); @@ -480,22 +480,22 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso if (sample_idx != 0) { new_val = THDoubleStorage_get( \ - cum_dist->storage, \ - cum_dist->storageOffset+(sample_idx-1)*cum_dist->stride(0) \ + THTensor_getStoragePtr(cum_dist), \ + cum_dist->storage_offset()+(sample_idx-1)*cum_dist->stride(0) \ ); } /* marginal cumulative mass (i.e. original probability) of sample */ diff = THDoubleStorage_get( \ - cum_dist->storage, \ - cum_dist->storageOffset+sample_idx*cum_dist->stride(0) \ + THTensor_getStoragePtr(cum_dist), \ + cum_dist->storage_offset()+sample_idx*cum_dist->stride(0) \ ) - new_val; /* new sum of marginals is not one anymore... 
*/ sum = 1.0 - diff; for (k=0; kstorage, \ - cum_dist->storageOffset+k*cum_dist->stride(0) \ + THTensor_getStoragePtr(cum_dist), \ + cum_dist->storage_offset()+k*cum_dist->stride(0) \ ); if (k >= sample_idx) { @@ -505,8 +505,8 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso /* make total marginals sum to one */ new_val /= sum; THDoubleStorage_set( \ - cum_dist->storage, \ - cum_dist->storageOffset+k*cum_dist->stride(0), \ + THTensor_getStoragePtr(cum_dist), \ + cum_dist->storage_offset()+k*cum_dist->stride(0), \ new_val \ ); } diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index 13fdff6b3b566..6a599b3b655fd 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -148,13 +148,13 @@ void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, int64_ totalSize += (self->size(d)-1)*self->stride(d); } - if(totalSize+self->storageOffset > 0) + if(totalSize+self->storage_offset() > 0) { - if(!self->storage) { + if(!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - if(totalSize+self->storageOffset > self->storage->size) { - THCStorage_resize(state, self->storage, totalSize+self->storageOffset); + if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->size) { + THCStorage_resize(state, THTensor_getStoragePtr(self), totalSize+self->storage_offset()); } } } @@ -164,8 +164,8 @@ void THCTensor_set(THCState *state, THCTensor *self, THCTensor *src) if(self != src) THCTensor_setStorageNd(state, self, - src->storage, - src->storageOffset, + THTensor_getStoragePtr(src), + src->storage_offset(), src->dim(), THTensor_getSizePtr(src), THTensor_getStridePtr(src)); @@ -174,27 +174,27 @@ void THCTensor_set(THCState *state, THCTensor *self, THCTensor *src) void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) { /* storage */ - if(self->storage != storage) + if(THTensor_getStoragePtr(self) != storage) { - if (!self->storage) { + if (!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - auto scalar_type = self->storage->scalar_type; - THStorage_free(self->storage); + auto scalar_type = THTensor_getStoragePtr(self)->scalar_type; + THStorage_free(THTensor_getStoragePtr(self)); - if(storage) - { - self->storage = storage; - THStorage_retain(self->storage); + if (storage) { + THTensor_stealAndSetStoragePtr(self, storage); + THStorage_retain(THTensor_getStoragePtr(self)); + } else { + THTensor_stealAndSetStoragePtr(self, THCStorage_new(state, scalar_type)); } - else - self->storage = THCStorage_new(state, scalar_type); } /* storageOffset */ - if(storageOffset < 0) + if (storageOffset < 0) { THError("Tensor: invalid storage offset"); - self->storageOffset = storageOffset; + } + THTensor_setStorageOffset(self, storageOffset); /* size and stride */ THCTensor_resizeNd(state, self, nDimension, size, stride); @@ -294,17 +294,16 @@ ptrdiff_t THCTensor_nElement(THCState *state, const THCTensor *self) { } void THCTensor_retain(THCState *state, THCTensor *self) { - self->refcount++; + self->retain(); } - void THCTensor_free(THCState *state, THCTensor *self) { THTensor_free(self); } int THCTensor_getDevice(THCState* state, const THCTensor* tensor) { - if (!tensor->storage) return -1; - return THCStorage_getDevice(state, tensor->storage); + if (!THTensor_getStoragePtr(tensor)) return -1; + return THCStorage_getDevice(state, THTensor_getStoragePtr(tensor)); } bool 
THCTensor_allSameDevice(THCState* state, THCTensor ** inputs, int numInputs) { diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index 3b03e37232ef0..023a55d176be8 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -5,12 +5,12 @@ /**** access methods ****/ THCStorage *THCTensor_(storage)(THCState *state, const THCTensor *self) { - return self->storage; + return THTensor_getStoragePtr(self); } ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self) { - return self->storageOffset; + return self->storage_offset(); } int THCTensor_(nDimension)(THCState *state, const THCTensor *self) @@ -47,8 +47,8 @@ THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self) real *THCTensor_(data)(THCState *state, const THCTensor *self) { - if(self->storage) - return (THCStorage_(data)(state, self->storage)+self->storageOffset); + if(THTensor_getStoragePtr(self)) + return (THCStorage_(data)(state, THTensor_getStoragePtr(self))+self->storage_offset()); else return NULL; } @@ -67,8 +67,8 @@ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor) THCTensor *self = new THCTensor(THCStorage_(new)(state)); THCTensor_(setStorageNd)(state, self, - tensor->storage, - tensor->storageOffset, + THTensor_getStoragePtr(tensor), + tensor->storage_offset(), tensor->dim(), THTensor_getSizePtr(tensor), THTensor_getStridePtr(tensor)); @@ -227,7 +227,7 @@ THCTensor *THCTensor_(newView)(THCState *state, THCTensor *tensor, THLongStorage auto stride_value = *stride; THLongStorage *new_stride = THLongStorage_newWithSize(stride_value.size()); THLongStorage_rawCopy(new_stride, stride_value.data()); - THCTensor_(setStorage)(state, self, tensor->storage, tensor->storageOffset, inferred_size, new_stride); + THCTensor_(setStorage)(state, self, THTensor_getStoragePtr(tensor), tensor->storage_offset(), inferred_size, new_stride); THLongStorage_free(inferred_size); THLongStorage_free(new_stride); return self; @@ -376,8 +376,9 @@ void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int di THCTensor_(set)(state, self, src); - if(firstIndex > 0) - self->storageOffset += firstIndex*self->stride(dimension); + if (firstIndex > 0) { + THTensor_setStorageOffset(self, self->storage_offset() + firstIndex*self->stride(dimension)); + } THTensor_setSizeAtDim(self, dimension, size); } @@ -537,8 +538,8 @@ int THCTensor_(isSize)(THCState *state, const THCTensor *self, const THLongStora int THCTensor_(isSetTo)(THCState *state, const THCTensor *self, const THCTensor *src) { - if (self->storage == src->storage && - self->storageOffset == src->storageOffset && + if (THTensor_getStoragePtr(self) == THTensor_getStoragePtr(src) && + self->storage_offset() == src->storage_offset() && self->dim() == src->dim()) { int d; @@ -604,56 +605,56 @@ void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, real valu { THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); - THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0), value); + THCStorage_(set)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0), value); } real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, int64_t x0) { THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); - return THCStorage_(get)(state, 
tensor->storage, tensor->storageOffset+x0*tensor->stride(0)); + return THCStorage_(get)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)); } void THCTensor_(set2d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, real value) { THArgCheck(tensor->dim() == 2, 1, "tensor must have two dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); - THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1), value); + THCStorage_(set)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1), value); } real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1) { THArgCheck(tensor->dim() == 2, 1, "tensor must have two dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); - return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)); + return THCStorage_(get)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)); } void THCTensor_(set3d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value) { THArgCheck(tensor->dim() == 3, 1, "tensor must have three dimensions"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); - THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2), value); + THCStorage_(set)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2), value); } real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2) { THArgCheck(tensor->dim() == 3, 1, "tensor must have three dimensions"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); - return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)); + return THCStorage_(get)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)); } void THCTensor_(set4d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) { THArgCheck(tensor->dim() == 4, 1, "tensor must have four dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); - THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3), value); + THCStorage_(set)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3), value); } real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { THArgCheck(tensor->dim() == 4, 1, "tensor must have four dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); - return 
THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3)); + return THCStorage_(get)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3)); } int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...) diff --git a/aten/src/THC/generic/THCTensorCopy.cpp b/aten/src/THC/generic/THCTensorCopy.cpp index 5715133a44f6c..f108ca10015c2 100644 --- a/aten/src/THC/generic/THCTensorCopy.cpp +++ b/aten/src/THC/generic/THCTensorCopy.cpp @@ -131,7 +131,7 @@ void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, struct THTensor cudaMemcpyHostToDevice, THCStream_stream(stream))); - THCudaCheck(THCCachingHostAllocator_recordEvent(THStorage_(data)(src->storage), stream)); + THCudaCheck(THCCachingHostAllocator_recordEvent(THStorage_(data)(THTensor_getStoragePtr(src)), stream)); if (currentDevice != tensorDevice) { THCudaCheck(cudaSetDevice(currentDevice)); @@ -162,7 +162,7 @@ void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, struct THCTensor cudaMemcpyDeviceToHost, THCStream_stream(stream))); - THCudaCheck(THCCachingHostAllocator_recordEvent(THCStorage_(data)(state, src->storage), stream)); + THCudaCheck(THCCachingHostAllocator_recordEvent(THCStorage_(data)(state, THTensor_getStoragePtr(src)), stream)); if (currentDevice != tensorDevice) { THCudaCheck(cudaSetDevice(currentDevice)); diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu index 0e2630c1edf3b..71bc17ee2f739 100644 --- a/aten/src/THC/generic/THCTensorCopy.cu +++ b/aten/src/THC/generic/THCTensorCopy.cu @@ -10,7 +10,7 @@ THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { template <> THCTensor *THCTensor_newClone(THCState *state, THCTensor *self) { - THCTensor *tensor = THCTensor_new(state, self->storage->scalar_type); + THCTensor *tensor = THCTensor_new(state, THTensor_getStoragePtr(self)->scalar_type); THCTensor_resizeAs(state, tensor, self); THC_copyTensor(state, tensor, self); return tensor; diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu index 07033fa0e8f1d..d5077655bfcdf 100644 --- a/aten/src/THC/generic/THCTensorMath.cu +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -383,8 +383,8 @@ void THCTensor_(eye)(THCState *state, THCTensor *self_, int64_t n, int64_t m) int64_t stride = THCTensor_(stride)(state, self_, 0) + THCTensor_(stride)(state, self_, 1); - THCTensor *diag = THCTensor_(newWithStorage1d)(state, self_->storage, - self_->storageOffset, sz, stride); + THCTensor *diag = THCTensor_(newWithStorage1d)(state, THTensor_getStoragePtr(self_), + self_->storage_offset(), sz, stride); THCTensor_(fill)(state, diag, ScalarConvert::to(1)); THCTensor_(free)(state, diag); diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index c942a17c4c0df..dd2b2777d3552 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -12,7 +12,7 @@ static void THCTensor_(copyArray1d)(THCState *state, THCTensor *self, real *src, int64_t stride[1] = { 1 }; THCTensor_(resizeNd)(state, self, 1, size, stride); size_t len = k * sizeof(real); - THCudaCheck(cudaMemcpy(THCStorage_(data)(state, self->storage) + self->storageOffset, src, len, cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, THTensor_getStoragePtr(self)) + self->storage_offset(), src, 
len, cudaMemcpyHostToDevice)); } static void THCTensor_(copyArray2d)(THCState *state, THCTensor *self, real *src, int m, int n) @@ -21,7 +21,7 @@ static void THCTensor_(copyArray2d)(THCState *state, THCTensor *self, real *src, int64_t stride[2] = { 1, m }; THCTensor_(resizeNd)(state, self, 2, size, stride); size_t len = m * n * sizeof(real); - THCudaCheck(cudaMemcpy(THCStorage_(data)(state, self->storage) + self->storageOffset, src, len, cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, THTensor_getStoragePtr(self)) + self->storage_offset(), src, len, cudaMemcpyHostToDevice)); } static void THCTensor_(copyTensor2d)(THCState *state, real *dst, THCTensor *self) @@ -30,7 +30,7 @@ static void THCTensor_(copyTensor2d)(THCState *state, real *dst, THCTensor *self size_t len = THCTensor_(nElement)(state, self)*sizeof(real); THCTensor *temp = THCTensor_(newTranspose)(state, self, 0, 1); THCTensor *selfc = THCTensor_(newContiguous)(state, temp); - THCudaCheck(cudaMemcpy(dst, THCStorage_(data)(state, selfc->storage) + selfc->storageOffset, len, cudaMemcpyDeviceToHost)); + THCudaCheck(cudaMemcpy(dst, THCStorage_(data)(state, THTensor_getStoragePtr(selfc)) + selfc->storage_offset(), len, cudaMemcpyDeviceToHost)); THCTensor_(free)(state, temp); THCTensor_(free)(state, selfc); } @@ -294,8 +294,8 @@ THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, T THCTensor_(resize2d)(state, re_, 2, n); THCTensor *re = THCTensor_(newContiguous)(state, re_); if (n > 0) { - THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset, wr, n*sizeof(real), cudaMemcpyHostToDevice)); - THCudaCheck(cudaMemcpy(THCStorage_(data)(state, re->storage) + re->storageOffset + n, wi, n*sizeof(real), cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, THTensor_getStoragePtr(re)) + re->storage_offset(), wr, n*sizeof(real), cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(THCStorage_(data)(state, THTensor_getStoragePtr(re)) + re->storage_offset() + n, wi, n*sizeof(real), cudaMemcpyHostToDevice)); } THCTensor_(freeCopyTo)(state, re, re_); THCTensor_(transpose)(state, re_, NULL, 0, 1); diff --git a/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu index f22aba639a2d6..8c1c15ddc0abf 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu @@ -61,8 +61,8 @@ static THCTensor* THNN_(view_weight_local)( int64_t s3 = weight->size(3) * weight->size(4) * weight->size(5); THCTensor *old_weight = weight; weight = THCTensor_(newWithStorage3d)(state, - weight->storage, - weight->storageOffset, + THTensor_getStoragePtr(weight), + weight->storage_offset(), s1, -1, s2, -1, s3, -1); THCTensor_(free)(state, old_weight); } @@ -140,12 +140,12 @@ void THNN_(SpatialConvolutionLocal_updateOutput)( 1, 1, THCTensor_(data)(state, finput_n) ); - output3d = THCTensor_(newWithStorage3d)(state, output_n->storage, output_n->storageOffset, + output3d = THCTensor_(newWithStorage3d)(state, THTensor_getStoragePtr(output_n), output_n->storage_offset(), outputHeight*outputWidth, 1, nOutputPlane, outputHeight*outputWidth, 1, nOutputPlane*outputHeight*outputWidth); - finput3d = THCTensor_(newWithStorage3d)(state, finput_n->storage, finput_n->storageOffset, + finput3d = THCTensor_(newWithStorage3d)(state, THTensor_getStoragePtr(finput_n), finput_n->storage_offset(), outputHeight*outputWidth, 1, kW*kH*nInputPlane, outputHeight*outputWidth, 1, 
kW*kH*nInputPlane*outputHeight*outputWidth); @@ -247,11 +247,11 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)( THCTensor_(select)(state, fgradInput_n, fgradInput, 0, elt); THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); - gradOutput3d = THCTensor_(newWithStorage3d)(state, gradOutput_n->storage, gradOutput_n->storageOffset, + gradOutput3d = THCTensor_(newWithStorage3d)(state, THTensor_getStoragePtr(gradOutput_n), gradOutput_n->storage_offset(), outputHeight*outputWidth, 1, nOutputPlane, outputHeight*outputWidth, 1, nOutputPlane*outputHeight*outputWidth); - fgradInput3d = THCTensor_(newWithStorage3d)(state, fgradInput_n->storage, fgradInput_n->storageOffset, + fgradInput3d = THCTensor_(newWithStorage3d)(state, THTensor_getStoragePtr(fgradInput_n), fgradInput_n->storage_offset(), outputHeight*outputWidth, 1, kW*kH*nInputPlane, outputHeight*outputWidth, 1, kW*kH*nInputPlane*outputHeight*outputWidth); @@ -358,11 +358,11 @@ void THNN_(SpatialConvolutionLocal_accGradParameters)( THCTensor_(select)(state, finput_n, finput, 0, elt); THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); - gradOutput3d = THCTensor_(newWithStorage3d)(state, gradOutput_n->storage, gradOutput_n->storageOffset, + gradOutput3d = THCTensor_(newWithStorage3d)(state, THTensor_getStoragePtr(gradOutput_n), gradOutput_n->storage_offset(), outputHeight*outputWidth, 1, nOutputPlane, outputHeight*outputWidth, 1, nOutputPlane*outputHeight*outputWidth); - finput3d = THCTensor_(newWithStorage3d)(state, finput_n->storage, finput_n->storageOffset, + finput3d = THCTensor_(newWithStorage3d)(state, THTensor_getStoragePtr(finput_n), finput_n->storage_offset(), outputHeight*outputWidth, 1, 1, kW*kH*nInputPlane*outputHeight*outputWidth, kW*kH*nInputPlane, outputHeight*outputWidth); diff --git a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu index e276d349d648c..fb695eb002020 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu @@ -109,7 +109,7 @@ void THNN_(SpatialConvolutionMM_updateOutput)( if (weight->dim() == 4) { int64_t s1 = weight->size(0); int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3); - weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1); + weight = THCTensor_(newWithStorage2d)(state, THTensor_getStoragePtr(weight), weight->storage_offset(), s1, -1, s2, -1); freeWeight = 1; } @@ -264,7 +264,7 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( if (weight->dim() == 4) { int64_t s1 = weight->size(0); int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3); - weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1); + weight = THCTensor_(newWithStorage2d)(state, THTensor_getStoragePtr(weight), weight->storage_offset(), s1, -1, s2, -1); freeWeight = 1; } @@ -398,7 +398,7 @@ void THNN_(SpatialConvolutionMM_accGradParameters)( if (gradWeight && gradWeight->dim() == 4) { int64_t s1 = gradWeight->size(0); int64_t s2 = gradWeight->size(1) * gradWeight->size(2) * gradWeight->size(3); - gradWeight = THCTensor_(newWithStorage2d)(state, gradWeight->storage, gradWeight->storageOffset, s1, -1, s2, -1); + gradWeight = THCTensor_(newWithStorage2d)(state, THTensor_getStoragePtr(gradWeight), gradWeight->storage_offset(), s1, -1, s2, -1); freeWeight = 1; } diff --git a/aten/src/THCUNN/generic/TemporalConvolution.cu b/aten/src/THCUNN/generic/TemporalConvolution.cu index 
25baf933b57a9..886cf07b4e75e 100644 --- a/aten/src/THCUNN/generic/TemporalConvolution.cu +++ b/aten/src/THCUNN/generic/TemporalConvolution.cu @@ -90,13 +90,13 @@ void THNN_(TemporalConvolution_updateOutput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputFrame -= nFrame; - THCTensor_(setStorage2d)(state, inputWindow, input->storage, - input->storageOffset+k*dW*input->size(1), + THCTensor_(setStorage2d)(state, inputWindow, THTensor_getStoragePtr(input), + input->storage_offset()+k*dW*input->size(1), nFrame, inputFrameStride*input->size(1), kW*input->size(1), 1); - THCTensor_(setStorage2d)(state, outputWindow, output->storage, - output->storageOffset + k*output->size(1), + THCTensor_(setStorage2d)(state, outputWindow, THTensor_getStoragePtr(output), + output->storage_offset() + k*output->size(1), nFrame, outputFrameStride*output->size(1), output->size(1), 1); @@ -138,13 +138,13 @@ void THNN_(TemporalConvolution_updateOutput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputSampleFrame -= nFrame; - THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size(1), + THCTensor_(setStorage2d)(state, inputWindow, THTensor_getStoragePtr(inputSample), + inputSample->storage_offset()+k*dW*inputSample->size(1), nFrame, inputFrameStride*inputSample->size(1), kW*inputSample->size(1), 1); - THCTensor_(setStorage2d)(state, outputWindow, outputSample->storage, - outputSample->storageOffset + k*outputSample->size(1), + THCTensor_(setStorage2d)(state, outputWindow, THTensor_getStoragePtr(outputSample), + outputSample->storage_offset() + k*outputSample->size(1), nFrame, outputFrameStride*outputSample->size(1), outputSample->size(1), 1); @@ -215,13 +215,13 @@ void THNN_(TemporalConvolution_updateGradInput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputFrame -= nFrame; - THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size(1), + THCTensor_(setStorage2d)(state, gradOutputWindow, THTensor_getStoragePtr(gradOutput), + gradOutput->storage_offset() + k*gradOutput->size(1), nFrame, outputFrameStride*gradOutput->size(1), gradOutput->size(1), 1); - THCTensor_(setStorage2d)(state, gradInputWindow, gradInput->storage, - gradInput->storageOffset+k*dW*gradInput->size(1), + THCTensor_(setStorage2d)(state, gradInputWindow, THTensor_getStoragePtr(gradInput), + gradInput->storage_offset()+k*dW*gradInput->size(1), nFrame, inputFrameStride*gradInput->size(1), kW*gradInput->size(1), 1); @@ -247,13 +247,13 @@ void THNN_(TemporalConvolution_updateGradInput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputSampleFrame -= nFrame; - THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size(1), + THCTensor_(setStorage2d)(state, gradOutputWindow, THTensor_getStoragePtr(gradOutputSample), + gradOutputSample->storage_offset() + k*gradOutputSample->size(1), nFrame, outputFrameStride*gradOutputSample->size(1), gradOutputSample->size(1), 1); - THCTensor_(setStorage2d)(state, gradInputWindow, gradInputSample->storage, - gradInputSample->storageOffset+k*dW*gradInputSample->size(1), + THCTensor_(setStorage2d)(state, gradInputWindow, THTensor_getStoragePtr(gradInputSample), + gradInputSample->storage_offset()+k*dW*gradInputSample->size(1), nFrame, inputFrameStride*gradInputSample->size(1), kW*gradInputSample->size(1), 1); @@ -324,13 +324,13 @@ void 
THNN_(TemporalConvolution_accGradParameters)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputFrame -= nFrame; - THCTensor_(setStorage2d)(state, inputWindow, input->storage, - input->storageOffset+k*dW*input->size(1), + THCTensor_(setStorage2d)(state, inputWindow, THTensor_getStoragePtr(input), + input->storage_offset()+k*dW*input->size(1), nFrame, inputFrameStride*input->size(1), kW*input->size(1), 1); - THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size(1), + THCTensor_(setStorage2d)(state, gradOutputWindow, THTensor_getStoragePtr(gradOutput), + gradOutput->storage_offset() + k*gradOutput->size(1), nFrame, outputFrameStride*gradOutput->size(1), gradOutput->size(1), 1); @@ -367,13 +367,13 @@ void THNN_(TemporalConvolution_accGradParameters)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputSampleFrame -= nFrame; - THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size(1), + THCTensor_(setStorage2d)(state, inputWindow, THTensor_getStoragePtr(inputSample), + inputSample->storage_offset()+k*dW*inputSample->size(1), nFrame, inputFrameStride*inputSample->size(1), kW*inputSample->size(1), 1); - THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size(1), + THCTensor_(setStorage2d)(state, gradOutputWindow, THTensor_getStoragePtr(gradOutputSample), + gradOutputSample->storage_offset() + k*gradOutputSample->size(1), nFrame, outputFrameStride*gradOutputSample->size(1), gradOutputSample->size(1), 1); diff --git a/aten/src/THCUNN/generic/TemporalRowConvolution.cu b/aten/src/THCUNN/generic/TemporalRowConvolution.cu index 83a32ca2a063d..208016e93fe96 100644 --- a/aten/src/THCUNN/generic/TemporalRowConvolution.cu +++ b/aten/src/THCUNN/generic/TemporalRowConvolution.cu @@ -151,7 +151,7 @@ void THNN_(TemporalRowConvolution_updateOutput)( THCTensor_(data)(state, columns)); THCTensor *output3d = THCTensor_(newWithStorage3d)( - state, output_n->storage, output_n->storageOffset, inputFrameSize, -1, + state, THTensor_getStoragePtr(output_n), output_n->storage_offset(), inputFrameSize, -1, 1, -1, nOutputFrame, -1); // weight: inputFrameSize x 1 x kW @@ -251,7 +251,7 @@ void THNN_(TemporalRowConvolution_updateGradInput)( THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); THCTensor *gradOutput3d = THCTensor_(newWithStorage3d)( - state, gradOutput_n->storage, gradOutput_n->storageOffset, + state, THTensor_getStoragePtr(gradOutput_n), gradOutput_n->storage_offset(), inputFrameSize, -1, 1, -1, nOutputFrame, -1); // weight: inputFrameSize x kW x 1 @@ -365,7 +365,7 @@ void THNN_(TemporalRowConvolution_accGradParameters)( THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); THCTensor *gradOutput3d = THCTensor_(newWithStorage3d)( - state, gradOutput_n->storage, gradOutput_n->storageOffset, + state, THTensor_getStoragePtr(gradOutput_n), gradOutput_n->storage_offset(), inputFrameSize, -1, 1, -1, nOutputFrame, -1); // Extract columns diff --git a/aten/src/THNN/generic/SparseLinear.c b/aten/src/THNN/generic/SparseLinear.c index c7a00e335dbc6..a28d4e78477ce 100644 --- a/aten/src/THNN/generic/SparseLinear.c +++ b/aten/src/THNN/generic/SparseLinear.c @@ -30,14 +30,14 @@ static bool THNN_(checkSize1D)(THTensor* t, int64_t size0) } static void THNN_(set1d)(THTensor *t, int64_t x0, real value) { - THStorage_(set)(t->storage, t->storageOffset + x0*t->stride(0), 
value); + THStorage_(set)(THTensor_getStoragePtr(t), t->storage_offset() + x0*t->stride(0), value); } static real THNN_(get3d)(const THTensor *t, int64_t x0, int64_t x1, int64_t x2) { - return THStorage_(get)(t->storage, t->storageOffset + + return THStorage_(get)(THTensor_getStoragePtr(t), t->storage_offset() + x0*t->stride(0) + x1*t->stride(1) + x2*t->stride(2)); } static real THNN_(get2d)(const THTensor *t, int64_t x0, int64_t x1) { - return THStorage_(get)(t->storage, t->storageOffset + + return THStorage_(get)(THTensor_getStoragePtr(t), t->storage_offset() + x0*t->stride(0) + x1*t->stride(1)); } diff --git a/aten/src/THNN/generic/SpatialConvolutionLocal.c b/aten/src/THNN/generic/SpatialConvolutionLocal.c index 6461285ca931f..d56cb198d88bf 100644 --- a/aten/src/THNN/generic/SpatialConvolutionLocal.c +++ b/aten/src/THNN/generic/SpatialConvolutionLocal.c @@ -57,8 +57,8 @@ static THTensor* THNN_(view_weight_local)(THTensor *_weight) int64_t s2 = weight->size(2); int64_t s3 = weight->size(3) * weight->size(4) * weight->size(5); THTensor *old_weight = weight; - weight = THTensor_(newWithStorage3d)(weight->storage, - weight->storageOffset, + weight = THTensor_(newWithStorage3d)(THTensor_getStoragePtr(weight), + weight->storage_offset(), s1, -1, s2, -1, s3, -1); THTensor_(free)(old_weight); } @@ -82,13 +82,13 @@ static void THNN_(SpatialConvolutionLocal_updateOutput_frame) THTensor_(copy)(output, bias); output3d = THTensor_(newWithStorage3d) - (output->storage, output->storageOffset, + (THTensor_getStoragePtr(output), output->storage_offset(), outputHeight * outputWidth, 1, nOutputPlane, outputHeight * outputWidth, 1, nOutputPlane * outputHeight * outputWidth); finput3d = THTensor_(newWithStorage3d) - (finput->storage, finput->storageOffset, + (THTensor_getStoragePtr(finput), finput->storage_offset(), outputHeight * outputWidth, 1, kW * kH * nInputPlane, outputHeight * outputWidth, 1, kW * kH * nInputPlane * outputHeight * outputWidth); @@ -178,11 +178,11 @@ static void THNN_(SpatialConvolutionLocal_updateGradInput_frame) int64_t nOutputPlane, int64_t outputWidth, int64_t outputHeight) { THTensor *gradOutput3d, *fgradInput3d; - gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset, + gradOutput3d = THTensor_(newWithStorage3d)(THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), outputHeight*outputWidth, 1, nOutputPlane, outputHeight*outputWidth, 1, nOutputPlane*outputHeight*outputWidth); - fgradInput3d = THTensor_(newWithStorage3d)(fgradInput->storage, fgradInput->storageOffset, + fgradInput3d = THTensor_(newWithStorage3d)(THTensor_getStoragePtr(fgradInput), fgradInput->storage_offset(), outputHeight*outputWidth, 1, kW*kH*nInputPlane, outputHeight*outputWidth, 1, kW*kH*nInputPlane*outputHeight*outputWidth); @@ -280,11 +280,11 @@ static void THNN_(SpatialConvolutionLocal_accGradParameters_frame) { THTensor *gradOutput3d, *finput3d; - gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset, + gradOutput3d = THTensor_(newWithStorage3d)(THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), outputHeight*outputWidth, 1, nOutputPlane, outputHeight*outputWidth, 1, nOutputPlane*outputHeight*outputWidth); - finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset, + finput3d = THTensor_(newWithStorage3d)(THTensor_getStoragePtr(finput), finput->storage_offset(), outputHeight*outputWidth, 1, 1, kW*kH*nInputPlane*outputHeight*outputWidth, kW*kH*nInputPlane, outputHeight*outputWidth); diff 
--git a/aten/src/THNN/generic/SpatialConvolutionMM.c b/aten/src/THNN/generic/SpatialConvolutionMM.c index 434320a3cfdfd..7547dbf12e059 100644 --- a/aten/src/THNN/generic/SpatialConvolutionMM.c +++ b/aten/src/THNN/generic/SpatialConvolutionMM.c @@ -84,7 +84,7 @@ static THTensor* THNN_(newViewWeightMM2d)(THTensor *weight) { int64_t s1 = weight->size(0); int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3); THTensor *old_weight = weight; - weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, + weight = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(weight), weight->storage_offset(), s1, -1, s2, -1); THTensor_(free)(old_weight); } @@ -117,13 +117,13 @@ static void THNN_(SpatialConvolutionMM_updateOutput_frame)( nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight); - output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset, + output2d = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(output), output->storage_offset(), nOutputPlane, -1, outputHeight*outputWidth, -1); if (bias) { for(i = 0; i < nOutputPlane; i++) THVector_(fill) - (THStorage_(data)(output->storage) + output->storageOffset + output->stride(0) * i, + (THStorage_(data)(THTensor_getStoragePtr(output)) + output->storage_offset() + output->stride(0) * i, THTensor_(get1d)(bias, i), outputHeight*outputWidth); } else { THTensor_(zero)(output); @@ -228,7 +228,7 @@ static void THNN_(SpatialConvolutionMM_updateGradInput_frame)( int padH) { THTensor *gradOutput2d = THTensor_(newWithStorage2d) - (gradOutput->storage, gradOutput->storageOffset, + (THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), gradOutput->size(0), -1, gradOutput->size(1)*gradOutput->size(2), -1); THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); @@ -318,7 +318,7 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( { int64_t i; THTensor *gradOutput2d = THTensor_(newWithStorage2d) - (gradOutput->storage, gradOutput->storageOffset, + (THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), gradOutput->size(0), -1, gradOutput->size(1)*gradOutput->size(2), -1); @@ -334,10 +334,10 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( { int64_t k; real sum = 0; - real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride(0); + real *data = THStorage_(data)(THTensor_getStoragePtr(gradOutput2d)) + gradOutput2d->storage_offset() + i*gradOutput2d->stride(0); for(k = 0; k < gradOutput2d->size(1); k++) sum += data[k]; - (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] += scale*sum; + (THStorage_(data)(THTensor_getStoragePtr(gradBias)) + gradBias->storage_offset())[i] += scale*sum; } } diff --git a/aten/src/THNN/generic/TemporalConvolution.c b/aten/src/THNN/generic/TemporalConvolution.c index 2c3e1da84de5e..00ddcb226fe26 100644 --- a/aten/src/THNN/generic/TemporalConvolution.c +++ b/aten/src/THNN/generic/TemporalConvolution.c @@ -88,13 +88,13 @@ void THNN_(TemporalConvolution_updateOutput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputFrame -= nFrame; - THTensor_(setStorage2d)(inputWindow, input->storage, - input->storageOffset+k*dW*input->size(1), + THTensor_(setStorage2d)(inputWindow, THTensor_getStoragePtr(input), + input->storage_offset()+k*dW*input->size(1), nFrame, inputFrameStride*input->size(1), kW*input->size(1), 1); - THTensor_(setStorage2d)(outputWindow, output->storage, - output->storageOffset + k*output->size(1), + 
THTensor_(setStorage2d)(outputWindow, THTensor_getStoragePtr(output), + output->storage_offset() + k*output->size(1), nFrame, outputFrameStride*output->size(1), output->size(1), 1); @@ -136,13 +136,13 @@ void THNN_(TemporalConvolution_updateOutput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputSampleFrame -= nFrame; - THTensor_(setStorage2d)(inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size(1), + THTensor_(setStorage2d)(inputWindow, THTensor_getStoragePtr(inputSample), + inputSample->storage_offset()+k*dW*inputSample->size(1), nFrame, inputFrameStride*inputSample->size(1), kW*inputSample->size(1), 1); - THTensor_(setStorage2d)(outputWindow, outputSample->storage, - outputSample->storageOffset + k*outputSample->size(1), + THTensor_(setStorage2d)(outputWindow, THTensor_getStoragePtr(outputSample), + outputSample->storage_offset() + k*outputSample->size(1), nFrame, outputFrameStride*outputSample->size(1), outputSample->size(1), 1); @@ -210,13 +210,13 @@ void THNN_(TemporalConvolution_updateGradInput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputFrame -= nFrame; - THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size(1), + THTensor_(setStorage2d)(gradOutputWindow, THTensor_getStoragePtr(gradOutput), + gradOutput->storage_offset() + k*gradOutput->size(1), nFrame, outputFrameStride*gradOutput->size(1), gradOutput->size(1), 1); - THTensor_(setStorage2d)(gradInputWindow, gradInput->storage, - gradInput->storageOffset+k*dW*gradInput->size(1), + THTensor_(setStorage2d)(gradInputWindow, THTensor_getStoragePtr(gradInput), + gradInput->storage_offset()+k*dW*gradInput->size(1), nFrame, inputFrameStride*gradInput->size(1), kW*gradInput->size(1), 1); @@ -243,13 +243,13 @@ void THNN_(TemporalConvolution_updateGradInput)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputSampleFrame -= nFrame; - THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size(1), + THTensor_(setStorage2d)(gradOutputWindow, THTensor_getStoragePtr(gradOutputSample), + gradOutputSample->storage_offset() + k*gradOutputSample->size(1), nFrame, outputFrameStride*gradOutputSample->size(1), gradOutputSample->size(1), 1); - THTensor_(setStorage2d)(gradInputWindow, gradInputSample->storage, - gradInputSample->storageOffset+k*dW*gradInputSample->size(1), + THTensor_(setStorage2d)(gradInputWindow, THTensor_getStoragePtr(gradInputSample), + gradInputSample->storage_offset()+k*dW*gradInputSample->size(1), nFrame, inputFrameStride*gradInputSample->size(1), kW*gradInputSample->size(1), 1); @@ -319,13 +319,13 @@ void THNN_(TemporalConvolution_accGradParameters)( int64_t nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputFrame -= nFrame; - THTensor_(setStorage2d)(inputWindow, input->storage, - input->storageOffset+k*dW*input->size(1), + THTensor_(setStorage2d)(inputWindow, THTensor_getStoragePtr(input), + input->storage_offset()+k*dW*input->size(1), nFrame, inputFrameStride*input->size(1), kW*input->size(1), 1); - THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size(1), + THTensor_(setStorage2d)(gradOutputWindow, THTensor_getStoragePtr(gradOutput), + gradOutput->storage_offset() + k*gradOutput->size(1), nFrame, outputFrameStride*gradOutput->size(1), gradOutput->size(1), 1); @@ -362,13 +362,13 @@ void THNN_(TemporalConvolution_accGradParameters)( int64_t 
nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; nOutputSampleFrame -= nFrame; - THTensor_(setStorage2d)(inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size(1), + THTensor_(setStorage2d)(inputWindow, THTensor_getStoragePtr(inputSample), + inputSample->storage_offset()+k*dW*inputSample->size(1), nFrame, inputFrameStride*inputSample->size(1), kW*inputSample->size(1), 1); - THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size(1), + THTensor_(setStorage2d)(gradOutputWindow, THTensor_getStoragePtr(gradOutputSample), + gradOutputSample->storage_offset() + k*gradOutputSample->size(1), nFrame, outputFrameStride*gradOutputSample->size(1), gradOutputSample->size(1), 1); diff --git a/aten/src/THNN/generic/TemporalRowConvolution.c b/aten/src/THNN/generic/TemporalRowConvolution.c index 5a85065058670..b623e5a2ad7fd 100644 --- a/aten/src/THNN/generic/TemporalRowConvolution.c +++ b/aten/src/THNN/generic/TemporalRowConvolution.c @@ -148,7 +148,7 @@ static void THNN_(TemporalRowConvolution_updateOutput_frame)( int64_t i; THTensor *output3d = THTensor_(newWithStorage3d)( - output->storage, output->storageOffset, + THTensor_getStoragePtr(output), output->storage_offset(), inputFrameSize, -1, 1, -1, nOutputFrame, -1); @@ -161,7 +161,7 @@ static void THNN_(TemporalRowConvolution_updateOutput_frame)( if (bias != NULL) { for (i = 0; i < inputFrameSize; i++) THVector_(fill) - (THStorage_(data)(output->storage) + output->storageOffset + (THStorage_(data)(THTensor_getStoragePtr(output)) + output->storage_offset() + output->stride(0) * i, THTensor_(get1d)(bias, i), nOutputFrame); } @@ -261,7 +261,7 @@ static void THNN_(TemporalRowConvolution_updateGradInput_frame)( int64_t nOutputFrame) { THTensor *gradOutput3d = THTensor_(newWithStorage3d)( - gradOutput->storage, gradOutput->storageOffset, + THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), inputFrameSize, -1, 1, -1, nOutputFrame, -1); @@ -372,7 +372,7 @@ static void THNN_(TemporalRowConvolution_accGradParameters_frame)( int64_t i; THTensor *gradOutput3d = THTensor_(newWithStorage3d)( - gradOutput->storage, gradOutput->storageOffset, + THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), gradOutput->size(0), -1, 1, -1, gradOutput->size(1), -1); @@ -389,13 +389,13 @@ static void THNN_(TemporalRowConvolution_accGradParameters_frame)( for (i = 0; i < gradBias->size(0); i++) { int64_t k; real sum = 0; - real *data = THStorage_(data)(gradOutput3d->storage) - + gradOutput3d->storageOffset + real *data = THStorage_(data)(THTensor_getStoragePtr(gradOutput3d)) + + gradOutput3d->storage_offset() + i * gradOutput3d->stride(0); for (k = 0; k < gradOutput3d->size(2); k++) { sum += data[k]; } - (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] + (THStorage_(data)(THTensor_getStoragePtr(gradBias)) + gradBias->storage_offset())[i] += scale * sum; } } diff --git a/aten/src/THNN/generic/VolumetricConvolutionMM.c b/aten/src/THNN/generic/VolumetricConvolutionMM.c index 525fa5928edc6..9f1c83ce8af3a 100644 --- a/aten/src/THNN/generic/VolumetricConvolutionMM.c +++ b/aten/src/THNN/generic/VolumetricConvolutionMM.c @@ -116,7 +116,7 @@ static THTensor* THNN_(newViewWeight)(THTensor *weight) int64_t s1 = weight->size(0); int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4); THTensor *old_weight = weight; - weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, + weight = 
THTensor_(newWithStorage2d)(THTensor_getStoragePtr(weight), weight->storage_offset(), s1, -1, s2, -1); THTensor_(free)(old_weight); } @@ -427,7 +427,7 @@ static void THNN_(VolumetricConvolutionMM_updateOutput_frame)( ); output2d = THTensor_(newWithStorage2d)( - output->storage, output->storageOffset, nOutputPlane, -1, + THTensor_getStoragePtr(output), output->storage_offset(), nOutputPlane, -1, outputDepth*outputHeight*outputWidth, -1 ); @@ -435,7 +435,7 @@ static void THNN_(VolumetricConvolutionMM_updateOutput_frame)( for (i = 0; i < nOutputPlane; i++) { THVector_(fill)( - THStorage_(data)(output->storage)+output->storageOffset+output->stride(0)*i, + THStorage_(data)(THTensor_getStoragePtr(output))+output->storage_offset()+output->stride(0)*i, THTensor_(get1d)(bias, i), outputDepth*outputHeight*outputWidth ); @@ -570,7 +570,7 @@ static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)( int pH) { THTensor *gradOutput2d = THTensor_(newWithStorage2d)( - gradOutput->storage, gradOutput->storageOffset, + THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), gradOutput->size(0), -1, gradOutput->size(1)*gradOutput->size(2)*gradOutput->size(3), -1 ); @@ -676,7 +676,7 @@ static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( { int64_t i; THTensor *gradOutput2d = THTensor_(newWithStorage2d)( - gradOutput->storage, gradOutput->storageOffset, + THTensor_getStoragePtr(gradOutput), gradOutput->storage_offset(), gradOutput->size(0), -1, gradOutput->size(1)*gradOutput->size(2)*gradOutput->size(3), -1 ); @@ -693,11 +693,11 @@ static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( { int64_t k; real sum = 0; - real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride(0); + real *data = THStorage_(data)(THTensor_getStoragePtr(gradOutput2d)) + gradOutput2d->storage_offset() + i*gradOutput2d->stride(0); for (k = 0; k < gradOutput2d->size(1); k++) sum += data[k]; - (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] += scale * sum; + (THStorage_(data)(THTensor_getStoragePtr(gradBias)) + gradBias->storage_offset())[i] += scale * sum; } } From 9ee513365121cd387e11987c66db6599ac53ded7 Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Sun, 22 Jul 2018 20:08:56 -0700 Subject: [PATCH 22/23] Fix dataloader hang when it is not completely iterated (#9655) Summary: second trial of https://github.com/pytorch/pytorch/pull/7140 cc csarofeen Let's see if this works. It passes everything locally. 
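For context, a minimal sketch (not part of the patch) of the scenario this fixes — iterating a multi-worker DataLoader only partially and then dropping the iterator; `ToyDataset` is a made-up placeholder, and any map-style dataset with `num_workers > 0` exercises the same shutdown path:
```
import torch
from torch.utils.data import Dataset, DataLoader

class ToyDataset(Dataset):
    # Hypothetical stand-in dataset; any map-style dataset works here.
    def __len__(self):
        return 1000

    def __getitem__(self, idx):
        return torch.tensor(idx)

loader = DataLoader(ToyDataset(), batch_size=4, num_workers=4,
                    pin_memory=torch.cuda.is_available())

for i, batch in enumerate(loader):
    if i == 3:
        break  # iterator dropped before being exhausted

# Previously the worker processes (and the pin-memory thread, when enabled)
# could be left blocked here instead of shutting down; with the done_event /
# cancel_join_thread changes below they exit once the iterator is deleted.
```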
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9655 Differential Revision: D8940177 Pulled By: SsnL fbshipit-source-id: 8d6340fc9f7355c71e1e26b262da166402faa158 --- test/test_dataloader.py | 118 +++++++++++++++++++-------------- torch/utils/data/dataloader.py | 85 ++++++++++++------------ 2 files changed, 110 insertions(+), 93 deletions(-) diff --git a/test/test_dataloader.py b/test/test_dataloader.py index d1dabd266b878..90f927d9168f1 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -205,9 +205,12 @@ class SleepDataset(Dataset): def __init__(self, size, sleep_sec): self.size = size self.sleep_sec = sleep_sec + self.sleeped = False def __getitem__(self, idx): - time.sleep(self.sleep_sec) + if not self.sleeped: + time.sleep(self.sleep_sec) + self.sleeped = True return idx def __len__(self): @@ -251,7 +254,7 @@ def __len__(self): def _test_timeout(): - dataset = SleepDataset(10, 10) + dataset = SleepDataset(10, 3) dataloader = DataLoader(dataset, batch_size=2, num_workers=2, timeout=1) _ = next(iter(dataloader)) @@ -474,36 +477,40 @@ def test_error_workers(self): @unittest.skipIf(IS_WINDOWS, "FIXME: stuck test") @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") def test_partial_workers(self): - "check that workers exit even if the iterator is not exhausted" - loader = iter(DataLoader(self.dataset, batch_size=2, num_workers=4, pin_memory=True)) - workers = loader.workers - worker_manager_thread = loader.worker_manager_thread - for i, sample in enumerate(loader): - if i == 3: - break - del loader - for w in workers: - w.join(JOIN_TIMEOUT) - self.assertFalse(w.is_alive(), 'subprocess not terminated') - self.assertEqual(w.exitcode, 0) - worker_manager_thread.join(JOIN_TIMEOUT) - self.assertFalse(worker_manager_thread.is_alive()) + r"""Check that workers exit even if the iterator is not exhausted.""" + for pin_memory in (True, False): + loader = iter(DataLoader(self.dataset, batch_size=2, num_workers=4, pin_memory=pin_memory)) + workers = loader.workers + if pin_memory: + pin_memory_thread = loader.pin_memory_thread + for i, sample in enumerate(loader): + if i == 10: + break + del loader + for w in workers: + w.join(JOIN_TIMEOUT) + self.assertFalse(w.is_alive(), 'subprocess not terminated') + if pin_memory: + pin_memory_thread.join(JOIN_TIMEOUT) + self.assertFalse(pin_memory_thread.is_alive()) @staticmethod - def _manager_process(dataset, worker_pids, manager_exit_event): + def _main_process(dataset, worker_pids, main_exit_event, raise_error): loader = iter(DataLoader(dataset, batch_size=2, num_workers=4, pin_memory=True)) workers = loader.workers for i in range(len(workers)): worker_pids[i] = int(workers[i].pid) for i, sample in enumerate(loader): if i == 3: - break - # Simulate a dirty exit of the manager process - manager_exit_event.set() - if IS_WINDOWS: - os.system('taskkill /PID ' + str(os.getpid()) + ' /F') - else: - os.kill(os.getpid(), signal.SIGKILL) + # Simulate an exit of the manager process + main_exit_event.set() + if raise_error: + raise RuntimeError('Error') + else: + if IS_WINDOWS: + os.system('taskkill /PID ' + str(os.getpid()) + ' /F') + else: + os.kill(os.getpid(), signal.SIGKILL) @staticmethod def _is_process_alive(pid, pname): @@ -526,33 +533,44 @@ def _is_process_alive(pid, pname): "spawn start method is not supported in Python 2, \ but we need it for creating another process with CUDA") @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") - def test_manager_unclean_exit(self): - '''there might be ConnectionResetError or leaked 
semaphore warning (due to dirty process exit), \ + def test_main_process_unclean_exit(self): + r'''There might be ConnectionResetError or leaked semaphore warning (due to dirty process exit), \ but they are all safe to ignore''' - worker_pids = mp.Array('i', [0] * 4) - manager_exit_event = mp.Event() - p = mp.Process(target=TestDataLoader._manager_process, - args=(self.dataset, worker_pids, manager_exit_event)) - p.start() - - manager_exit_event.wait() - - exit_status = [False] * len(worker_pids) - start_time = time.time() - pname = 'python' - while True: - for i in range(len(worker_pids)): - pid = worker_pids[i] - if not exit_status[i]: - if not TestDataLoader._is_process_alive(pid, pname): - exit_status[i] = True - if all(exit_status): - break - else: - time.sleep(1) - self.assertFalse(time.time() - start_time > MANAGER_STATUS_CHECK_INTERVAL + JOIN_TIMEOUT, - 'subprocess not terminated') + # `raise_error` controls if the main process is KILL-ed by OS or just + # simply raises an error. Both cases are interesting because + # 1. In case of it is KILL-ed by OS, the workers need to automatically + # discover that their parent is dead and exit gracefully. + # 2. In case of it raises an error itself, the parent process needs to + # take care of exiting the worker and then exits itself gracefully. + for raise_error in (True, False): + worker_pids = mp.Array('i', [0] * 4) + + main_exit_event = mp.Event() + p = mp.Process(target=TestDataLoader._main_process, + args=(self.dataset, worker_pids, main_exit_event, raise_error)) + p.start() + worker_pids[-1] = p.pid + + main_exit_event.wait() + + exit_status = [False] * len(worker_pids) + start_time = time.time() + pname = 'python' + while True: + for i in range(len(worker_pids)): + pid = worker_pids[i] + if not exit_status[i]: + if not TestDataLoader._is_process_alive(pid, pname): + exit_status[i] = True + if all(exit_status): + break + else: + if time.time() - start_time > MANAGER_STATUS_CHECK_INTERVAL + JOIN_TIMEOUT: + self.fail('subprocess not terminated') + time.sleep(1) + p.join(MANAGER_STATUS_CHECK_INTERVAL + JOIN_TIMEOUT - (time.time() - start_time)) + self.assertFalse(p.is_alive(), 'main process not terminated') def test_len(self): def check_len(dl, expected): @@ -596,7 +614,7 @@ def __len__(self): self.assertIsInstance(batch, tt) @unittest.skipIf(not TEST_NUMPY, "numpy unavailable") - def test_default_colate_bad_numpy_types(self): + def test_default_collate_bad_numpy_types(self): import numpy as np # Should be a no-op diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 10457a6653602..60789e9fb6299 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -72,7 +72,7 @@ def is_alive(self): return os.getppid() == self.manager_pid -def _worker_loop(dataset, index_queue, data_queue, collate_fn, seed, init_fn, worker_id): +def _worker_loop(dataset, index_queue, data_queue, done_event, collate_fn, seed, init_fn, worker_id): global _use_shared_memory _use_shared_memory = True @@ -86,6 +86,11 @@ def _worker_loop(dataset, index_queue, data_queue, collate_fn, seed, init_fn, wo random.seed(seed) torch.manual_seed(seed) + # Do not wait for putting thread to join when this worker exits. Otherwise, + # this worker may always be waiting to put and doesn't check index_queue + # and done_event for termination signal. 
+ data_queue.cancel_join_thread() + if init_fn is not None: init_fn(worker_id) @@ -95,11 +100,13 @@ def _worker_loop(dataset, index_queue, data_queue, collate_fn, seed, init_fn, wo try: r = index_queue.get(timeout=MANAGER_STATUS_CHECK_INTERVAL) except queue.Empty: - if watchdog.is_alive(): + if watchdog.is_alive() and not done_event.is_set(): continue else: break - if r is None: + # use done_event so that we can get faster exiting signal even if there + # are still indices in index_queue + if r is None or done_event.is_set(): break idx, batch_indices = r try: @@ -111,7 +118,7 @@ def _worker_loop(dataset, index_queue, data_queue, collate_fn, seed, init_fn, wo del samples -def _worker_manager_loop(in_queue, out_queue, done_event, pin_memory, device_id): +def _pin_memory_loop(in_queue, out_queue, done_event, pin_memory, device_id): if pin_memory: torch.cuda.set_device(device_id) @@ -122,7 +129,7 @@ def _worker_manager_loop(in_queue, out_queue, done_event, pin_memory, device_id) if done_event.is_set(): return raise - if r is None: + if r is None or done_event.is_set(): break if isinstance(r[1], ExceptionWrapper): out_queue.put(r) @@ -242,7 +249,6 @@ def __init__(self, loader): self.num_workers = loader.num_workers self.pin_memory = loader.pin_memory and torch.cuda.is_available() self.timeout = loader.timeout - self.done_event = threading.Event() self.sample_iter = iter(self.batch_sampler) @@ -252,35 +258,32 @@ def __init__(self, loader): self.worker_init_fn = loader.worker_init_fn self.index_queues = [multiprocessing.Queue() for _ in range(self.num_workers)] self.worker_queue_idx = 0 - self.worker_result_queue = multiprocessing.SimpleQueue() + self.worker_result_queue = multiprocessing.Queue() self.batches_outstanding = 0 self.worker_pids_set = False self.shutdown = False self.send_idx = 0 self.rcvd_idx = 0 self.reorder_dict = {} + self.done_event = multiprocessing.Event() self.workers = [ multiprocessing.Process( target=_worker_loop, args=(self.dataset, self.index_queues[i], - self.worker_result_queue, self.collate_fn, base_seed + i, + self.worker_result_queue, self.done_event, + self.collate_fn, base_seed + i, self.worker_init_fn, i)) for i in range(self.num_workers)] - if self.pin_memory or self.timeout > 0: + if self.pin_memory: self.data_queue = queue.Queue() - if self.pin_memory: - maybe_device_id = torch.cuda.current_device() - else: - # do not initialize cuda context if not necessary - maybe_device_id = None - self.worker_manager_thread = threading.Thread( - target=_worker_manager_loop, + self.pin_memory_thread = threading.Thread( + target=_pin_memory_loop, args=(self.worker_result_queue, self.data_queue, self.done_event, self.pin_memory, - maybe_device_id)) - self.worker_manager_thread.daemon = True - self.worker_manager_thread.start() + torch.cuda.current_device())) + self.pin_memory_thread.daemon = True + self.pin_memory_thread.start() else: self.data_queue = self.worker_result_queue @@ -366,33 +369,29 @@ def __getstate__(self): raise NotImplementedError("_DataLoaderIter cannot be pickled") def _shutdown_workers(self): - try: - if not self.shutdown: - self.shutdown = True - self.done_event.set() - for q in self.index_queues: - q.put(None) - # if some workers are waiting to put, make place for them - try: - while not self.worker_result_queue.empty(): - self.worker_result_queue.get() - except (FileNotFoundError, ImportError): - # Many weird errors can happen here due to Python - # shutting down. These are more like obscure Python bugs. 
- # FileNotFoundError can happen when we rebuild the fd - # fetched from the queue but the socket is already closed - # from the worker side. - # ImportError can happen when the unpickler loads the - # resource from `get`. - pass - # done_event should be sufficient to exit worker_manager_thread, - # but be safe here and put another None - self.worker_result_queue.put(None) - finally: - # removes pids no matter what + if not self.shutdown: + self.shutdown = True + # removes pids from the C side data structure first so worker + # termination afterwards won't trigger false positive error report. if self.worker_pids_set: _remove_worker_pids(id(self)) self.worker_pids_set = False + self.done_event.set() + if self.pin_memory: + # Sending `None` to `pin_memory_thread` must be before + # stopping worker processes because the workers may leave + # corrupted data in `worker_result_queue`, causing + # `pin_memory_thread` unable to read and terminate properly. + self.worker_result_queue.put(None) + # Workers can't be waiting to put be cause their output queue + # is a multiprocessing.Queue and its .put is non-blocking. + # They can only be waiting to get, so we put `None` here. + for q in self.index_queues: + q.put(None) + for w in self.workers: + w.join() + if self.pin_memory: + self.pin_memory_thread.join() def __del__(self): if self.num_workers > 0: From 53083b83530441dcc08aad07886a72a04dda845e Mon Sep 17 00:00:00 2001 From: peter Date: Mon, 23 Jul 2018 06:35:39 -0700 Subject: [PATCH 23/23] Remove CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS and fix CUDA 8 build on Windows (#9491) (#9491) Summary: Fixes #9092. Pull Request resolved: https://github.com/pytorch/pytorch/pull/9491 Pull Request resolved: https://github.com/pytorch/pytorch/pull/9693 Differential Revision: D8946850 Pulled By: ezyang fbshipit-source-id: bd816f459ab70f6b4a0983305a1ce341bb633707 --- .gitignore | 1 + CMakeLists.txt | 4 --- aten/src/ATen/Backtrace.cpp | 1 + aten/src/ATen/Backtrace.h | 6 ++-- aten/src/ATen/Device.h | 4 +-- aten/src/ATen/ExpandUtils.h | 2 +- aten/src/ATen/Half.h | 4 +-- aten/src/ATen/SmallVector.h | 4 ++- aten/src/ATen/SparseTensorImpl.h | 2 +- aten/src/ATen/TensorOptions.h | 2 +- aten/src/ATen/cuda/CUDAStream.h | 36 ++++++++++--------- aten/src/ATen/function_wrapper.py | 2 +- aten/src/TH/THGeneral.h.in | 3 ++ aten/src/TH/THStorageClass.hpp | 2 +- aten/src/TH/THStorageFunctions.hpp | 18 +++++----- aten/src/TH/THTensor.hpp | 4 +-- caffe2/utils/proto_wrap.cc | 7 ++-- tools/autograd/templates/Functions.h | 1 + torch/csrc/autograd/engine.h | 5 +-- torch/csrc/autograd/function.h | 3 +- torch/csrc/autograd/functions/basic_ops.h | 7 ++-- torch/csrc/autograd/functions/utils.h | 7 ++-- torch/csrc/autograd/grad_mode.h | 6 ++-- torch/csrc/autograd/profiler.h | 15 ++++---- torch/csrc/autograd/saved_variable.h | 2 +- torch/csrc/autograd/variable.h | 2 +- torch/csrc/jit/autodiff.h | 9 ++--- torch/csrc/jit/export.h | 4 +-- torch/csrc/jit/graph_executor.h | 8 ++--- torch/csrc/jit/import.h | 2 +- torch/csrc/jit/interned_strings.h | 3 +- torch/csrc/jit/interpreter.h | 4 ++- torch/csrc/jit/ir.h | 23 ++++++------ torch/csrc/jit/operator.h | 4 +-- torch/csrc/jit/passes/batch_mm.h | 2 +- torch/csrc/jit/passes/canonicalize.h | 2 +- .../passes/common_subexpression_elimination.h | 2 +- .../jit/passes/create_autodiff_subgraphs.h | 2 +- torch/csrc/jit/passes/dead_code_elimination.h | 4 +-- torch/csrc/jit/passes/decompose_addmm.cpp | 1 + torch/csrc/jit/passes/decompose_addmm.h | 2 +- torch/csrc/jit/passes/erase_number_types.h | 2 +- 
torch/csrc/jit/passes/graph_fuser.h | 2 +- torch/csrc/jit/passes/inplace_check.h | 2 +- torch/csrc/jit/passes/loop_unrolling.h | 2 +- torch/csrc/jit/passes/lower_grad_of.h | 2 +- torch/csrc/jit/passes/lower_tuples.h | 2 +- torch/csrc/jit/passes/onnx.h | 4 +-- torch/csrc/jit/passes/peephole.h | 2 +- torch/csrc/jit/passes/remove_expands.cpp | 1 + torch/csrc/jit/passes/remove_expands.h | 2 +- torch/csrc/jit/passes/shape_analysis.h | 4 ++- torch/csrc/jit/passes/specialize_undef.h | 2 +- torch/csrc/jit/passes/to_batch.h | 6 ++-- torch/csrc/jit/script/compiler.h | 20 +++++------ torch/csrc/jit/test_jit.cpp | 2 +- torch/csrc/jit/tracer.h | 25 ++++++------- torch/csrc/jit/type.h | 19 +++++----- torch/csrc/onnx/onnx.h | 3 +- 59 files changed, 176 insertions(+), 148 deletions(-) diff --git a/.gitignore b/.gitignore index 09e5ed86bbfe5..4d684662d166c 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ torch/csrc/nn/THNN.cpp torch/csrc/nn/THNN.cwrap torch/lib/*.a* torch/lib/*.dll* +torch/lib/*.exe* torch/lib/*.dylib* torch/lib/*.h torch/lib/*.lib diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e302869a8a8a..651e230ab35ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -152,10 +152,6 @@ endif() # ---[ CMake scripts + modules list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) -if (MSVC AND ${BUILD_SHARED_LIBS}) - set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) -endif() - # ---[ CMake build directories set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) diff --git a/aten/src/ATen/Backtrace.cpp b/aten/src/ATen/Backtrace.cpp index a145c3b75c7ec..8ee61c73bf12c 100644 --- a/aten/src/ATen/Backtrace.cpp +++ b/aten/src/ATen/Backtrace.cpp @@ -1,4 +1,5 @@ #include +#include #include #include diff --git a/aten/src/ATen/Backtrace.h b/aten/src/ATen/Backtrace.h index 99f0923180583..347c430d61b75 100644 --- a/aten/src/ATen/Backtrace.h +++ b/aten/src/ATen/Backtrace.h @@ -4,9 +4,11 @@ #include #include +#include + namespace at { /// Utility to demangle a C++ symbol name. -std::string demangle(const char* name); +AT_API std::string demangle(const char* name); /// Returns the printable name of the type. template @@ -19,7 +21,7 @@ inline const char* demangle_type() { #endif // __GXX_RTTI } -std::string get_backtrace( +AT_API std::string get_backtrace( size_t frames_to_skip = 0, size_t maximum_number_of_frames = 64, bool skip_python_frames = true); diff --git a/aten/src/ATen/Device.h b/aten/src/ATen/Device.h index 2d9c27f8d4cc3..4795b77a1bff0 100644 --- a/aten/src/ATen/Device.h +++ b/aten/src/ATen/Device.h @@ -111,8 +111,8 @@ struct Device { }; } // namespace at -std::ostream& operator<<(std::ostream& stream, at::Device::Type type); -std::ostream& operator<<(std::ostream& stream, const at::Device& device); +AT_API std::ostream& operator<<(std::ostream& stream, at::Device::Type type); +AT_API std::ostream& operator<<(std::ostream& stream, const at::Device& device); namespace std { template<> struct hash diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 4ed5d6254320f..e0929784a5bda 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -10,7 +10,7 @@ namespace at { AT_API std::vector infer_size(IntList a, IntList b); -std::tuple, std::vector > inferExpandGeometry( +AT_API std::tuple, std::vector > inferExpandGeometry( IntList tensor_sizes, IntList tensor_strides, IntList sizes); // avoid copy-construction of Tensor by using a reference_wrapper. 
diff --git a/aten/src/ATen/Half.h b/aten/src/ATen/Half.h index 949a18848f726..d740008305cd8 100644 --- a/aten/src/ATen/Half.h +++ b/aten/src/ATen/Half.h @@ -35,8 +35,8 @@ namespace at { namespace detail { -float halfbits2float(unsigned short bits); -unsigned short float2halfbits(float value); +AT_API float halfbits2float(unsigned short bits); +AT_API unsigned short float2halfbits(float value); } diff --git a/aten/src/ATen/SmallVector.h b/aten/src/ATen/SmallVector.h index 3a5926a06df8d..238a181296f4f 100644 --- a/aten/src/ATen/SmallVector.h +++ b/aten/src/ATen/SmallVector.h @@ -33,6 +33,8 @@ #include #include +#include + #if __GNUG__ && __GNUC__ < 5 #define AT_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T) #else @@ -57,7 +59,7 @@ static inline uint64_t NextPowerOf2(uint64_t A) { } /// This is all the non-templated stuff common to all SmallVectors. -class SmallVectorBase { +class AT_API SmallVectorBase { protected: void *BeginX, *EndX, *CapacityX; diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 081e48a07d722..2093b45339137 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -5,7 +5,7 @@ #include "ATen/Error.h" namespace at { -struct SparseTensorImpl : public TensorImpl { +struct AT_API SparseTensorImpl : public TensorImpl { // Stored in COO format, indices + values. // Ideal INVARIANTS: diff --git a/aten/src/ATen/TensorOptions.h b/aten/src/ATen/TensorOptions.h index c8aeca2bbeb90..53ad9d827c628 100644 --- a/aten/src/ATen/TensorOptions.h +++ b/aten/src/ATen/TensorOptions.h @@ -19,7 +19,7 @@ namespace at { /// `torch::TensorOptions` subclass of this `TensorOptions`, which changes /// `type()` to return a variable type instead of a tensor type, such that /// variables are created inside factory methods, instead of tensors. -struct TensorOptions { +struct AT_API TensorOptions { TensorOptions() : TensorOptions(/*use_thread_local_default_options=*/true) {} /// Constructs the `TensorOptions` with defaults taken from the thread local diff --git a/aten/src/ATen/cuda/CUDAStream.h b/aten/src/ATen/cuda/CUDAStream.h index 72fc752b717a4..545bccfdfbcb7 100644 --- a/aten/src/ATen/cuda/CUDAStream.h +++ b/aten/src/ATen/cuda/CUDAStream.h @@ -5,6 +5,8 @@ #include "cuda_runtime_api.h" +#include + /* * A CUDA stream interface with no CUDA build dependency. * @@ -23,31 +25,31 @@ namespace detail { // Pointer-based API (for internal use) // Note: ATen/Context is preferred to work with streams safely -CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device); -CUDAStreamInternals* CUDAStream_getDefaultStream(); +AT_API CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device); +AT_API CUDAStreamInternals* CUDAStream_getDefaultStream(); -CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority); +AT_API CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority); -CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device); -CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream(); +AT_API CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device); +AT_API CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream(); // Note: these Unsafe gets should NEVER be used and are only here for legacy // purposes. Once those uses are gone they should be removed. 
-CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device); -CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe(); +AT_API CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device); +AT_API CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe(); -void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* internals); -void CUDAStream_uncheckedSetStreamOnDevice( +AT_API void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* internals); +AT_API void CUDAStream_uncheckedSetStreamOnDevice( int64_t device, CUDAStreamInternals* internals); -void CUDAStream_setStream(CUDAStreamInternals* internals); +AT_API void CUDAStream_setStream(CUDAStreamInternals* internals); -cudaStream_t CUDAStream_stream(CUDAStreamInternals*); -int64_t CUDAStream_device(CUDAStreamInternals*); +AT_API cudaStream_t CUDAStream_stream(CUDAStreamInternals*); +AT_API int64_t CUDAStream_device(CUDAStreamInternals*); -bool CUDAStream_retain(CUDAStreamInternals*); -void CUDAStream_free(CUDAStreamInternals*&); -void CUDAStream_uncheckedFree(CUDAStreamInternals*&); +AT_API bool CUDAStream_retain(CUDAStreamInternals*); +AT_API void CUDAStream_free(CUDAStreamInternals*&); +AT_API void CUDAStream_uncheckedFree(CUDAStreamInternals*&); } // namespace detail @@ -71,10 +73,10 @@ struct CUDAStream { ~CUDAStream() { detail::CUDAStream_uncheckedFree(internals_); } // Copy constructor - CUDAStream(const CUDAStream& other); + AT_API CUDAStream(const CUDAStream& other); // Move constructor - CUDAStream(CUDAStream&& other); + AT_API CUDAStream(CUDAStream&& other); // Assignment operator CUDAStream& operator=(CUDAStream other) noexcept { diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 7a5e6e40760ec..1c06654fe891b 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -143,7 +143,7 @@ def TypedDict(name, attrs, total=True): # type: ignore """) # add a native declaration for a native function NATIVE_DECLARATION = CodeTemplate("""\ -${return_type} ${native_type_method_dispatch}(${formals_with_defaults}); +AT_API ${return_type} ${native_type_method_dispatch}(${formals_with_defaults}); """) # special method definition for factory functions in Functions.h diff --git a/aten/src/TH/THGeneral.h.in b/aten/src/TH/THGeneral.h.in index 9038dfb2b1092..2a1c33f13679d 100644 --- a/aten/src/TH/THGeneral.h.in +++ b/aten/src/TH/THGeneral.h.in @@ -35,11 +35,14 @@ #ifdef _WIN32 # if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) # define TH_API TH_EXTERNC __declspec(dllexport) +# define TH_CPP_API __declspec(dllexport) # else # define TH_API TH_EXTERNC __declspec(dllimport) +# define TH_CPP_API __declspec(dllimport) # endif #else # define TH_API TH_EXTERNC +# define TH_CPP_API #endif #ifdef _WIN32 diff --git a/aten/src/TH/THStorageClass.hpp b/aten/src/TH/THStorageClass.hpp index 99031f635b703..f9cc84915e319 100644 --- a/aten/src/TH/THStorageClass.hpp +++ b/aten/src/TH/THStorageClass.hpp @@ -37,7 +37,7 @@ struct THFinalizer { virtual ~THFinalizer() {}; }; -struct THStorage +struct TH_CPP_API THStorage { THStorage() = delete; THStorage(at::ScalarType, ptrdiff_t, at::DataPtr, at::Allocator*, char); diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 9ef48dcfbd870..b4b1d4c51e882 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -33,14 +33,14 @@ // If it is not, you must report that the storage is dead. 
// -ptrdiff_t THStorage_size(const THStorage *self); +TH_API ptrdiff_t THStorage_size(const THStorage *self); -void THStorage_setFlag(THStorage *storage, const char flag); -void THStorage_clearFlag(THStorage *storage, const char flag); -void THStorage_retain(THStorage *storage); -void THStorage_resize(THStorage *storage, ptrdiff_t size); -void THStorage_swap(THStorage *storage1, THStorage *storage2); +TH_API void THStorage_setFlag(THStorage *storage, const char flag); +TH_API void THStorage_clearFlag(THStorage *storage, const char flag); +TH_API void THStorage_retain(THStorage *storage); +TH_API void THStorage_resize(THStorage *storage, ptrdiff_t size); +TH_API void THStorage_swap(THStorage *storage1, THStorage *storage2); -void THStorage_weakRetain(THStorage *weak_storage); -void THStorage_weakFree(THStorage *weak_storage); -THStorage* THStorage_weakLock(THStorage *weak_storage); +TH_API void THStorage_weakRetain(THStorage *weak_storage); +TH_API void THStorage_weakFree(THStorage *weak_storage); +TH_API THStorage* THStorage_weakLock(THStorage *weak_storage); diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index 0962803e3821a..7912506cec107 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -149,5 +149,5 @@ inline void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) } TH_API void THTensor_free(THTensor *self); -at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, - at::IntList newshape); +TH_CPP_API at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, + at::IntList newshape); diff --git a/caffe2/utils/proto_wrap.cc b/caffe2/utils/proto_wrap.cc index c0a85af480740..3bcacd8809bb8 100644 --- a/caffe2/utils/proto_wrap.cc +++ b/caffe2/utils/proto_wrap.cc @@ -1,4 +1,5 @@ #include "caffe2/utils/proto_wrap.h" +#include "caffe2/core/common.h" #include #include @@ -8,7 +9,7 @@ namespace caffe { // Caffe wrapper functions for protobuf's GetEmptyStringAlreadyInited() function // used to avoid duplicated global variable in the case when protobuf // is built with hidden visibility. -const ::std::string& GetEmptyStringAlreadyInited() { +CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() { return ::google::protobuf::internal::GetEmptyStringAlreadyInited(); } @@ -19,7 +20,7 @@ namespace ONNX_NAMESPACE { // ONNX wrapper functions for protobuf's GetEmptyStringAlreadyInited() function // used to avoid duplicated global variable in the case when protobuf // is built with hidden visibility. -const ::std::string& GetEmptyStringAlreadyInited() { +CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() { return ::google::protobuf::internal::GetEmptyStringAlreadyInited(); } @@ -30,7 +31,7 @@ namespace caffe2 { // Caffe2 wrapper functions for protobuf's GetEmptyStringAlreadyInited() function // used to avoid duplicated global variable in the case when protobuf // is built with hidden visibility. 
-const ::std::string& GetEmptyStringAlreadyInited() { +CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() { return ::google::protobuf::internal::GetEmptyStringAlreadyInited(); } diff --git a/tools/autograd/templates/Functions.h b/tools/autograd/templates/Functions.h index 7f3e5f9c05509..ae95bf7197770 100644 --- a/tools/autograd/templates/Functions.h +++ b/tools/autograd/templates/Functions.h @@ -5,6 +5,7 @@ #include #include +#include "torch/csrc/THP_export.h" #include "torch/csrc/autograd/function.h" #include "torch/csrc/autograd/variable.h" #include "torch/csrc/autograd/saved_variable.h" diff --git a/torch/csrc/autograd/engine.h b/torch/csrc/autograd/engine.h index 734e5b9da8104..db8b3357ac253 100644 --- a/torch/csrc/autograd/engine.h +++ b/torch/csrc/autograd/engine.h @@ -3,6 +3,7 @@ // Engine implements backpropagation from output variables and their gradients // to "root" variables (variables created by the user with requires_grad=True). +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/autograd/function.h" #include "torch/csrc/autograd/input_buffer.h" #include "torch/csrc/autograd/anomaly_mode.h" @@ -24,7 +25,7 @@ struct GraphTask; namespace torch { namespace autograd { // A single instance of this struct should be created through the whole process lifetime. // The worker thread creation logic and Engine's destructor rely on this. -struct Engine { +struct TORCH_API Engine { /// Returns a reference to a static `Engine` instance. static Engine& get_default_engine(); @@ -67,6 +68,6 @@ struct Engine { // allow python_engine to override the default engine when it loads typedef Engine& (*EngineStub)(void); -void set_default_engine_stub(EngineStub stub); +TORCH_API void set_default_engine_stub(EngineStub stub); }} // namespace torch::autograd diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index b65a7063f15af..aea5a03201fd4 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -1,6 +1,7 @@ #pragma once #include "torch/csrc/assertions.h" +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/autograd/edge.h" #include "torch/csrc/autograd/grad_mode.h" #include "torch/csrc/autograd/anomaly_mode.h" @@ -84,7 +85,7 @@ void deleteFunction(Function* function); /// are created in one thread and `C` is created in a new thread, there are *no /// guarantees* w.r.t. the ordering of `C` relative to `A` or `B`. ///~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -struct Function : std::enable_shared_from_this { +struct TORCH_API Function : std::enable_shared_from_this { public: /// Construct a new `Function` with `num_inputs` inputs and the given /// `next_edges`. sequence_nr is a (currently THE) hint to prioritization diff --git a/torch/csrc/autograd/functions/basic_ops.h b/torch/csrc/autograd/functions/basic_ops.h index 7c92b42a11b05..ad7a9e52d3757 100644 --- a/torch/csrc/autograd/functions/basic_ops.h +++ b/torch/csrc/autograd/functions/basic_ops.h @@ -1,5 +1,6 @@ #pragma once +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/autograd/function.h" #include "torch/csrc/autograd/variable.h" #include "torch/csrc/autograd/symbolic.h" @@ -10,7 +11,7 @@ namespace torch { namespace autograd { -struct Error : public Function { +struct TORCH_API Error : public Function { Error(std::string msg, edge_list&& next_edges) : Function(std::move(next_edges)) , msg(std::move(msg)) {} @@ -24,7 +25,7 @@ struct Error : public Function { }; // Identity in forward, Error in backward. 
Used to implement @once_differentiable -struct DelayedError : public Function { +struct TORCH_API DelayedError : public Function { DelayedError(std::string msg, int num_inputs) : msg(std::move(msg)) { for (int i = 0; i < num_inputs; i++) @@ -36,7 +37,7 @@ struct DelayedError : public Function { std::string msg; }; -struct GraphRoot : public Function { +struct TORCH_API GraphRoot : public Function { GraphRoot(edge_list functions, variable_list inputs) : Function(std::move(functions)), outputs(std::move(inputs)) {} diff --git a/torch/csrc/autograd/functions/utils.h b/torch/csrc/autograd/functions/utils.h index a1b7ee74bf8ec..bad48b221eaf0 100644 --- a/torch/csrc/autograd/functions/utils.h +++ b/torch/csrc/autograd/functions/utils.h @@ -1,5 +1,6 @@ #pragma once +#include "torch/csrc/WindowsTorchApiMacro.h" #include #include #include @@ -18,12 +19,12 @@ using function_constructor = std::function(edge_list&& * Wraps the tensor outputs in variables and creates the grad_fn and sets the * grad_fn if necessary. */ -variable_list wrap_outputs(const variable_list& inputs, tensor_list&& outputs, - function_constructor ctr); +TORCH_API variable_list wrap_outputs(const variable_list& inputs, tensor_list&& outputs, + function_constructor ctr); /// Checks that inputs contains exactly `args` items and that the first `required_args` /// items are not nullptr. If not specified, `required_args` defaults to `args`. -void check_input_variables(const char* name, const variable_list& inputs, int args, int required_args=-1); +TORCH_API void check_input_variables(const char* name, const variable_list& inputs, int args, int required_args=-1); struct ComputeRequiresGrad : IterArgs { bool out = false; diff --git a/torch/csrc/autograd/grad_mode.h b/torch/csrc/autograd/grad_mode.h index e7d15446bee1f..ebdc86e74ca70 100644 --- a/torch/csrc/autograd/grad_mode.h +++ b/torch/csrc/autograd/grad_mode.h @@ -1,15 +1,17 @@ #pragma once +#include "torch/csrc/WindowsTorchApiMacro.h" + namespace torch { namespace autograd { -struct GradMode { +struct TORCH_API GradMode { static bool is_enabled(); static void set_enabled(bool enabled); }; // A RAII, thread local (!) guard that enables or disables grad mode upon // construction, and sets it back to the original value upon destruction. -struct AutoGradMode { +struct TORCH_API AutoGradMode { AutoGradMode(bool enabled) : prev_mode(GradMode::is_enabled()) { GradMode::set_enabled(enabled); } diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index c3abdfc3b8f2b..dd77dc193ba9b 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -15,6 +15,7 @@ #include #include #include "ATen/ATen.h" +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/cuda/cuda_check.h" #ifdef USE_CUDA #include "ATen/cuda/CUDAContext.h" @@ -163,12 +164,12 @@ enum class ProfilerState { NVTX, // only emit NVTX markers }; -RangeEventList& getEventList(); -void mark(std::string name, bool include_cuda = true); -void pushRange(std::string name); -void popRange(); +TORCH_API RangeEventList& getEventList(); +TORCH_API void mark(std::string name, bool include_cuda = true); +TORCH_API void pushRange(std::string name); +TORCH_API void popRange(); -struct RecordFunction { +struct TORCH_API RecordFunction { explicit RecordFunction(Function* fn); explicit RecordFunction(std::string name); @@ -184,8 +185,8 @@ struct RecordFunction { using thread_event_lists = std::vector>; // NOTE: changing profiler modes is **NOT THREAD SAFE**. 
You should ensure that // there no autograd functions are being executed when these function are used. -void enableProfiler(ProfilerState state); -thread_event_lists disableProfiler(); +TORCH_API void enableProfiler(ProfilerState state); +TORCH_API thread_event_lists disableProfiler(); } // namespace profiler }} // namespace torch::autograd diff --git a/torch/csrc/autograd/saved_variable.h b/torch/csrc/autograd/saved_variable.h index ff5a36ba04c03..61a1d3b3eac17 100644 --- a/torch/csrc/autograd/saved_variable.h +++ b/torch/csrc/autograd/saved_variable.h @@ -18,7 +18,7 @@ TORCH_API extern const char* ERR_BACKWARD_TWICE; /// A snapshot of a variable at a certain version. A `SavedVariable` stores /// enough information to reconstruct a variable from a certain point in time. -class SavedVariable { +class TORCH_API SavedVariable { public: SavedVariable() = default; SavedVariable(const Variable& variable, bool is_output); diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index 2def489e3ae54..6b40eea4a2561 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -77,7 +77,7 @@ struct Function; /// free function instead. To create a view variable, use `make_variable_view`. ///~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -struct Variable : public at::Tensor { +struct TORCH_API Variable : public at::Tensor { /// Default constructor. Variable() = default; diff --git a/torch/csrc/jit/autodiff.h b/torch/csrc/jit/autodiff.h index e0dd63c925ef2..6dd2be9db0e77 100644 --- a/torch/csrc/jit/autodiff.h +++ b/torch/csrc/jit/autodiff.h @@ -1,5 +1,6 @@ #pragma once +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/jit/ir.h" #include @@ -84,11 +85,11 @@ struct Gradient { }; // XXX: When calling this function, graph should have complete type information. // Use the shape analysis pass to fill in the gaps if it doesn't. -Gradient differentiate(std::shared_ptr& graph, const std::vector& requires_grad); +TORCH_API Gradient differentiate(std::shared_ptr& graph, const std::vector& requires_grad); // can we take a derivative of this node symbolically? -bool isDifferentiable(Node * n); -bool isDifferentiable(Graph & g); -bool isZero(Value * v); +TORCH_API bool isDifferentiable(Node * n); +TORCH_API bool isDifferentiable(Graph & g); +TORCH_API bool isZero(Value * v); }} diff --git a/torch/csrc/jit/export.h b/torch/csrc/jit/export.h index 95758dc9aa392..d0c6212a324a8 100644 --- a/torch/csrc/jit/export.h +++ b/torch/csrc/jit/export.h @@ -15,7 +15,7 @@ namespace torch { namespace jit { // file contents being the raw tensor data. 
using RawDataExportMap = std::unordered_map; -std::tuple ExportGraph( +TORCH_API std::tuple ExportGraph( const std::shared_ptr& graph, const std::vector& initializers, int64_t onnx_opset_version, @@ -24,7 +24,7 @@ std::tuple ExportGraph( = ::torch::onnx::OperatorExportTypes::ONNX); // For testing purposes -std::string PrettyPrintExportedGraph( +TORCH_API std::string PrettyPrintExportedGraph( const std::shared_ptr& graph, const std::vector & initializers, int64_t onnx_opset_version, diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h index affcd38a065c9..d78076ab6484f 100644 --- a/torch/csrc/jit/graph_executor.h +++ b/torch/csrc/jit/graph_executor.h @@ -33,7 +33,7 @@ struct GraphExecutorState { }; struct GraphExecutorImpl; -struct GraphExecutor { +struct TORCH_API GraphExecutor { GraphExecutor() {} GraphExecutor(std::shared_ptr graph, bool optimize = true); // note: if not specified, symbolically_differentiable is computed from the graph. @@ -51,17 +51,17 @@ struct GraphExecutor { // These passes need to run before it is valid to pass to the interpreter // regardless of whether sizes have been specialized or not. -void runRequiredPasses(const std::shared_ptr& g); +TORCH_API void runRequiredPasses(const std::shared_ptr& g); // specialize 'graph' to the types, sizes, and other properties described in spec // this prepares the graph for execution, including running runRequiredPasses, // but the execution only remains valid for tensors whose properties match spec // otherwise running the graph will have undefined results. -void specializeToSpec(const std::shared_ptr& graph, const ArgumentSpec& spec); +TORCH_API void specializeToSpec(const std::shared_ptr& graph, const ArgumentSpec& spec); // apply standard optimizations. if graphMustSupportVariables=false then // then the passes are allowed to modify the graph in ways that make it no longer // work with tensors that have requires_grad=True -void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariables); +TORCH_API void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariables); }} diff --git a/torch/csrc/jit/import.h b/torch/csrc/jit/import.h index fa1878d827b53..d593896f2c792 100644 --- a/torch/csrc/jit/import.h +++ b/torch/csrc/jit/import.h @@ -4,6 +4,6 @@ namespace torch { namespace jit { -std::shared_ptr ImportIRGraph(const std::string& serialized_graph, std::vector & initializers); +TORCH_API std::shared_ptr ImportIRGraph(const std::string& serialized_graph, std::vector & initializers); }} diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index b61a49be846dc..a0d6a7a7fef50 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -5,6 +5,7 @@ #include #include +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/jit/generated/aten_interned_strings.h" namespace torch { namespace jit { @@ -132,7 +133,7 @@ static const std::string domain_prefix = "org.pytorch."; // A Symbol is like an interned string, but with a little extra // structure; it is namespaced via SymbolNamespace and the resulting // intern pointers support efficient namespace testing. 
-struct Symbol { +struct TORCH_API Symbol { explicit constexpr Symbol() : value(0) {}; explicit constexpr Symbol(unique_t uniq) : value(uniq) {} diff --git a/torch/csrc/jit/interpreter.h b/torch/csrc/jit/interpreter.h index b9085528fbb44..28b3404cf463f 100644 --- a/torch/csrc/jit/interpreter.h +++ b/torch/csrc/jit/interpreter.h @@ -3,6 +3,8 @@ #include #include "ATen/optional.h" +#include "torch/csrc/WindowsTorchApiMacro.h" + namespace at { struct Tensor; } @@ -22,7 +24,7 @@ struct TensorType; struct IValue; using Stack = std::vector; -struct Code { +struct TORCH_API Code { Code() : pImpl(nullptr) {} Code(std::shared_ptr& graph); diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 153828497d1bf..a2f71f702bc9b 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -16,6 +16,7 @@ #include "torch/csrc/utils/python_stub.h" #include "torch/csrc/assertions.h" +#include "torch/csrc/WindowsTorchApiMacro.h" #include #include "ATen/ArrayRef.h" @@ -51,9 +52,9 @@ struct Node; // Tensor or an opaque Handle object, as determined by type(). struct Value; -std::ostream& operator<<(std::ostream & out, const Graph & g); -std::ostream& operator<<(std::ostream & out, const Type & t); -std::ostream& operator<<(std::ostream & out, const Node & t); +TORCH_API std::ostream& operator<<(std::ostream & out, const Graph & g); +TORCH_API std::ostream& operator<<(std::ostream & out, const Type & t); +TORCH_API std::ostream& operator<<(std::ostream & out, const Node & t); // A list of nodes, with inputs and outputs struct Block; @@ -195,7 +196,7 @@ struct Value { bool hasUniqueName() const { return unique_name_ != ""; } - Value* setUniqueName(const std::string & name); + TORCH_API Value* setUniqueName(const std::string & name); std::string uniqueName() const { if (hasUniqueName()) return unique_name_; @@ -813,7 +814,7 @@ struct Block { // to the inputs, nodes, and outputs of this block // value_map is used whenever a node in src references a free variable // in src to look up its corresponding value - void cloneFrom(Block * src, std::function value_map); + TORCH_API void cloneFrom(Block * src, std::function value_map); private: // should only be called in the constructor Node* initOutput(Node* p) { @@ -1069,9 +1070,9 @@ friend struct Block; } // Checks well-formedness and invariants of graph - void lint() const; + TORCH_API void lint() const; // for use in debugger - void dump() const; + TORCH_API void dump() const; ~Graph() { for (const Node * n : all_nodes) @@ -1089,7 +1090,7 @@ friend struct Block; } friend std::ostream& operator<<(std::ostream & out, const Graph & g); - std::shared_ptr copy(); + TORCH_API std::shared_ptr copy(); private: @@ -1338,8 +1339,8 @@ struct PythonOp : public Node { }; // patched in when python bindings are loaded -PythonOp* allocPythonOp(Graph* g); -void setAllocPythonOp(PythonOp* (*v)(Graph* g)); +TORCH_API PythonOp* allocPythonOp(Graph* g); +TORCH_API void setAllocPythonOp(PythonOp* (*v)(Graph* g)); inline Node* Graph::createPythonOp( THPObjectPtr&& pyobj, @@ -1365,6 +1366,6 @@ inline const_graph_node_list_iterator Node::reverseIterator() const { return iterator().reverse(); } -void LintGraph(std::shared_ptr& graph); +TORCH_API void LintGraph(std::shared_ptr& graph); }} // namespace torch::jit diff --git a/torch/csrc/jit/operator.h b/torch/csrc/jit/operator.h index 50ec147090be3..74f0ec95d8bce 100644 --- a/torch/csrc/jit/operator.h +++ b/torch/csrc/jit/operator.h @@ -13,7 +13,7 @@ FunctionSchema parseSchema(const std::string& decl); using OperationCreator = std::function; 
-struct Operator {
+struct TORCH_API Operator {
   Operator(FunctionSchema schema, OperationCreator op, OperationCreator op_const_attributes = nullptr)
   : schema(std::move(schema))
   , op(std::move(op))
@@ -65,7 +65,7 @@ void registerOperator(Operator&& op);
 // XXX: this function is meant to be used with string literals only!
 Operator& sig(const char *signature_literal);
 
-struct RegisterOperators {
+struct TORCH_API RegisterOperators {
   RegisterOperators(std::vector<Operator> operators) {
     for(Operator& o : operators) {
       registerOperator(std::move(o));
diff --git a/torch/csrc/jit/passes/batch_mm.h b/torch/csrc/jit/passes/batch_mm.h
index efb635d2bbc93..06bb34a2c1eae 100644
--- a/torch/csrc/jit/passes/batch_mm.h
+++ b/torch/csrc/jit/passes/batch_mm.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-void BatchMM(std::shared_ptr<Graph>& graph);
+TORCH_API void BatchMM(std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/canonicalize.h b/torch/csrc/jit/passes/canonicalize.h
index 09a98832bda74..1430432d3317b 100644
--- a/torch/csrc/jit/passes/canonicalize.h
+++ b/torch/csrc/jit/passes/canonicalize.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-std::shared_ptr<Graph> Canonicalize(const std::shared_ptr<Graph>& graph);
+TORCH_API std::shared_ptr<Graph> Canonicalize(const std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/common_subexpression_elimination.h b/torch/csrc/jit/passes/common_subexpression_elimination.h
index 483c573d3d157..64ae4f6bd9ca8 100644
--- a/torch/csrc/jit/passes/common_subexpression_elimination.h
+++ b/torch/csrc/jit/passes/common_subexpression_elimination.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-void EliminateCommonSubexpression(std::shared_ptr<Graph>& graph);
+TORCH_API void EliminateCommonSubexpression(std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/create_autodiff_subgraphs.h b/torch/csrc/jit/passes/create_autodiff_subgraphs.h
index 758225769e8b5..b76ee82e52970 100644
--- a/torch/csrc/jit/passes/create_autodiff_subgraphs.h
+++ b/torch/csrc/jit/passes/create_autodiff_subgraphs.h
@@ -8,6 +8,6 @@ struct Graph;
 // insert GraphExecutor nodes that group together
 // subgraphs that are differentiable by the jit's autodiff passes
 // threshold - minimum number of nodes that will appear in a block
-void CreateAutodiffSubgraphs(Graph & graph, size_t threshold = 2);
+TORCH_API void CreateAutodiffSubgraphs(Graph & graph, size_t threshold = 2);
 
 }}
diff --git a/torch/csrc/jit/passes/dead_code_elimination.h b/torch/csrc/jit/passes/dead_code_elimination.h
index 51c92ab07200d..9ae89f9b56150 100644
--- a/torch/csrc/jit/passes/dead_code_elimination.h
+++ b/torch/csrc/jit/passes/dead_code_elimination.h
@@ -4,7 +4,7 @@ namespace torch { namespace jit {
 
-void EliminateDeadCode(const std::shared_ptr<Graph>& graph);
-void EliminateDeadCode(Block *block, bool recurse=true);
+TORCH_API void EliminateDeadCode(const std::shared_ptr<Graph>& graph);
+TORCH_API void EliminateDeadCode(Block *block, bool recurse=true);
 
 }}
diff --git a/torch/csrc/jit/passes/decompose_addmm.cpp b/torch/csrc/jit/passes/decompose_addmm.cpp
index 85ad0770df77b..1a0dd94f9960d 100644
--- a/torch/csrc/jit/passes/decompose_addmm.cpp
+++ b/torch/csrc/jit/passes/decompose_addmm.cpp
@@ -1,4 +1,5 @@
 #include "torch/csrc/jit/passes/dead_code_elimination.h"
+#include "torch/csrc/jit/passes/decompose_addmm.h"
 #include "torch/csrc/jit/symbolic_variable.h"
 #include "torch/csrc/jit/tensor_conversions.h"
diff --git a/torch/csrc/jit/passes/decompose_addmm.h b/torch/csrc/jit/passes/decompose_addmm.h
index 500d4a2e69f57..1153dcac04196 100644
--- a/torch/csrc/jit/passes/decompose_addmm.h
+++ b/torch/csrc/jit/passes/decompose_addmm.h
@@ -9,6 +9,6 @@ namespace torch { namespace jit {
 //
 // In the future, if we need more passes like this, we should convert this
 // into a generic canonicalization pass.
-void DecomposeAddmm(const std::shared_ptr<Graph>& graph);
+TORCH_API void DecomposeAddmm(const std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/erase_number_types.h b/torch/csrc/jit/passes/erase_number_types.h
index a736346dfffce..5ec43ce575b86 100644
--- a/torch/csrc/jit/passes/erase_number_types.h
+++ b/torch/csrc/jit/passes/erase_number_types.h
@@ -15,6 +15,6 @@ namespace torch { namespace jit {
 // - prim::TensorToNum, and prim::NumToTensor nodes are erased.
 //
 // The pass assumes that DCE will be called sometime after.
-void EraseNumberTypes(const std::shared_ptr<Graph>& graph);
+TORCH_API void EraseNumberTypes(const std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/graph_fuser.h b/torch/csrc/jit/passes/graph_fuser.h
index 17e730ea67566..1c1bbfe292ff5 100644
--- a/torch/csrc/jit/passes/graph_fuser.h
+++ b/torch/csrc/jit/passes/graph_fuser.h
@@ -6,6 +6,6 @@ namespace torch { namespace jit {
 // NB: Be sure to run DCE before fusion, because dead instructions
 // can prevent fusion opportunities from being exploited.
-void FuseGraph(std::shared_ptr<Graph>& graph);
+TORCH_API void FuseGraph(std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/inplace_check.h b/torch/csrc/jit/passes/inplace_check.h
index cae7805188b7b..3ad296b646898 100644
--- a/torch/csrc/jit/passes/inplace_check.h
+++ b/torch/csrc/jit/passes/inplace_check.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-void CheckInplace(std::shared_ptr<Graph>& graph);
+TORCH_API void CheckInplace(std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/loop_unrolling.h b/torch/csrc/jit/passes/loop_unrolling.h
index 4ca1fd761f945..d95a179524743 100644
--- a/torch/csrc/jit/passes/loop_unrolling.h
+++ b/torch/csrc/jit/passes/loop_unrolling.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-void UnrollLoops(std::shared_ptr<Graph>& graph);
+TORCH_API void UnrollLoops(std::shared_ptr<Graph>& graph);
 
 }} // namespace torch::jit
diff --git a/torch/csrc/jit/passes/lower_grad_of.h b/torch/csrc/jit/passes/lower_grad_of.h
index 104adcb047130..a0a881e3002ed 100644
--- a/torch/csrc/jit/passes/lower_grad_of.h
+++ b/torch/csrc/jit/passes/lower_grad_of.h
@@ -10,6 +10,6 @@ namespace torch { namespace jit {
 //   outputs = <original_computation>
 // else:
 //   outputs = undefineds
-void LowerGradOf(Graph& graph);
+TORCH_API void LowerGradOf(Graph& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/lower_tuples.h b/torch/csrc/jit/passes/lower_tuples.h
index 6dda8c2028bc9..d4eed60c1c145 100644
--- a/torch/csrc/jit/passes/lower_tuples.h
+++ b/torch/csrc/jit/passes/lower_tuples.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-void LowerTuples(std::shared_ptr<Graph>& graph);
+TORCH_API void LowerTuples(std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/onnx.h b/torch/csrc/jit/passes/onnx.h
index a58d421a458d2..a02be6906542c 100644
--- a/torch/csrc/jit/passes/onnx.h
+++ b/torch/csrc/jit/passes/onnx.h
@@ -5,7 +5,7 @@ namespace torch { namespace jit {
 
-std::shared_ptr<Graph> ToONNX(std::shared_ptr<Graph>& state, ::torch::onnx::OperatorExportTypes operator_export_type);
-void BlockToONNX(Block* old_block, Block* new_block, ::torch::onnx::OperatorExportTypes operator_export_type, std::unordered_map<Value*, Value*> env);
+TORCH_API std::shared_ptr<Graph> ToONNX(std::shared_ptr<Graph>& state, ::torch::onnx::OperatorExportTypes operator_export_type);
+TORCH_API void BlockToONNX(Block* old_block, Block* new_block, ::torch::onnx::OperatorExportTypes operator_export_type, std::unordered_map<Value*, Value*> env);
 
 }}
diff --git a/torch/csrc/jit/passes/peephole.h b/torch/csrc/jit/passes/peephole.h
index 2ab80a221dc09..c2a6af708fcc5 100644
--- a/torch/csrc/jit/passes/peephole.h
+++ b/torch/csrc/jit/passes/peephole.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-void PeepholeOptimize(std::shared_ptr<Graph>& graph);
+TORCH_API void PeepholeOptimize(std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/remove_expands.cpp b/torch/csrc/jit/passes/remove_expands.cpp
index 8c679db8c2e52..f0f591cac59ec 100644
--- a/torch/csrc/jit/passes/remove_expands.cpp
+++ b/torch/csrc/jit/passes/remove_expands.cpp
@@ -1,3 +1,4 @@
+#include "torch/csrc/jit/passes/remove_expands.h"
 #include "torch/csrc/jit/passes/dead_code_elimination.h"
 
 namespace torch { namespace jit {
diff --git a/torch/csrc/jit/passes/remove_expands.h b/torch/csrc/jit/passes/remove_expands.h
index 06885609f684f..ebcd4634c2f46 100644
--- a/torch/csrc/jit/passes/remove_expands.h
+++ b/torch/csrc/jit/passes/remove_expands.h
@@ -4,6 +4,6 @@ namespace torch { namespace jit {
 
-void RemoveExpands(const std::shared_ptr<Graph>& graph);
+TORCH_API void RemoveExpands(const std::shared_ptr<Graph>& graph);
 
 }}
diff --git a/torch/csrc/jit/passes/shape_analysis.h b/torch/csrc/jit/passes/shape_analysis.h
index 4a36406758872..1b38cbbe5739a 100644
--- a/torch/csrc/jit/passes/shape_analysis.h
+++ b/torch/csrc/jit/passes/shape_analysis.h
@@ -1,8 +1,10 @@
 #pragma once
 
+#include "torch/csrc/WindowsTorchApiMacro.h"
+
 namespace torch { namespace jit {
 
 struct Graph;
 struct ArgumentSpec;
-void PropagateInputShapes(Graph & graph, const ArgumentSpec & spec);
+TORCH_API void PropagateInputShapes(Graph & graph, const ArgumentSpec & spec);
 
 }}
diff --git a/torch/csrc/jit/passes/specialize_undef.h b/torch/csrc/jit/passes/specialize_undef.h
index c468ceb7e00a9..54b68683eaf17 100644
--- a/torch/csrc/jit/passes/specialize_undef.h
+++ b/torch/csrc/jit/passes/specialize_undef.h
@@ -11,6 +11,6 @@ namespace torch { namespace jit {
 // operations generated by the symbolic autodiff code and cleans up
 // AutogradAdds when possible. Outputs of other nodes are conservatively
 // marked Unknown and not optimized.
-void specializeUndef(Graph & g, const std::vector<bool>& defined);
+TORCH_API void specializeUndef(Graph & g, const std::vector<bool>& defined);
 
 }}
diff --git a/torch/csrc/jit/passes/to_batch.h b/torch/csrc/jit/passes/to_batch.h
index 1d3113cd8cddc..23c23a0632b31 100644
--- a/torch/csrc/jit/passes/to_batch.h
+++ b/torch/csrc/jit/passes/to_batch.h
@@ -11,9 +11,9 @@ class ToBatch {
   std::unordered_map<Value*, std::vector<Value*>> batch_map;
 public:
   static std::unordered_map<std::string, std::vector<std::shared_ptr<Graph>>> batch_operator_table;
-  void toBatch(Block* block, Block* res_block);
+  TORCH_API void toBatch(Block* block, Block* res_block);
 };
 
-std::shared_ptr<Graph> to_batch_graph(std::shared_ptr<Graph>& graph);
-void initRegisterBatchOpsBindings(PyObject* module);
+TORCH_API std::shared_ptr<Graph> to_batch_graph(std::shared_ptr<Graph>& graph);
+TORCH_API void initRegisterBatchOpsBindings(PyObject* module);
 }}
diff --git a/torch/csrc/jit/script/compiler.h b/torch/csrc/jit/script/compiler.h
index f5b80e1264524..e7f115740c0a2 100644
--- a/torch/csrc/jit/script/compiler.h
+++ b/torch/csrc/jit/script/compiler.h
@@ -83,7 +83,7 @@ struct SugaredValue : public std::enable_shared_from_this<SugaredValue> {
 // most things in the environment are just simple value types
 // and not special python syntax sugar types
-struct SimpleValue : public SugaredValue {
+struct TORCH_API SimpleValue : public SugaredValue {
   SimpleValue(Value * value)
   : value(value) {}
   virtual std::string kind() const override {
@@ -101,7 +101,7 @@ struct SimpleValue : public SugaredValue {
   Value* value;
 };
 
-struct BuiltinFunction : public SugaredValue {
+struct TORCH_API BuiltinFunction : public SugaredValue {
   BuiltinFunction(const std::string& name, at::optional<NamedValue> value)
   : name(name), value(std::move(value)) {}
   std::string name;
@@ -121,7 +121,7 @@ struct BuiltinFunction : public SugaredValue {
 };
 
 using Resolver = std::function<std::shared_ptr<SugaredValue>(const std::string& name)>;
-void defineMethodsInModule(
+TORCH_API void defineMethodsInModule(
   Module & m,
   const std::vector<Def>& definitions,
   const std::vector<Resolver>& resolvers, /* determines how we handle free variables in each definition*/
@@ -129,20 +129,20 @@ void defineMethodsInModule(
 );
 
 // same as above but parse the definitions from source
-void defineMethodsInModule(Module & m, const std::string& source, const Resolver& resolver, std::shared_ptr<SugaredValue> self);
-std::shared_ptr<Graph> compileFunction(Def def, const Resolver& resolver);
+TORCH_API void defineMethodsInModule(Module & m, const std::string& source, const Resolver& resolver, std::shared_ptr<SugaredValue> self);
+TORCH_API std::shared_ptr<Graph> compileFunction(Def def, const Resolver& resolver);
 
 // pack outputs of a function following python rules. If there is a single value return
 // a SimpleValue, otherwise pack all the values into a Tuple.
-std::shared_ptr<SugaredValue> packOutputs(Graph& g, at::ArrayRef<Value*> values);
-std::vector<Value*> inlineCallTo(Graph& g, Graph& callee, ArrayRef<Value*> inputs);
-void ensureSizeMatches(SourceRange loc, size_t expected, size_t actual, const std::string& what);
-void ensureTensors(const SourceRange& range, at::ArrayRef<Value*> values);
+TORCH_API std::shared_ptr<SugaredValue> packOutputs(Graph& g, at::ArrayRef<Value*> values);
+TORCH_API std::vector<Value*> inlineCallTo(Graph& g, Graph& callee, ArrayRef<Value*> inputs);
+TORCH_API void ensureSizeMatches(SourceRange loc, size_t expected, size_t actual, const std::string& what);
+TORCH_API void ensureTensors(const SourceRange& range, at::ArrayRef<Value*> values);
 
 // try to match a list if inputs and keyword 'attributes' to this schema,
 // if it works return the flat list of positional inputs to the call
 // if it returns nullopt, then failure_messages contains a good error report
-at::optional<std::vector<Value*>> tryMatchSchema(
+TORCH_API at::optional<std::vector<Value*>> tryMatchSchema(
   const FunctionSchema& schema,
   const SourceRange& loc,
   Graph& graph,
diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp
index 7b784f093330b..9aa31e0b98bbf 100644
--- a/torch/csrc/jit/test_jit.cpp
+++ b/torch/csrc/jit/test_jit.cpp
@@ -953,7 +953,7 @@ void testProto() {
   proto.set_producer_name("foo");
 }
 
-std::string runJITCPPTests() {
+TORCH_API std::string runJITCPPTests() {
   std::stringstream out;
   testIValue();
   testControlFlow();
diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h
index 2b8f32e8034f9..7198dd1aba150 100644
--- a/torch/csrc/jit/tracer.h
+++ b/torch/csrc/jit/tracer.h
@@ -2,6 +2,7 @@
 #include "torch/csrc/jit/ir.h"
 #include "torch/csrc/assertions.h"
+#include "torch/csrc/WindowsTorchApiMacro.h"
 #include "torch/csrc/utils/functional.h"
 #include "torch/csrc/utils/variadic.h"
 #include "torch/csrc/autograd/function_hook.h"
@@ -19,7 +20,7 @@ namespace torch { namespace jit { namespace tracer {
 using torch::autograd::Variable;
 using variable_list = std::vector<Variable>;
 
-struct TracingState : public std::enable_shared_from_this<TracingState> {
+struct TORCH_API TracingState : public std::enable_shared_from_this<TracingState> {
   TracingState();
   ~TracingState();
@@ -59,10 +60,10 @@ struct ArgumentStash {
     return stash.intlists.empty();
   }
 
-  static void stashIntListElem(const std::string& arg_name,
-                               size_t size,
-                               size_t idx,
-                               const Variable& var);
+  TORCH_API static void stashIntListElem(const std::string& arg_name,
+                                         size_t size,
+                                         size_t idx,
+                                         const Variable& var);
 
   static bool hasIntList(const std::string& arg_name) {
     return stash.intlists.count(arg_name) > 0;
@@ -80,8 +81,8 @@ struct ArgumentStash {
 };
 
 // Retrieve or set the current tracing state. Returns a nullptr if tracing is disabled.
-const std::shared_ptr<TracingState>& getTracingState();
-void setTracingState(std::shared_ptr<TracingState> state);
+TORCH_API const std::shared_ptr<TracingState>& getTracingState();
+TORCH_API void setTracingState(std::shared_ptr<TracingState> state);
 
 inline bool isTracing() {
   return static_cast<bool>(getTracingState());
@@ -191,11 +192,11 @@ struct PreTraceInfo {
   Node *n;
 };
 
-PreTraceInfo preRecordTrace(Symbol op, at::ArrayRef<Variable> inputs);
-void postRecordTrace(const PreTraceInfo& info, at::ArrayRef<Variable> outputs);
+TORCH_API PreTraceInfo preRecordTrace(Symbol op, at::ArrayRef<Variable> inputs);
+TORCH_API void postRecordTrace(const PreTraceInfo& info, at::ArrayRef<Variable> outputs);
 
-void recordSourceLocation(Node* n);
-void setRecordSourceLocation(void (*v)(Node*));
+TORCH_API void recordSourceLocation(Node* n);
+TORCH_API void setRecordSourceLocation(void (*v)(Node*));
 
 // We must record the nodes of inputs before we actually carry out
 // the operation, because an inplace operation may destroy the information
@@ -221,6 +222,6 @@ PreTraceInfo makePreTraceInfo(at::ArrayRef<Variable> inputs, F ctor) {
   return info;
 }
 
-autograd::Variable getSizeOf(const autograd::Variable& var, int64_t dim);
+TORCH_API autograd::Variable getSizeOf(const autograd::Variable& var, int64_t dim);
 
 }}} // namespace torch::jit::tracer
diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h
index 18949f28dcddc..fe06ec7833a11 100644
--- a/torch/csrc/jit/type.h
+++ b/torch/csrc/jit/type.h
@@ -2,6 +2,7 @@
 #include "torch/csrc/jit/interned_strings.h"
 #include "torch/csrc/assertions.h"
+#include "torch/csrc/WindowsTorchApiMacro.h"
 
 #include <memory>
@@ -29,7 +30,7 @@ struct Type;
 using TypePtr = std::shared_ptr<Type>;
 
-struct Type : std::enable_shared_from_this<Type> {
+struct TORCH_API Type : std::enable_shared_from_this<Type> {
 private:
   TypeKind kind_;
@@ -89,7 +90,7 @@ inline bool operator!=(const Type & lhs, const Type & rhs) {
 }
 
 // This node represents a single Tensor value, with an unknown shape.
-struct DynamicType : public Type {
+struct TORCH_API DynamicType : public Type {
   DynamicType() : Type(TypeKind::DynamicType) {}
 
   bool operator==(const Type& rhs) const override {
@@ -106,7 +107,7 @@ struct DynamicType : public Type {
 struct TensorType;
 using TensorTypePtr = std::shared_ptr<TensorType>;
 // This node represents a single Tensor value with a specific size
-struct TensorType : public Type {
+struct TORCH_API TensorType : public Type {
   friend struct Type;
   TensorType(const at::Tensor& tensor)
     : Type(TypeKind::TensorType)
@@ -185,7 +186,7 @@ struct TensorType : public Type {
   std::vector<int64_t> strides_;
 };
 
-struct ListType : public Type {
+struct TORCH_API ListType : public Type {
   friend struct Type;
   static const TypeKind Kind = TypeKind::ListType;
   ListType(TypePtr elem)
@@ -211,7 +212,7 @@ struct ListType : public Type {
   TypePtr elem;
 };
 
-struct TupleType : public Type {
+struct TORCH_API TupleType : public Type {
   friend struct Type;
   TupleType(std::vector<TypePtr> elements_)
   : Type(TypeKind::TupleType)
@@ -268,7 +269,7 @@ struct TupleType : public Type {
 };
 
 // This node represents a Python number value
-struct NumberType : public Type {
+struct TORCH_API NumberType : public Type {
   NumberType() : Type(TypeKind::NumberType) {}
 
   bool operator==(const Type& rhs) const override {
@@ -283,7 +284,7 @@ struct NumberType : public Type {
 };
 
 // This node represents a Python float number value
-struct FloatType : public Type {
+struct TORCH_API FloatType : public Type {
  FloatType() : Type(TypeKind::FloatType) {}
 
   bool operator==(const Type& rhs) const override {
@@ -301,7 +302,7 @@ struct FloatType : public Type {
 };
 
 // This node represents a Python int number value
-struct IntType : public Type {
+struct TORCH_API IntType : public Type {
   IntType() : Type(TypeKind::IntType) {}
 
   bool operator==(const Type& rhs) const override {
@@ -319,6 +320,6 @@ struct IntType : public Type {
 };
 
-std::ostream& operator<<(std::ostream & out, const Type & t);
+TORCH_API std::ostream& operator<<(std::ostream & out, const Type & t);
 
 }} // namespace torch::jit
diff --git a/torch/csrc/onnx/onnx.h b/torch/csrc/onnx/onnx.h
index 0487e5f7fe077..760e86b8280bb 100644
--- a/torch/csrc/onnx/onnx.h
+++ b/torch/csrc/onnx/onnx.h
@@ -2,6 +2,7 @@
 #include "torch/csrc/onnx/onnx.npb.h"
 #include "torch/csrc/assertions.h"
+#include "torch/csrc/WindowsTorchApiMacro.h"
 #include
 #include
@@ -417,7 +418,7 @@ class ModelProto : public MicroProto<onnx_ModelProto> {
     opset_import.emplace_back(ptr);
     return ptr;
   }
-  void dump(std::ostream& stream, size_t indent = 0);
+  TORCH_API void dump(std::ostream& stream, size_t indent = 0);
   std::string prettyPrint() {
     std::stringstream ss;
     dump(ss, 0);
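
Note on the annotations above: `TORCH_API`, pulled in via `torch/csrc/WindowsTorchApiMacro.h`, marks symbols that must cross the shared-library boundary, which matters on Windows where DLL symbols are not exported by default. The snippet below is only a minimal sketch of how such an export macro is commonly defined, not the actual contents of that header; the `torch_EXPORTS` guard name is an assumption.

```
// Hedged sketch of a TORCH_API-style export macro (assumed, not the real header).
// torch_EXPORTS is an assumed compile definition set only while building libtorch.
#ifdef _WIN32
#  ifdef torch_EXPORTS
#    define TORCH_API __declspec(dllexport)  // building the library: export the symbol
#  else
#    define TORCH_API __declspec(dllimport)  // consuming the library: import the symbol
#  endif
#else
#  define TORCH_API  // non-Windows builds export symbols by default, so expand to nothing
#endif

// Usage matches the declarations in this patch, e.g.:
// TORCH_API void FuseGraph(std::shared_ptr<Graph>& graph);
```

Declarations and their definitions must agree on the macro, which is why the patch annotates headers (and the definition of runJITCPPTests) rather than individual call sites.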