forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
224 changed files
with
3,510 additions
and
2,200 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
#include <ATen/optional.h> | ||
#include <ATen/Backtrace.h> | ||
|
||
#include <functional> | ||
#include <memory> | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,110 +0,0 @@ | ||
#pragma once | ||
|
||
#include <ATen/ArrayRef.h> | ||
#include <ATen/CUDAStream.h> | ||
#include <ATen/Context.h> | ||
#include <ATen/DeviceGuard.h> | ||
|
||
#include <cstddef> | ||
#include <vector> | ||
|
||
namespace at { | ||
|
||
/// A variant of `DeviceGuard` that augments it with an understanding of CUDA | ||
/// streams. This guard can not only set and reset the current CUDA device, but | ||
/// also set and reset the current CUDA stream. It is important to note that | ||
/// because a CUDA stream is intrinsically associated with the CUDA device to | ||
/// which it is bound, setting the CUDA stream *also* sets the current CUDA | ||
/// device to that of the stream. | ||
struct CUDAGuard { | ||
/// Default constructor, does nothing and causes no change in the current | ||
/// stream or device until `set_stream` or `set_device` is called. | ||
CUDAGuard() = default; | ||
|
||
/// Sets the CUDA stream and its associated device as the current one (calls | ||
/// `set_stream`). | ||
explicit CUDAGuard(const CUDAStream& stream) { | ||
set_stream(stream); | ||
} | ||
|
||
/// Calls `set_device` with the given index. | ||
explicit CUDAGuard(int32_t device) { | ||
set_device(device); | ||
} | ||
|
||
CUDAGuard(const CUDAGuard&) = delete; | ||
CUDAGuard& operator=(const CUDAGuard&) = delete; | ||
|
||
/// Move-constructs this `CUDAGuard` from another `CUDAGuard`. The | ||
/// moved-from `CUDAGuard` is modified such that its destruction has no | ||
/// effect (does not reset the stream or device). | ||
CUDAGuard(CUDAGuard&& other) noexcept = default; | ||
|
||
/// Move-assigns this `CUDAGuard` from another `CUDAGuard`. The | ||
/// moved-from `CUDAGuard` is modified such that its destruction has no | ||
/// effect (does not reset the stream or device). | ||
CUDAGuard& operator=(CUDAGuard&& other) { | ||
device_guard_ = std::move(other.device_guard_); | ||
original_streams_ = std::move(other.original_streams_); | ||
other.original_streams_.clear(); | ||
return *this; | ||
} | ||
|
||
/// Resets the CUDA stream on each device to the one that was active upon | ||
/// construction. | ||
~CUDAGuard() { | ||
if (!original_streams_.empty()) { | ||
for (size_t device = 0; device < original_streams_.size(); ++device) { | ||
globalContext().uncheckedSetCurrentCUDAStreamOnDevice( | ||
device, original_streams_[device]); | ||
} | ||
} | ||
} | ||
|
||
/// Sets the current CUDA device to the device associated with the given | ||
/// stream, and then sets the current stream on that device to the one given. | ||
void set_stream(const CUDAStream& stream) { | ||
device_guard_.set_index(stream.device()); | ||
// If we haven't stored the current stream yet, store it now. | ||
if (original_streams_.empty()) { | ||
const size_t device_count = globalContext().getNumGPUs(); | ||
original_streams_.reserve(device_count); | ||
for (size_t device = 0; device < device_count; ++device) { | ||
original_streams_.push_back( | ||
globalContext().getCurrentCUDAStreamOnDevice(device)); | ||
} | ||
} | ||
globalContext().setCurrentCUDAStreamOnDevice( | ||
device_guard_.last_index(), stream); | ||
} | ||
|
||
/// Sets the CUDA device to the given one. | ||
void set_device(int32_t device) { | ||
device_guard_.set_index(device); | ||
} | ||
|
||
/// Returns the CUDA streams that were active in the first call to | ||
/// `set_stream`. If there was no such call, the returned container is | ||
/// empty. | ||
ArrayRef<CUDAStream> original_streams() const noexcept { | ||
return original_streams_; | ||
} | ||
|
||
/// Returns the device that was set upon construction of the guard. | ||
int32_t original_device() const noexcept { | ||
return device_guard_.original_index(); | ||
} | ||
|
||
/// Returns the last device that was set via `set_device`, if any. | ||
int32_t last_device() const noexcept { | ||
return device_guard_.last_index(); | ||
} | ||
|
||
private: | ||
/// The guard for the current device. | ||
DeviceGuard device_guard_; | ||
/// The original streams that were active on all devices. | ||
std::vector<CUDAStream> original_streams_; | ||
}; | ||
|
||
} // namespace at | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,197 +0,0 @@ | ||
#include "ATen/CUDAStream.h" | ||
#include "ATen/Error.h" | ||
#include "ATen/detail/CUDAHooksInterface.h" | ||
|
||
#include <mutex> | ||
|
||
// Private bookkeeping record behind every stream handle. The layout is an
// implementation detail of this translation unit.
struct CUDAStreamInternals {
  // False for the per-device default stream entries, true for streams made
  // by CUDAStream_createAndRetainWithOptions. Only destructible entries are
  // reference counted and released.
  bool is_destructible;
  // Number of outstanding references (meaningful for destructible entries).
  std::atomic<int> refcount;
  // Index of the device this stream belongs to.
  // Note: cudaGetDevice works with int32_t, not int64_t
  int64_t device;
  // The underlying CUDA stream handle.
  cudaStream_t stream;
};
|
||
namespace at { | ||
|
||
namespace detail { | ||
|
||
/* | ||
* Stream state | ||
*/ | ||
static constexpr cudaStream_t DEFAULT_STREAM = 0; | ||
|
||
static std::once_flag init_flag; | ||
static int64_t num_gpus; | ||
static CUDAStreamInternals* default_streams; | ||
static thread_local CUDAStreamInternals** current_streams = nullptr; | ||
|
||
// Creates a(n indestructible) default stream for each device | ||
// Note: the default stream on each device is signified by a zero | ||
// value for the pointer, and so is not actually created as usual. | ||
// In particular, we don't need to switch devices when creating the | ||
// streams. | ||
static void initDefaultCUDAStreams() { | ||
num_gpus = getCUDAHooks().getNumGPUs(); | ||
default_streams = (CUDAStreamInternals*) malloc(num_gpus * sizeof(CUDAStreamInternals)); | ||
for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { | ||
default_streams[i].is_destructible = false; | ||
default_streams[i].refcount = 0; | ||
default_streams[i].device = i; | ||
default_streams[i].stream = DEFAULT_STREAM; | ||
} | ||
} | ||
|
||
// Init front-end to ensure initialization only occurs once | ||
static void initCUDAStreamsOnce() { | ||
// Inits default streams (once, globally) | ||
std::call_once(init_flag, initDefaultCUDAStreams); | ||
|
||
// Inits current streams (thread local) to default streams | ||
if (current_streams) return; | ||
current_streams = (CUDAStreamInternals**) malloc(num_gpus * sizeof(CUDAStreamInternals*)); | ||
for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { | ||
current_streams[i] = &default_streams[i]; | ||
} | ||
} | ||
|
||
/* | ||
* Pointer-based stream API | ||
*/ | ||
|
||
// Queries the current device index through DynamicCUDAInterface and widens
// it to the int64_t used throughout this file.
static inline int64_t current_device() {
  int device_index;
  DynamicCUDAInterface::get_device(&device_index);
  return static_cast<int64_t>(device_index);
}
|
||
// Asserts that `device` lies in [0, num_gpus). Callers run
// initCUDAStreamsOnce first so that num_gpus has been populated.
static inline void check_gpu(int64_t device) {
  AT_ASSERT(device >= 0 && device < num_gpus);
}
|
||
// Returns the default stream entry for `device`, initializing the stream
// tables on first use. Asserts if `device` is out of range.
CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device) {
  initCUDAStreamsOnce();
  check_gpu(device);
  return default_streams + device;
}
// Returns the default stream entry for whichever device is current.
CUDAStreamInternals* CUDAStream_getDefaultStream() {
  const int64_t device = current_device();
  return CUDAStream_getDefaultStreamOnDevice(device);
}
|
||
// Creates (and retains) a new cuda stream on the current device.
//
// `flags` and `priority` are forwarded unchanged to
// DynamicCUDAInterface::cuda_stream_create_with_priority. The returned record
// is destructible and starts with a reference count of 1; release it with
// CUDAStream_free / CUDAStream_uncheckedFree (which use free(), hence the
// malloc here).
CUDAStreamInternals* CUDAStream_createAndRetainWithOptions(int32_t flags, int32_t priority) {
  auto* internals =
      static_cast<CUDAStreamInternals*>(malloc(sizeof(CUDAStreamInternals)));
  // Fail loudly on allocation failure instead of dereferencing null below.
  AT_ASSERT(internals);
  internals->is_destructible = true;
  internals->refcount = 1;
  try {
    internals->device = current_device();
    DynamicCUDAInterface::cuda_stream_create_with_priority(&internals->stream, flags, priority);
  } catch (...) {
    // If querying the device or creating the stream throws, nobody else
    // knows about this record yet, so release it here to avoid a leak.
    free(internals);
    throw;
  }
  return internals;
}
|
||
// Note: despite not being "unsafe," if using these methods in a multithreaded
// environment then the caller must be sure that streams are valid
// when they're requested. These methods will throw an error if an
// invalid stream is requested.
//
// Returns the calling thread's current stream for `device` after taking an
// additional reference on it.
CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device) {
  initCUDAStreamsOnce();
  check_gpu(device);
  CUDAStreamInternals* const cur = current_streams[device];
  // The retain is the side effect we want; the assert checks that it worked.
  AT_ASSERT(CUDAStream_retain(cur));
  return cur;
}
// Same as CUDAStream_getAndRetainCurrentStreamOnDevice, using whichever
// device is current.
CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream() {
  const int64_t device = current_device();
  return CUDAStream_getAndRetainCurrentStreamOnDevice(device);
}
|
||
// Note: these unsafe methods do not retain the stream before returning it.
// This is unsafe behavior and these methods SHOULD NOT BE USED.
// They are here only for legacy compatibility.
//
// Returns the calling thread's current stream for `device` without touching
// its reference count.
CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device) {
  initCUDAStreamsOnce();
  check_gpu(device);
  CUDAStreamInternals* const borrowed = current_streams[device];
  return borrowed;
}
// Same as CUDAStream_getCurrentStreamOnDeviceUnsafe, using whichever device
// is current. The result is not retained.
CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe() {
  const int64_t device = current_device();
  return CUDAStream_getCurrentStreamOnDeviceUnsafe(device);
}
|
||
void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { | ||
initCUDAStreamsOnce(); | ||
check_gpu(device); | ||
AT_ASSERT(ptr); | ||
AT_ASSERT(ptr->device == device); | ||
AT_ASSERT(CUDAStream_retain(ptr)); | ||
|
||
CUDAStream_free(current_streams[device]); | ||
current_streams[device] = ptr; | ||
} | ||
|
||
void CUDAStream_uncheckedSetStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { | ||
initCUDAStreamsOnce(); | ||
CUDAStream_uncheckedFree(current_streams[device]); | ||
current_streams[device] = ptr; | ||
} | ||
|
||
// Makes `ptr` the current stream on whichever device is current; all checks
// are those of CUDAStream_setStreamOnDevice.
void CUDAStream_setStream(CUDAStreamInternals* ptr) {
  const int64_t device = current_device();
  CUDAStream_setStreamOnDevice(device, ptr);
}
|
||
// Getters

// Returns the raw CUDA stream handle stored in `ptr`; asserts on null.
cudaStream_t CUDAStream_stream(CUDAStreamInternals* ptr) {
  AT_ASSERT(ptr);
  const cudaStream_t handle = ptr->stream;
  return handle;
}
|
||
// Returns the device index stored in `ptr`; asserts on null.
int64_t CUDAStream_device(CUDAStreamInternals* ptr) {
  AT_ASSERT(ptr);
  const int64_t device_index = ptr->device;
  return device_index;
}
|
||
// Memory management
// Note: only destructible (non-default) streams are ref counted

// Takes a reference on `ptr` (asserts on null). Non-destructible entries are
// not counted and always report success. For destructible entries the count
// is incremented and the call reports whether the new count exceeds 1, i.e.
// whether the stream was already referenced before this call.
bool CUDAStream_retain(CUDAStreamInternals* ptr) {
  AT_ASSERT(ptr);
  if (!ptr->is_destructible) {
    return true;
  }
  const int updated = ++ptr->refcount;
  return updated > 1;
}
|
||
// Drops one reference on `ptr`. Null pointers, entries with a zero stream
// handle and non-destructible entries are ignored. When the count reaches
// zero the CUDA stream is destroyed, the record is freed and the caller's
// pointer is reset to null; otherwise the pointer is left untouched.
void CUDAStream_free(CUDAStreamInternals*& ptr) {
  if (!ptr || !ptr->stream || !ptr->is_destructible) {
    return;
  }
  if (--ptr->refcount > 0) {
    return;  // still referenced elsewhere
  }
  AT_ASSERT(ptr->refcount == 0);
  DynamicCUDAInterface::cuda_stream_destroy(ptr->stream);
  free(ptr);
  ptr = nullptr;
}
// Same as CUDAStream_free, except that the final count is not asserted to
// be zero and the stream is destroyed through
// DynamicCUDAInterface::unchecked_cuda_stream_destroy.
void CUDAStream_uncheckedFree(CUDAStreamInternals*& ptr) {
  if (!ptr || !ptr->stream || !ptr->is_destructible) {
    return;
  }
  if (--ptr->refcount > 0) {
    return;  // still referenced elsewhere
  }
  DynamicCUDAInterface::unchecked_cuda_stream_destroy(ptr->stream);
  free(ptr);
  ptr = nullptr;
}
|
||
} // namespace detail | ||
|
||
/* | ||
* CUDAStream functions | ||
*/ | ||
|
||
// Copy constructor: shares the source's internals and takes an additional
// reference on them. Asserts that the source actually has internals.
CUDAStream::CUDAStream(const CUDAStream& other) {
  AT_ASSERT(other.internals_);
  AT_ASSERT(detail::CUDAStream_retain(other.internals_));

  this->internals_ = other.internals_;
}
|
||
// Move constructor: exchanges internals with the source, so no reference
// count changes. Asserts that the source actually has internals.
// NOTE(review): the swap hands this object's initial `internals_` value to
// `other`; presumably that is null via a default member initializer in the
// header. Confirm against the class declaration.
CUDAStream::CUDAStream(CUDAStream&& other) {
  AT_ASSERT(other.internals_);

  using std::swap;
  swap(this->internals_, other.internals_);
}
|
||
} // namespace at | ||
Oops, something went wrong.