forked from horovod/horovod
/
adapter.cc
158 lines (136 loc) · 4.84 KB
/
adapter.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
// Copyright 2018 Uber Technologies, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include <TH/TH.h>
#if HAVE_CUDA
#include <THC/THC.h>
#endif

#include <vector>

#include "adapter.h"
#include "cuda_util.h"
#include "tensor_util.h"
namespace horovod {
namespace torch {
// This class intentionally does not have destructor at the moment.
//
// Unfortunately, by the time this destructor would be called in normal
// circumstances (application shutdown), CUDA context would already be destroyed
// and cudaFree() operations would print nasty errors in the log - in a pretty
// normal termination scenario.
//
// If we add functionality to terminate Horovod without terminating the
// application, we should revisit this logic.
// Allocates a persistent buffer of `size` bytes on the requested device.
// GPU allocations go through THC; CPU allocations use plain heap memory.
// The buffer is deliberately never freed (see the comment above about
// CUDA context teardown at application shutdown).
TorchPersistentBuffer::TorchPersistentBuffer(int device, int64_t size)
    : device_(device) {
  // Make the allocation happen on the buffer's device.
  with_device device_context(device_);
  if (device_ != CPU_DEVICE_ID) {
#if HAVE_CUDA
    THCudaCheck(THCudaMalloc(state, (void**)&buffer_, size));
#else
    throw std::logic_error("Internal error. Requested TorchPersistentBuffer "
                           "with GPU device but not compiled with CUDA.");
#endif
  } else {
    buffer_ = new char[size];
  }
}
// Returns the raw pointer to the buffer's storage. The `context` argument
// is not used by this implementation.
const void*
TorchPersistentBuffer::AccessData(std::shared_ptr<OpContext> context) const {
  return buffer_;
}
template <class T> TorchTensor<T>::TorchTensor(T* tensor) : tensor_(tensor) {}
// Maps the template tensor type T to Horovod's MPI data type enum.
template <class T> const MPIDataType TorchTensor<T>::dtype() const {
  return TensorUtil::GetDType<T>();
}
// Returns the tensor's shape, normalizing the zero-dimensional case.
template <class T> const TensorShape TorchTensor<T>::shape() const {
  auto result = TensorUtil::GetShape(tensor_);
  // In PyTorch a tensor with an empty shape holds no values, unlike a
  // TensorFlow constant. Inject a dummy zero dimension so that the
  // number-of-elements calculation downstream stays correct.
  if (result.dims() == 0) {
    result.AddDim(0);
  }
  return result;
}
// Returns a read-only pointer to the tensor's underlying data.
template <class T> const void* TorchTensor<T>::data() const {
  return TensorUtil::GetData(tensor_);
}
// Returns the tensor's size as reported by TensorUtil::GetSize
// (presumably in bytes — confirm against tensor_util).
template <class T> int64_t TorchTensor<T>::size() const {
  return TensorUtil::GetSize(tensor_);
}
// Allocates a new TH tensor on `device` to serve as scratch space. Unlike
// plain TorchTensor, this object owns the tensor and frees it in its
// destructor.
template <class T>
TorchTemporaryBuffer<T>::TorchTemporaryBuffer(int device)
    : TorchTensor<T>(nullptr) {
  this->tensor_ = TensorUtil::New<T>(device);
}
// Releases the TH tensor allocated in the constructor.
template <class T> TorchTemporaryBuffer<T>::~TorchTemporaryBuffer() {
  TensorUtil::Free(this->tensor_);
}
// Exposes the underlying TH tensor. Ownership stays with this buffer.
template <class T> T* TorchTemporaryBuffer<T>::tensor() const {
  return this->tensor_;
}
// Captures the device and the pre-existing output tensor that
// AllocateOutput will resize in place.
template <class T>
TorchOpContext<T>::TorchOpContext(int device, T* output)
    : device_(device), output_(output) {}
// Allocates a persistent buffer of `size` bytes on this context's device
// and returns it via `tensor`. Always reports success.
template <class T>
Status TorchOpContext<T>::AllocatePersistent(
    int64_t size, std::shared_ptr<PersistentBuffer>* tensor) {
  // Allocation errors are handled using PyTorch exceptions.
  *tensor = std::make_shared<TorchPersistentBuffer>(device_, size);
  return Status::OK();
}
// Resizes the context's output tensor to `shape` and wraps it in a
// TorchTensor returned via `tensor`. Always reports success; allocation
// errors surface as PyTorch exceptions.
template <class T>
Status TorchOpContext<T>::AllocateOutput(TensorShape shape,
                                         std::shared_ptr<Tensor>* tensor) {
  // Copy the dims into a contiguous array in the form ResizeNd expects.
  // std::vector keeps this exception-safe: the previous raw new[]/delete[]
  // pair leaked if ResizeNd threw.
  std::vector<int64_t> shape_array(shape.dims());
  for (int idx = 0; idx < shape.dims(); idx++) {
    shape_array[idx] = shape.dim_size(idx);
  }
  TensorUtil::ResizeNd(output_, shape.dims(), shape_array.data(), nullptr);
  *tensor = std::make_shared<TorchTensor<T>>(output_);
  return Status::OK();
}
// Identifies this context as belonging to the PyTorch framework.
template <class T> Framework TorchOpContext<T>::framework() const {
  return Framework::PYTORCH;
}
// Converts a non-OK Status into the matching C++ exception so errors
// propagate to Python via PyTorch's exception translation.
void ThrowIfError(Status status) {
  if (status.type() == StatusType::OK) {
    return;
  }
  if (status.type() == StatusType::PRECONDITION_ERROR) {
    throw std::logic_error(status.reason());
  }
  // ABORTED, UNKNOWN_ERROR, and any future status types all map to
  // std::runtime_error, exactly as the original switch did.
  throw std::runtime_error(status.reason());
}
// Instantiate the template adapters for every supported TH tensor type
// (and the THC types when built with CUDA) via the ADAPTER_DEFINE_TYPE
// macro, so their definitions are emitted in this translation unit.
ADAPTER_DEFINE_TYPE(THByteTensor)
ADAPTER_DEFINE_TYPE(THCharTensor)
ADAPTER_DEFINE_TYPE(THShortTensor)
ADAPTER_DEFINE_TYPE(THIntTensor)
ADAPTER_DEFINE_TYPE(THLongTensor)
ADAPTER_DEFINE_TYPE(THFloatTensor)
ADAPTER_DEFINE_TYPE(THDoubleTensor)
#if HAVE_CUDA
ADAPTER_DEFINE_TYPE(THCudaByteTensor)
ADAPTER_DEFINE_TYPE(THCudaCharTensor)
ADAPTER_DEFINE_TYPE(THCudaShortTensor)
ADAPTER_DEFINE_TYPE(THCudaIntTensor)
ADAPTER_DEFINE_TYPE(THCudaLongTensor)
ADAPTER_DEFINE_TYPE(THCudaTensor)
ADAPTER_DEFINE_TYPE(THCudaDoubleTensor)
#endif
} // namespace torch
} // namespace horovod