diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h index 71959994e0044..5546e51ae6400 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h @@ -5,6 +5,8 @@ #include #include #include +#include +#include // CUDA headers #include @@ -21,19 +23,22 @@ namespace cms { const char* cmd, const char* error, const char* message, - const char* description = nullptr) { + std::string_view description = std::string_view()) { std::ostringstream out; out << "\n"; out << file << ", line " << line << ":\n"; out << "cudaCheck(" << cmd << ");\n"; out << error << ": " << message << "\n"; - if (description) + if (!description.empty()) out << description << "\n"; throw std::runtime_error(out.str()); } - inline bool cudaCheck_( - const char* file, int line, const char* cmd, CUresult result, const char* description = nullptr) { + inline bool cudaCheck_(const char* file, + int line, + const char* cmd, + CUresult result, + std::string_view description = std::string_view()) { if (LIKELY(result == CUDA_SUCCESS)) return true; @@ -45,8 +50,11 @@ namespace cms { return false; } - inline bool cudaCheck_( - const char* file, int line, const char* cmd, cudaError_t result, const char* description = nullptr) { + inline bool cudaCheck_(const char* file, + int line, + const char* cmd, + cudaError_t result, + std::string_view description = std::string_view()) { if (LIKELY(result == cudaSuccess)) return true; @@ -55,7 +63,6 @@ namespace cms { abortOnCudaError(file, line, cmd, error, message, description); return false; } - } // namespace cuda } // namespace cms diff --git a/HeterogeneousCore/SonicCore/src/SonicClientBase.cc b/HeterogeneousCore/SonicCore/src/SonicClientBase.cc index bc8a551453334..42b64fb7e100a 100644 --- a/HeterogeneousCore/SonicCore/src/SonicClientBase.cc +++ b/HeterogeneousCore/SonicCore/src/SonicClientBase.cc @@ -73,6 +73,10 @@ void SonicClientBase::finish(bool success, std::exception_ptr eptr) { holder_.reset(); } else if (eptr) std::rethrow_exception(eptr); + + //reset client data now (usually done at end of produce()) + if (eptr) + reset(); } void SonicClientBase::fillBasePSetDescription(edm::ParameterSetDescription& desc, bool allowRetry) { diff --git a/HeterogeneousCore/SonicTriton/BuildFile.xml b/HeterogeneousCore/SonicTriton/BuildFile.xml index 8efadafbe2869..16052f4119daa 100644 --- a/HeterogeneousCore/SonicTriton/BuildFile.xml +++ b/HeterogeneousCore/SonicTriton/BuildFile.xml @@ -4,8 +4,12 @@ + + + + diff --git a/HeterogeneousCore/SonicTriton/README.md b/HeterogeneousCore/SonicTriton/README.md index 6dbbb635caaab..f4f6af7f14a76 100644 --- a/HeterogeneousCore/SonicTriton/README.md +++ b/HeterogeneousCore/SonicTriton/README.md @@ -30,6 +30,12 @@ The model information from the server can be printed by enabling `verbose` outpu * `preferredServer`: name of preferred server, for testing (see [Services](#services) below) * `timeout`: maximum allowed time for a request * `outputs`: optional, specify which output(s) the server should send +* `verbose`: enable verbose printouts (default: false) +* `useSharedMemory`: enable use of shared memory (see [below](#shared-memory)) with local servers (default: true) + +The batch size should be set using the client accessor, in order to ensure a consistent value across all inputs: +* `setBatchSize()`: set a new batch size + * some models may not support batching Useful `TritonData` accessors include: * `variableDims()`: return 
true if any variable dimensions
@@ -39,8 +45,6 @@ Useful `TritonData` accessors include:
 * `byteSize()`: return number of bytes for data type
 * `dname()`: return name of data type
 * `batchSize()`: return current batch size
-* `setBatchSize()`: set a new batch size
-  * some models may not support batching
 
 To update the `TritonData` shape in the variable-dimension case:
 * `setShape(const std::vector<int64_t>& newShape)`: update all (variable) dimensions with values provided in `newShape`
@@ -49,9 +53,28 @@ To update the `TritonData` shape in the variable-dimension case:
 
 There are specific local input and output containers that should be used in producers.
 Here, `T` is a primitive type, and the two aliases listed below are passed to `TritonInputData::toServer()`
 and returned by `TritonOutputData::fromServer()`, respectively:
-* `TritonInput<T> = std::vector<std::vector<T>>`
+* `TritonInputContainer<T> = std::shared_ptr<TritonInput<T>> = std::shared_ptr<std::vector<std::vector<T>>>`
 * `TritonOutput<T> = std::vector<edm::Span<const T*>>`
+
+The `TritonInputContainer` object should be created using the helper function described below.
+It expects one vector per batch entry (i.e. the size of the outer vector is the batch size).
+Therefore, it is best to call `TritonClient::setBatchSize()`, if necessary, before calling the helper.
+It will also reserve the expected size of the input in each inner vector (by default),
+if the concrete shape is available (i.e. `setShape()` was already called, if the input has variable dimensions).
+* `allocate<T>()`: return a `TritonInputContainer` properly allocated for the batch and input sizes
+
+### Shared memory
+
+If the local fallback server (see [Services](#services) below) is in use,
+input and output data can be transferred via shared memory rather than gRPC.
+Both CPU and GPU (CUDA) shared memory are supported.
+This is more efficient for some algorithms;
+if shared memory is not more efficient for an algorithm, it can be disabled in the Python configuration for the client.
+
+For outputs, shared memory can only be used if the batch size and concrete shape are known in advance,
+because the shared memory region for the output must be registered before the inference call is made.
+As with the inputs, this is handled automatically, and the use of shared memory can be disabled if desired.
+
 ## Modules
 
 SONIC Triton supports producers, filters, and analyzers.
@@ -71,7 +94,7 @@ If an `edm::GlobalCache` of type `T` is needed, there are two changes:
 In a SONIC Triton producer, the basic flow should follow this pattern:
 1. `acquire()`:
     a. access input object(s) from `TritonInputMap`
-    b. allocate input data using `std::make_shared<TritonInput<T>>()`
+    b. allocate input data using `allocate<T>()`
     c. fill input data
     d. set input shape(s) (optional, only if any variable dimensions)
     e. 
convert using `toServer()` function of input object(s) diff --git a/HeterogeneousCore/SonicTriton/interface/TritonClient.h b/HeterogeneousCore/SonicTriton/interface/TritonClient.h index ccda4a3c6ec6a..61f27bddd808c 100644 --- a/HeterogeneousCore/SonicTriton/interface/TritonClient.h +++ b/HeterogeneousCore/SonicTriton/interface/TritonClient.h @@ -5,6 +5,7 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "HeterogeneousCore/SonicCore/interface/SonicClient.h" #include "HeterogeneousCore/SonicTriton/interface/TritonData.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonService.h" #include #include @@ -31,20 +32,28 @@ class TritonClient : public SonicClient { //constructor TritonClient(const edm::ParameterSet& params, const std::string& debugName); + //destructor + ~TritonClient() override; + //accessors unsigned batchSize() const { return batchSize_; } bool verbose() const { return verbose_; } + bool useSharedMemory() const { return useSharedMemory_; } + void setUseSharedMemory(bool useShm) { useSharedMemory_ = useShm; } bool setBatchSize(unsigned bsize); void reset() override; + bool noBatch() const { return noBatch_; } + TritonServerType serverType() const { return serverType_; } //for fillDescriptions static void fillPSetDescription(edm::ParameterSetDescription& iDesc); protected: - //helper - bool getResults(std::shared_ptr results); - + //helpers + void getResults(std::shared_ptr results); void evaluate() override; + template + bool handle_exception(F&& call); void reportServerSideStats(const ServerSideStats& stats) const; ServerSideStats summarizeServerStats(const inference::ModelStatistics& start_status, @@ -57,6 +66,8 @@ class TritonClient : public SonicClient { unsigned batchSize_; bool noBatch_; bool verbose_; + bool useSharedMemory_; + TritonServerType serverType_; //IO pointers for triton std::vector inputsTriton_; @@ -65,6 +76,13 @@ class TritonClient : public SonicClient { std::unique_ptr client_; //stores timeout, model name and version nvidia::inferenceserver::client::InferOptions options_; + +private: + friend TritonInputData; + friend TritonOutputData; + + //private accessors only used by data + auto client() { return client_.get(); } }; #endif diff --git a/HeterogeneousCore/SonicTriton/interface/TritonData.h b/HeterogeneousCore/SonicTriton/interface/TritonData.h index 50808de4a1216..2f5f6d8760bc6 100644 --- a/HeterogeneousCore/SonicTriton/interface/TritonData.h +++ b/HeterogeneousCore/SonicTriton/interface/TritonData.h @@ -10,13 +10,23 @@ #include #include #include -#include +#include #include "grpc_client.h" #include "grpc_service.pb.h" //forward declaration class TritonClient; +template +class TritonMemResource; +template +class TritonHeapResource; +template +class TritonCpuShmResource; +#ifdef TRITON_ENABLE_GPU +template +class TritonGpuShmResource; +#endif //aliases for local input and output types template @@ -24,6 +34,10 @@ using TritonInput = std::vector>; template using TritonOutput = std::vector>; +//other useful typdefs +template +using TritonInputContainer = std::shared_ptr>; + //store all the info needed for triton input and output template class TritonData { @@ -34,15 +48,18 @@ class TritonData { using ShapeView = edm::Span; //constructor - TritonData(const std::string& name, const TensorMetadata& model_info, bool noBatch); + TritonData(const std::string& name, const TensorMetadata& model_info, TritonClient* client, const std::string& pid); //some members can be modified - bool setShape(const ShapeType& newShape) { return 
setShape(newShape, true); } - bool setShape(unsigned loc, int64_t val) { return setShape(loc, val, true); } + void setShape(const ShapeType& newShape); + void setShape(unsigned loc, int64_t val); //io accessors template - void toServer(std::shared_ptr> ptr); + TritonInputContainer
allocate(bool reserve = true); + template + void toServer(TritonInputContainer
ptr); + void prepare(); template TritonOutput
fromServer() const; @@ -60,14 +77,23 @@ class TritonData { private: friend class TritonClient; + friend class TritonMemResource; + friend class TritonHeapResource; + friend class TritonCpuShmResource; +#ifdef TRITON_ENABLE_GPU + friend class TritonGpuShmResource; +#endif - //private accessors only used by client - bool setShape(const ShapeType& newShape, bool canThrow); - bool setShape(unsigned loc, int64_t val, bool canThrow); + //private accessors only used internally or by client + unsigned fullLoc(unsigned loc) const { return loc + (noBatch_ ? 0 : 1); } void setBatchSize(unsigned bsize); void reset(); void setResult(std::shared_ptr result) { result_ = result; } IO* data() { return data_.get(); } + void updateMem(size_t size); + void computeSizes(); + void resetSizes(); + nvidia::inferenceserver::client::InferenceServerGrpcClient* client(); //helpers bool anyNeg(const ShapeView& vec) const { @@ -76,11 +102,20 @@ class TritonData { int64_t dimProduct(const ShapeView& vec) const { return std::accumulate(vec.begin(), vec.end(), 1, std::multiplies()); } - void createObject(IO** ioptr) const; + void createObject(IO** ioptr); + //generates a unique id number for each instance of the class + unsigned uid() const { + static std::atomic uid{0}; + return ++uid; + } + std::string xput() const; //members std::string name_; std::shared_ptr data_; + TritonClient* client_; + bool useShm_; + std::string shmName_; const ShapeType dims_; bool noBatch_; unsigned batchSize_; @@ -91,7 +126,11 @@ class TritonData { std::string dname_; inference::DataType dtype_; int64_t byteSize_; - std::any holder_; + size_t sizeShape_; + size_t byteSizePerBatch_; + size_t totalByteSize_; + std::shared_ptr holder_; + std::shared_ptr> memResource_; std::shared_ptr result_; }; @@ -102,9 +141,18 @@ using TritonOutputMap = std::unordered_map; //avoid "explicit specialization after instantiation" error template <> +std::string TritonInputData::xput() const; +template <> +std::string TritonOutputData::xput() const; +template <> +template +TritonInputContainer
TritonInputData::allocate(bool reserve); +template <> template void TritonInputData::toServer(std::shared_ptr> ptr); template <> +void TritonOutputData::prepare(); +template <> template TritonOutput
TritonOutputData::fromServer() const; template <> @@ -112,9 +160,9 @@ void TritonInputData::reset(); template <> void TritonOutputData::reset(); template <> -void TritonInputData::createObject(nvidia::inferenceserver::client::InferInput** ioptr) const; +void TritonInputData::createObject(nvidia::inferenceserver::client::InferInput** ioptr); template <> -void TritonOutputData::createObject(nvidia::inferenceserver::client::InferRequestedOutput** ioptr) const; +void TritonOutputData::createObject(nvidia::inferenceserver::client::InferRequestedOutput** ioptr); //explicit template instantiation declarations extern template class TritonData; diff --git a/HeterogeneousCore/SonicTriton/interface/TritonException.h b/HeterogeneousCore/SonicTriton/interface/TritonException.h new file mode 100644 index 0000000000000..418a98fc1bd8e --- /dev/null +++ b/HeterogeneousCore/SonicTriton/interface/TritonException.h @@ -0,0 +1,14 @@ +#ifndef HeterogeneousCore_SonicTriton_TritonException +#define HeterogeneousCore_SonicTriton_TritonException + +#include "FWCore/Utilities/interface/Exception.h" + +#include + +class TritonException : public cms::Exception { +public: + explicit TritonException(std::string const& aCategory); + void convertToWarning() const; +}; + +#endif diff --git a/HeterogeneousCore/SonicTriton/interface/TritonMemResource.h b/HeterogeneousCore/SonicTriton/interface/TritonMemResource.h new file mode 100644 index 0000000000000..80c810752bfb6 --- /dev/null +++ b/HeterogeneousCore/SonicTriton/interface/TritonMemResource.h @@ -0,0 +1,99 @@ +#ifndef HeterogeneousCore_SonicTriton_TritonMemResource +#define HeterogeneousCore_SonicTriton_TritonMemResource + +#include +#include + +#include "grpc_client.h" + +//forward declaration +template +class TritonData; + +//base class for memory operations +template +class TritonMemResource { +public: + TritonMemResource(TritonData* data, const std::string& name, size_t size); + virtual ~TritonMemResource() {} + uint8_t* addr() { return addr_; } + size_t size() const { return size_; } + virtual void close() {} + //used for input + virtual void copyInput(const void* values, size_t offset) {} + //used for output + virtual const uint8_t* copyOutput() { return nullptr; } + virtual void set(); + +protected: + //member variables + TritonData* data_; + std::string name_; + size_t size_; + uint8_t* addr_; + bool closed_; +}; + +template +class TritonHeapResource : public TritonMemResource { +public: + TritonHeapResource(TritonData* data, const std::string& name, size_t size); + ~TritonHeapResource() override {} + void copyInput(const void* values, size_t offset) override {} + const uint8_t* copyOutput() override { return nullptr; } + void set() override {} +}; + +template +class TritonCpuShmResource : public TritonMemResource { +public: + TritonCpuShmResource(TritonData* data, const std::string& name, size_t size); + ~TritonCpuShmResource() override; + void close() override; + void copyInput(const void* values, size_t offset) override {} + const uint8_t* copyOutput() override { return nullptr; } +}; + +using TritonInputHeapResource = TritonHeapResource; +using TritonInputCpuShmResource = TritonCpuShmResource; +using TritonOutputHeapResource = TritonHeapResource; +using TritonOutputCpuShmResource = TritonCpuShmResource; + +//avoid "explicit specialization after instantiation" error +template <> +void TritonInputHeapResource::copyInput(const void* values, size_t offset); +template <> +void TritonInputCpuShmResource::copyInput(const void* values, size_t offset); +template <> 
+const uint8_t* TritonOutputHeapResource::copyOutput(); +template <> +const uint8_t* TritonOutputCpuShmResource::copyOutput(); + +#ifdef TRITON_ENABLE_GPU +#include "cuda_runtime_api.h" + +template +class TritonGpuShmResource : public TritonMemResource { +public: + TritonGpuShmResource(TritonData* data, const std::string& name, size_t size); + ~TritonGpuShmResource() override; + void close() override; + void copyInput(const void* values, size_t offset) override {} + const uint8_t* copyOutput() override { return nullptr; } + +protected: + int deviceId_; + std::shared_ptr handle_; +}; + +using TritonInputGpuShmResource = TritonGpuShmResource; +using TritonOutputGpuShmResource = TritonGpuShmResource; + +//avoid "explicit specialization after instantiation" error +template <> +void TritonInputGpuShmResource::copyInput(const void* values, size_t offset); +template <> +const uint8_t* TritonOutputGpuShmResource::copyOutput(); +#endif + +#endif diff --git a/HeterogeneousCore/SonicTriton/interface/TritonService.h b/HeterogeneousCore/SonicTriton/interface/TritonService.h index 24232b6894ae1..9adc8d634c7d2 100644 --- a/HeterogeneousCore/SonicTriton/interface/TritonService.h +++ b/HeterogeneousCore/SonicTriton/interface/TritonService.h @@ -19,6 +19,8 @@ namespace edm { class ModuleDescription; } // namespace edm +enum class TritonServerType { Remote = 0, LocalCPU = 1, LocalGPU = 2 }; + class TritonService { public: //classes and defs @@ -79,7 +81,9 @@ class TritonService { //accessors void addModel(const std::string& modelName, const std::string& path); - std::pair serverAddress(const std::string& model, const std::string& preferred = "") const; + std::pair serverAddress(const std::string& model, + const std::string& preferred = "") const; + const std::string& pid() const { return pid_; } static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -94,6 +98,7 @@ class TritonService { unsigned currentModuleId_; bool allowAddModel_; bool startedFallback_; + std::string pid_; std::unordered_map unservedModels_; //this represents a many:many:many map std::unordered_map servers_; diff --git a/HeterogeneousCore/SonicTriton/interface/triton_utils.h b/HeterogeneousCore/SonicTriton/interface/triton_utils.h index 1be6bea4e18eb..05f0a74dec286 100644 --- a/HeterogeneousCore/SonicTriton/interface/triton_utils.h +++ b/HeterogeneousCore/SonicTriton/interface/triton_utils.h @@ -19,10 +19,6 @@ namespace triton_utils { //helper to turn triton error into exception void throwIfError(const Error& err, std::string_view msg); - - //helper to turn triton error into warning - bool warnIfError(const Error& err, std::string_view msg); - } // namespace triton_utils extern template std::string triton_utils::printColl(const edm::Span::const_iterator>& coll, diff --git a/HeterogeneousCore/SonicTriton/scripts/cmsTriton b/HeterogeneousCore/SonicTriton/scripts/cmsTriton index d9a82a1be9c49..54a5310fb8c90 100755 --- a/HeterogeneousCore/SonicTriton/scripts/cmsTriton +++ b/HeterogeneousCore/SonicTriton/scripts/cmsTriton @@ -4,7 +4,7 @@ USEDOCKER="" GPU="" VERBOSE="" -VERBOSE_ARGS="--log-verbose=1 --log-error=1 --log-info=1" +VERBOSE_ARGS="--log-verbose=1 --log-error=1 --log-warning=1 --log-info=1" WTIME=120 SERVER=triton_server_instance RETRIES=3 @@ -177,7 +177,7 @@ start_docker(){ done $DRYRUN $DOCKER run -d --name ${SERVER} \ - --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ + --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --ipc="host" --cap-add=IPC_OWNER \ -p${HTTPPORT}:${HTTPPORT} 
-p${GRPCPORT}:${GRPCPORT} -p${METRPORT}:${METRPORT} $EXTRA $MOUNTARGS \ ${IMAGE} tritonserver $PORTARGS $REPOARGS $VERBOSE } diff --git a/HeterogeneousCore/SonicTriton/src/TritonClient.cc b/HeterogeneousCore/SonicTriton/src/TritonClient.cc index f26a09392ba81..655af4ef3cdba 100644 --- a/HeterogeneousCore/SonicTriton/src/TritonClient.cc +++ b/HeterogeneousCore/SonicTriton/src/TritonClient.cc @@ -3,6 +3,7 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "FWCore/Utilities/interface/Exception.h" #include "HeterogeneousCore/SonicTriton/interface/TritonClient.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonException.h" #include "HeterogeneousCore/SonicTriton/interface/TritonService.h" #include "HeterogeneousCore/SonicTriton/interface/triton_utils.h" @@ -26,16 +27,18 @@ namespace nic = ni::client; TritonClient::TritonClient(const edm::ParameterSet& params, const std::string& debugName) : SonicClient(params, debugName, "TritonClient"), verbose_(params.getUntrackedParameter("verbose")), + useSharedMemory_(params.getUntrackedParameter("useSharedMemory")), options_(params.getParameter("modelName")) { //get appropriate server for this model edm::Service ts; - const auto& [url, isFallbackCPU] = + const auto& [url, serverType] = ts->serverAddress(options_.model_name_, params.getUntrackedParameter("preferredServer")); + serverType_ = serverType; if (verbose_) edm::LogInfo(fullDebugName_) << "Using server: " << url; //enforce sync mode for fallback CPU server to avoid contention //todo: could enforce async mode otherwise (unless mode was specified by user?) - if (isFallbackCPU) + if (serverType_ == TritonServerType::LocalCPU) setMode(SonicMode::Sync); //connect to the server @@ -95,8 +98,9 @@ TritonClient::TritonClient(const edm::ParameterSet& params, const std::string& d inputsTriton_.reserve(nicInputs.size()); for (const auto& nicInput : nicInputs) { const auto& iname = nicInput.name(); - auto [curr_itr, success] = input_.emplace( - std::piecewise_construct, std::forward_as_tuple(iname), std::forward_as_tuple(iname, nicInput, noBatch_)); + auto [curr_itr, success] = input_.emplace(std::piecewise_construct, + std::forward_as_tuple(iname), + std::forward_as_tuple(iname, nicInput, this, ts->pid())); auto& curr_input = curr_itr->second; inputsTriton_.push_back(curr_input.data()); if (verbose_) { @@ -118,8 +122,9 @@ TritonClient::TritonClient(const edm::ParameterSet& params, const std::string& d const auto& oname = nicOutput.name(); if (!s_outputs.empty() and s_outputs.find(oname) == s_outputs.end()) continue; - auto [curr_itr, success] = output_.emplace( - std::piecewise_construct, std::forward_as_tuple(oname), std::forward_as_tuple(oname, nicOutput, noBatch_)); + auto [curr_itr, success] = output_.emplace(std::piecewise_construct, + std::forward_as_tuple(oname), + std::forward_as_tuple(oname, nicOutput, this, ts->pid())); auto& curr_output = curr_itr->second; outputsTriton_.push_back(curr_output.data()); if (verbose_) { @@ -148,6 +153,15 @@ TritonClient::TritonClient(const edm::ParameterSet& params, const std::string& d } } +TritonClient::~TritonClient() { + //by default: members of this class destroyed before members of base class + //in shared memory case, TritonMemResource (member of TritonData) unregisters from client_ in its destructor + //but input/output objects are member of base class, so destroyed after client_ (member of this class) + //therefore, clear the maps here + input_.clear(); + output_.clear(); +} + bool TritonClient::setBatchSize(unsigned bsize) { if (bsize 
> maxBatchSize_) { edm::LogWarning(fullDebugName_) << "Requested batch size " << bsize << " exceeds server-specified max batch size " @@ -175,22 +189,39 @@ void TritonClient::reset() { } } -bool TritonClient::getResults(std::shared_ptr results) { +template +bool TritonClient::handle_exception(F&& call) { + //caught exceptions will be propagated to edm::WaitingTaskWithArenaHolder + CMS_SA_ALLOW try { + call(); + return true; + } + //TritonExceptions are intended/expected to be recoverable, i.e. retries should be allowed + catch (TritonException& e) { + e.convertToWarning(); + finish(false); + return false; + } + //other exceptions are not: execution should stop if they are encountered + catch (...) { + finish(false, std::current_exception()); + return false; + } +} + +void TritonClient::getResults(std::shared_ptr results) { for (auto& [oname, output] : output_) { //set shape here before output becomes const if (output.variableDims()) { std::vector tmp_shape; - bool status = triton_utils::warnIfError(results->Shape(oname, &tmp_shape), - "getResults(): unable to get output shape for " + oname); - if (!status) - return status; - output.setShape(tmp_shape, false); + triton_utils::throwIfError(results->Shape(oname, &tmp_shape), + "getResults(): unable to get output shape for " + oname); + output.setShape(tmp_shape); + output.computeSizes(); } //extend lifetime output.setResult(results); } - - return true; } //default case for sync and pseudo async @@ -201,76 +232,101 @@ void TritonClient::evaluate() { return; } + //set up shared memory for output + auto success = handle_exception([&]() { + for (auto& element : output_) { + element.second.prepare(); + } + }); + if (!success) + return; + // Get the status of the server prior to the request being made. - const auto& start_status = getServerSideStatus(); + inference::ModelStatistics start_status; + success = handle_exception([&]() { + if (verbose()) + start_status = getServerSideStatus(); + }); + if (!success) + return; if (mode_ == SonicMode::Async) { //non-blocking call auto t1 = std::chrono::high_resolution_clock::now(); - bool status = triton_utils::warnIfError( - client_->AsyncInfer( - [t1, start_status, this](nic::InferResult* results) { - //get results - std::shared_ptr results_ptr(results); - bool status = triton_utils::warnIfError(results_ptr->RequestStatus(), "evaluate(): unable to get result"); - if (!status) { - finish(false); - return; - } - auto t2 = std::chrono::high_resolution_clock::now(); - - if (!debugName_.empty()) - edm::LogInfo(fullDebugName_) - << "Remote time: " << std::chrono::duration_cast(t2 - t1).count(); - - const auto& end_status = getServerSideStatus(); - - if (verbose()) { - const auto& stats = summarizeServerStats(start_status, end_status); - reportServerSideStats(stats); - } - - //check result - status = getResults(results_ptr); - - //finish - finish(status); - }, - options_, - inputsTriton_, - outputsTriton_), - "evaluate(): unable to launch async run"); - - //if AsyncRun failed, finish() wasn't called - if (!status) - finish(false); + success = handle_exception([&]() { + triton_utils::throwIfError( + client_->AsyncInfer( + [t1, start_status, this](nic::InferResult* results) { + //get results + std::shared_ptr results_ptr(results); + auto success = handle_exception([&]() { + triton_utils::throwIfError(results_ptr->RequestStatus(), "evaluate(): unable to get result"); + }); + if (!success) + return; + auto t2 = std::chrono::high_resolution_clock::now(); + + if (!debugName_.empty()) + edm::LogInfo(fullDebugName_) + 
<< "Remote time: " << std::chrono::duration_cast(t2 - t1).count(); + + if (verbose()) { + inference::ModelStatistics end_status; + success = handle_exception([&]() { end_status = getServerSideStatus(); }); + if (!success) + return; + + const auto& stats = summarizeServerStats(start_status, end_status); + reportServerSideStats(stats); + } + + //check result + success = handle_exception([&]() { getResults(results_ptr); }); + if (!success) + return; + + //finish + finish(true); + }, + options_, + inputsTriton_, + outputsTriton_), + "evaluate(): unable to launch async run"); + }); + if (!success) + return; } else { //blocking call auto t1 = std::chrono::high_resolution_clock::now(); nic::InferResult* results; - bool status = triton_utils::warnIfError(client_->Infer(&results, options_, inputsTriton_, outputsTriton_), - "evaluate(): unable to run and/or get result"); - if (!status) { - finish(false); + success = handle_exception([&]() { + triton_utils::throwIfError(client_->Infer(&results, options_, inputsTriton_, outputsTriton_), + "evaluate(): unable to run and/or get result"); + }); + if (!success) return; - } auto t2 = std::chrono::high_resolution_clock::now(); if (!debugName_.empty()) edm::LogInfo(fullDebugName_) << "Remote time: " << std::chrono::duration_cast(t2 - t1).count(); - const auto& end_status = getServerSideStatus(); - if (verbose()) { + inference::ModelStatistics end_status; + success = handle_exception([&]() { end_status = getServerSideStatus(); }); + if (!success) + return; + const auto& stats = summarizeServerStats(start_status, end_status); reportServerSideStats(stats); } std::shared_ptr results_ptr(results); - status = getResults(results_ptr); + success = handle_exception([&]() { getResults(results_ptr); }); + if (!success) + return; - finish(status); + finish(true); } } @@ -335,11 +391,9 @@ TritonClient::ServerSideStats TritonClient::summarizeServerStats(const inference inference::ModelStatistics TritonClient::getServerSideStatus() const { if (verbose_) { inference::ModelStatisticsResponse resp; - bool success = triton_utils::warnIfError( - client_->ModelInferenceStatistics(&resp, options_.model_name_, options_.model_version_), - "getServerSideStatus(): unable to get model statistics"); - if (success) - return *(resp.model_stats().begin()); + triton_utils::throwIfError(client_->ModelInferenceStatistics(&resp, options_.model_name_, options_.model_version_), + "getServerSideStatus(): unable to get model statistics"); + return *(resp.model_stats().begin()); } return inference::ModelStatistics{}; } @@ -355,6 +409,7 @@ void TritonClient::fillPSetDescription(edm::ParameterSetDescription& iDesc) { descClient.addUntracked("preferredServer", ""); descClient.addUntracked("timeout"); descClient.addUntracked("verbose", false); + descClient.addUntracked("useSharedMemory", true); descClient.addUntracked>("outputs", {}); iDesc.add("Client", descClient); } diff --git a/HeterogeneousCore/SonicTriton/src/TritonData.cc b/HeterogeneousCore/SonicTriton/src/TritonData.cc index 258671be07691..5cd2767123da8 100644 --- a/HeterogeneousCore/SonicTriton/src/TritonData.cc +++ b/HeterogeneousCore/SonicTriton/src/TritonData.cc @@ -1,10 +1,11 @@ #include "HeterogeneousCore/SonicTriton/interface/TritonData.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonClient.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonMemResource.h" #include "HeterogeneousCore/SonicTriton/interface/triton_utils.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "model_config.pb.h" -#include 
#include namespace ni = nvidia::inferenceserver; @@ -22,10 +23,17 @@ namespace nvidia { //fullShape: if batching is enabled, first entry is batch size; values can be modified //shape: view into fullShape, excluding batch size entry template -TritonData::TritonData(const std::string& name, const TritonData::TensorMetadata& model_info, bool noBatch) +TritonData::TritonData(const std::string& name, + const TritonData::TensorMetadata& model_info, + TritonClient* client, + const std::string& pid) : name_(name), + client_(client), + useShm_(client_->useSharedMemory()), + //ensure unique name for shared memory region + shmName_(useShm_ ? pid + "_" + xput() + std::to_string(uid()) : ""), dims_(model_info.shape().begin(), model_info.shape().end()), - noBatch_(noBatch), + noBatch_(client_->noBatch()), batchSize_(0), fullShape_(dims_), shape_(fullShape_.begin() + (noBatch_ ? 0 : 1), fullShape_.end()), @@ -33,7 +41,8 @@ TritonData::TritonData(const std::string& name, const TritonData::Tensor productDims_(variableDims_ ? -1 : dimProduct(shape_)), dname_(model_info.datatype()), dtype_(ni::ProtocolStringToDataType(dname_)), - byteSize_(ni::GetDataTypeByteSize(dtype_)) { + byteSize_(ni::GetDataTypeByteSize(dtype_)), + totalByteSize_(0) { //create input or output object IO* iotmp; createObject(&iotmp); @@ -41,57 +50,56 @@ TritonData::TritonData(const std::string& name, const TritonData::Tensor } template <> -void TritonInputData::createObject(nic::InferInput** ioptr) const { +void TritonInputData::createObject(nic::InferInput** ioptr) { nic::InferInput::Create(ioptr, name_, fullShape_, dname_); } template <> -void TritonOutputData::createObject(nic::InferRequestedOutput** ioptr) const { +void TritonOutputData::createObject(nic::InferRequestedOutput** ioptr) { nic::InferRequestedOutput::Create(ioptr, name_); + //another specialization for output: can't use shared memory if output size is not known + useShm_ &= !variableDims_; +} + +template <> +std::string TritonInputData::xput() const { + return "input"; +} + +template <> +std::string TritonOutputData::xput() const { + return "output"; +} + +template +nic::InferenceServerGrpcClient* TritonData::client() { + return client_->client(); } //setters template -bool TritonData::setShape(const TritonData::ShapeType& newShape, bool canThrow) { - bool result = true; +void TritonData::setShape(const TritonData::ShapeType& newShape) { for (unsigned i = 0; i < newShape.size(); ++i) { - result &= setShape(i, newShape[i], canThrow); + setShape(i, newShape[i]); } - return result; } template -bool TritonData::setShape(unsigned loc, int64_t val, bool canThrow) { - std::stringstream msg; - unsigned full_loc = loc + (noBatch_ ? 
0 : 1); +void TritonData::setShape(unsigned loc, int64_t val) { + unsigned locFull = fullLoc(loc); //check boundary - if (full_loc >= fullShape_.size()) { - msg << name_ << " setShape(): dimension " << full_loc << " out of bounds (" << fullShape_.size() << ")"; - if (canThrow) - throw cms::Exception("TritonDataError") << msg.str(); - else { - edm::LogWarning("TritonDataWarning") << msg.str(); - return false; - } - } + if (locFull >= fullShape_.size()) + throw cms::Exception("TritonDataError") + << name_ << " setShape(): dimension " << locFull << " out of bounds (" << fullShape_.size() << ")"; - if (val != fullShape_[full_loc]) { - if (dims_[full_loc] == -1) { - fullShape_[full_loc] = val; - return true; - } else { - msg << name_ << " setShape(): attempt to change value of non-variable shape dimension " << loc; - if (canThrow) - throw cms::Exception("TritonDataError") << msg.str(); - else { - edm::LogWarning("TritonDataError") << msg.str(); - return false; - } - } + if (val != fullShape_[locFull]) { + if (dims_[locFull] == -1) + fullShape_[locFull] = val; + else + throw cms::Exception("TritonDataError") + << name_ << " setShape(): attempt to change value of non-variable shape dimension " << loc; } - - return true; } template @@ -101,15 +109,71 @@ void TritonData::setBatchSize(unsigned bsize) { fullShape_[0] = batchSize_; } +template +void TritonData::computeSizes() { + sizeShape_ = sizeShape(); + byteSizePerBatch_ = byteSize_ * sizeShape_; + totalByteSize_ = byteSizePerBatch_ * batchSize_; +} +template +void TritonData::resetSizes() { + sizeShape_ = 0; + byteSizePerBatch_ = 0; + totalByteSize_ = 0; +} + +//create a memory resource if none exists; +//otherwise, reuse the memory resource, resizing it if necessary +template +void TritonData::updateMem(size_t size) { + if (!memResource_ or size > memResource_->size()) { + if (useShm_ and client_->serverType() == TritonServerType::LocalCPU) { + //avoid unnecessarily throwing in destructor + if (memResource_) + memResource_->close(); + //need to destroy before constructing new instance because shared memory key will be reused + memResource_.reset(); + memResource_ = std::make_shared>(this, shmName_, size); + } +#ifdef TRITON_ENABLE_GPU + else if (useShm_ and client_->serverType() == TritonServerType::LocalGPU) { + //avoid unnecessarily throwing in destructor + if (memResource_) + memResource_->close(); + //need to destroy before constructing new instance because shared memory key will be reused + memResource_.reset(); + memResource_ = std::make_shared>(this, shmName_, size); + } +#endif + //for remote/heap, size increases don't matter + else if (!memResource_) + memResource_ = std::make_shared>(this, shmName_, size); + } +} + //io accessors template <> template -void TritonInputData::toServer(std::shared_ptr> ptr) { +TritonInputContainer
TritonInputData::allocate(bool reserve) { + //automatically creates a vector for each batch entry (if batch size known) + auto ptr = std::make_shared>(batchSize_); + if (reserve and !anyNeg(shape_)) { + computeSizes(); + for (auto& vec : *ptr) { + vec.reserve(sizeShape_); + } + } + return ptr; +} + +template <> +template +void TritonInputData::toServer(TritonInputContainer
ptr) { const auto& data_in = *ptr; //check batch size if (data_in.size() != batchSize_) { - throw cms::Exception("TritonDataError") << name_ << " input(): input vector has size " << data_in.size() + throw cms::Exception("TritonDataError") << name_ << " toServer(): input vector has size " << data_in.size() << " but specified batch size is " << batchSize_; } @@ -117,48 +181,48 @@ void TritonInputData::toServer(std::shared_ptr> ptr) { data_->SetShape(fullShape_); if (byteSize_ != sizeof(DT)) - throw cms::Exception("TritonDataError") << name_ << " input(): inconsistent byte size " << sizeof(DT) + throw cms::Exception("TritonDataError") << name_ << " toServer(): inconsistent byte size " << sizeof(DT) << " (should be " << byteSize_ << " for " << dname_ << ")"; - int64_t nInput = sizeShape(); + computeSizes(); + updateMem(totalByteSize_); for (unsigned i0 = 0; i0 < batchSize_; ++i0) { - const DT* arr = data_in[i0].data(); - triton_utils::throwIfError(data_->AppendRaw(reinterpret_cast(arr), nInput * byteSize_), - name_ + " input(): unable to set data for batch entry " + std::to_string(i0)); + memResource_->copyInput(data_in[i0].data(), i0 * byteSizePerBatch_); } + memResource_->set(); //keep input data in scope - holder_ = std::move(ptr); + holder_ = ptr; +} + +//sets up shared memory for outputs, if possible +template <> +void TritonOutputData::prepare() { + computeSizes(); + updateMem(totalByteSize_); + memResource_->set(); } template <> template TritonOutput
TritonOutputData::fromServer() const { if (!result_) { - throw cms::Exception("TritonDataError") << name_ << " output(): missing result"; + throw cms::Exception("TritonDataError") << name_ << " fromServer(): missing result"; } if (byteSize_ != sizeof(DT)) { - throw cms::Exception("TritonDataError") << name_ << " output(): inconsistent byte size " << sizeof(DT) + throw cms::Exception("TritonDataError") << name_ << " fromServer(): inconsistent byte size " << sizeof(DT) << " (should be " << byteSize_ << " for " << dname_ << ")"; } - uint64_t nOutput = sizeShape(); - TritonOutput
dataOut; - const uint8_t* r0; - size_t contentByteSize; - size_t expectedContentByteSize = nOutput * byteSize_ * batchSize_; - triton_utils::throwIfError(result_->RawData(name_, &r0, &contentByteSize), "output(): unable to get raw"); - if (contentByteSize != expectedContentByteSize) { - throw cms::Exception("TritonDataError") << name_ << " output(): unexpected content byte size " << contentByteSize - << " (expected " << expectedContentByteSize << ")"; - } - + const uint8_t* r0 = memResource_->copyOutput(); const DT* r1 = reinterpret_cast(r0); + + TritonOutput
dataOut; dataOut.reserve(batchSize_); for (unsigned i0 = 0; i0 < batchSize_; ++i0) { - auto offset = i0 * nOutput; - dataOut.emplace_back(r1 + offset, r1 + offset + nOutput); + auto offset = i0 * sizeShape_; + dataOut.emplace_back(r1 + offset, r1 + offset + sizeShape_); } return dataOut; @@ -166,20 +230,33 @@ TritonOutput
TritonOutputData::fromServer() const { template <> void TritonInputData::reset() { - data_->Reset(); holder_.reset(); + data_->Reset(); + //reset shape + if (variableDims_) { + for (unsigned i = 0; i < shape_.size(); ++i) { + unsigned locFull = fullLoc(i); + fullShape_[locFull] = dims_[locFull]; + } + } + resetSizes(); } template <> void TritonOutputData::reset() { result_.reset(); + holder_.reset(); + resetSizes(); } //explicit template instantiation declarations template class TritonData; template class TritonData; -template void TritonInputData::toServer(std::shared_ptr> data_in); -template void TritonInputData::toServer(std::shared_ptr> data_in); +template TritonInputContainer TritonInputData::allocate(bool reserve); +template TritonInputContainer TritonInputData::allocate(bool reserve); + +template void TritonInputData::toServer(TritonInputContainer data_in); +template void TritonInputData::toServer(TritonInputContainer data_in); template TritonOutput TritonOutputData::fromServer() const; diff --git a/HeterogeneousCore/SonicTriton/src/TritonException.cc b/HeterogeneousCore/SonicTriton/src/TritonException.cc new file mode 100644 index 0000000000000..ee160ffe957c2 --- /dev/null +++ b/HeterogeneousCore/SonicTriton/src/TritonException.cc @@ -0,0 +1,6 @@ +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonException.h" + +TritonException::TritonException(std::string const& aCategory) : cms::Exception(aCategory) {} + +void TritonException::convertToWarning() const { edm::LogWarning(category()) << explainSelf(); } diff --git a/HeterogeneousCore/SonicTriton/src/TritonMemResource.cc b/HeterogeneousCore/SonicTriton/src/TritonMemResource.cc new file mode 100644 index 0000000000000..7f1bff1f1db34 --- /dev/null +++ b/HeterogeneousCore/SonicTriton/src/TritonMemResource.cc @@ -0,0 +1,172 @@ +#include "HeterogeneousCore/SonicTriton/interface/TritonData.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonClient.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonMemResource.h" +#include "HeterogeneousCore/SonicTriton/interface/triton_utils.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include +#include +#include +#include + +namespace ni = nvidia::inferenceserver; +namespace nic = ni::client; + +template +TritonMemResource::TritonMemResource(TritonData* data, const std::string& name, size_t size) + : data_(data), name_(name), size_(size), addr_(nullptr), closed_(false) {} + +template +void TritonMemResource::set() { + triton_utils::throwIfError(data_->data_->SetSharedMemory(name_, data_->totalByteSize_, 0), + "unable to set shared memory (" + name_ + ")"); +} + +template +TritonHeapResource::TritonHeapResource(TritonData* data, const std::string& name, size_t size) + : TritonMemResource(data, name, size) {} + +template <> +void TritonInputHeapResource::copyInput(const void* values, size_t offset) { + triton_utils::throwIfError( + data_->data_->AppendRaw(reinterpret_cast(values), data_->byteSizePerBatch_), + data_->name_ + " toServer(): unable to set data for batch entry " + + std::to_string(offset / data_->byteSizePerBatch_)); +} + +template <> +const uint8_t* TritonOutputHeapResource::copyOutput() { + size_t contentByteSize; + const uint8_t* values; + triton_utils::throwIfError(data_->result_->RawData(data_->name_, &values, &contentByteSize), + data_->name_ + " fromServer(): unable to get raw"); + if (contentByteSize != data_->totalByteSize_) { + throw cms::Exception("TritonDataError") << 
data_->name_ << " fromServer(): unexpected content byte size " + << contentByteSize << " (expected " << data_->totalByteSize_ << ")"; + } + return values; +} + +//shared memory helpers based on: +// https://github.com/triton-inference-server/server/blob/v2.3.0/src/clients/c++/examples/shm_utils.cc (cpu) +// https://github.com/triton-inference-server/server/blob/v2.3.0/src/clients/c++/examples/simple_grpc_cudashm_client.cc (gpu) + +template +TritonCpuShmResource::TritonCpuShmResource(TritonData* data, const std::string& name, size_t size) + : TritonMemResource(data, name, size) { + //get shared memory region descriptor + int shm_fd = shm_open(this->name_.c_str(), O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + if (shm_fd == -1) + throw cms::Exception("TritonError") << "unable to get shared memory descriptor for key: " + this->name_; + + //extend shared memory object + int res = ftruncate(shm_fd, this->size_); + if (res == -1) + throw cms::Exception("TritonError") << "unable to initialize shared memory key " + this->name_ + + " to requested size: " + std::to_string(this->size_); + + //map to process address space + constexpr size_t offset(0); + this->addr_ = (uint8_t*)mmap(nullptr, this->size_, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, offset); + if (this->addr_ == MAP_FAILED) + throw cms::Exception("TritonError") << "unable to map to process address space for shared memory key: " + + this->name_; + + //close descriptor + if (::close(shm_fd) == -1) + throw cms::Exception("TritonError") << "unable to close descriptor for shared memory key: " + this->name_; + + triton_utils::throwIfError(this->data_->client()->RegisterSystemSharedMemory(this->name_, this->name_, this->size_), + "unable to register shared memory region: " + this->name_); +} + +template +TritonCpuShmResource::~TritonCpuShmResource() { + close(); +} + +template +void TritonCpuShmResource::close() { + if (this->closed_) + return; + + triton_utils::throwIfError(this->data_->client()->UnregisterSystemSharedMemory(this->name_), + "unable to unregister shared memory region: " + this->name_); + + //unmap + int tmp_fd = munmap(this->addr_, this->size_); + if (tmp_fd == -1) + throw cms::Exception("TritonError") << "unable to munmap for shared memory key: " << this->name_; + + //unlink + int shm_fd = shm_unlink(this->name_.c_str()); + if (shm_fd == -1) + throw cms::Exception("TritonError") << "unable to unlink for shared memory key: " << this->name_; + + this->closed_ = true; +} + +template <> +void TritonInputCpuShmResource::copyInput(const void* values, size_t offset) { + std::memcpy(addr_ + offset, values, data_->byteSizePerBatch_); +} + +template <> +const uint8_t* TritonOutputCpuShmResource::copyOutput() { + return addr_; +} + +template class TritonHeapResource; +template class TritonCpuShmResource; +template class TritonHeapResource; +template class TritonCpuShmResource; + +#ifdef TRITON_ENABLE_GPU +template +TritonGpuShmResource::TritonGpuShmResource(TritonData* data, const std::string& name, size_t size) + : TritonMemResource(data, name, size), deviceId_(0), handle_(std::make_shared()) { + //todo: get server device id somehow? 
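+  // the calls below follow the standard CUDA IPC shared-memory pattern
+  // (as in the simple_grpc_cudashm_client.cc example referenced above):
+  // select the device, allocate a device buffer of the requested size,
+  // export an IPC handle for it, and register that handle with the server
+  // so the same GPU allocation can be accessed by the Triton process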
+ cudaCheck(cudaSetDevice(deviceId_), "unable to set device ID to " + std::to_string(deviceId_)); + cudaCheck(cudaMalloc((void**)&this->addr_, this->size_), "unable to allocate GPU memory for key: " + this->name_); + cudaCheck(cudaIpcGetMemHandle(handle_.get(), this->addr_), "unable to get IPC handle for key: " + this->name_); + triton_utils::throwIfError( + this->data_->client()->RegisterCudaSharedMemory(this->name_, *handle_, deviceId_, this->size_), + "unable to register CUDA shared memory region: " + this->name_); +} + +template +TritonGpuShmResource::~TritonGpuShmResource() { + close(); +} + +template +void TritonGpuShmResource::close() { + if (this->closed_) + return; + triton_utils::throwIfError(this->data_->client()->UnregisterCudaSharedMemory(this->name_), + "unable to unregister CUDA shared memory region: " + this->name_); + cudaCheck(cudaFree(this->addr_), "unable to free GPU memory for key: " + this->name_); + this->closed_ = true; +} + +template <> +void TritonInputGpuShmResource::copyInput(const void* values, size_t offset) { + cudaCheck( + cudaMemcpy(addr_ + offset, values, data_->byteSizePerBatch_, cudaMemcpyHostToDevice), + data_->name_ + " toServer(): unable to memcpy " + std::to_string(data_->byteSizePerBatch_) + " bytes to GPU"); +} + +template <> +const uint8_t* TritonOutputGpuShmResource::copyOutput() { + //copy back from gpu, keep in scope + auto ptr = std::make_shared>(data_->totalByteSize_); + cudaCheck( + cudaMemcpy(ptr->data(), addr_, data_->totalByteSize_, cudaMemcpyDeviceToHost), + data_->name_ + " fromServer(): unable to memcpy " + std::to_string(data_->totalByteSize_) + " bytes from GPU"); + data_->holder_ = ptr; + return ptr->data(); +} + +template class TritonGpuShmResource; +template class TritonGpuShmResource; +#endif diff --git a/HeterogeneousCore/SonicTriton/src/TritonService.cc b/HeterogeneousCore/SonicTriton/src/TritonService.cc index 7ad47b4eded3f..9c3c0a31f100b 100644 --- a/HeterogeneousCore/SonicTriton/src/TritonService.cc +++ b/HeterogeneousCore/SonicTriton/src/TritonService.cc @@ -58,7 +58,8 @@ TritonService::TritonService(const edm::ParameterSet& pset, edm::ActivityRegistr fallbackOpts_(pset.getParameterSet("fallback")), currentModuleId_(0), allowAddModel_(false), - startedFallback_(false) { + startedFallback_(false), + pid_(std::to_string(::getpid())) { //module construction is assumed to be serial (correct at the time this code was written) areg.watchPreModuleConstruction(this, &TritonService::preModuleConstruction); areg.watchPostModuleConstruction(this, &TritonService::postModuleConstruction); @@ -158,8 +159,8 @@ void TritonService::preModuleDestruction(edm::ModuleDescription const& desc) { } //second return value is only true if fallback CPU server is being used -std::pair TritonService::serverAddress(const std::string& model, - const std::string& preferred) const { +std::pair TritonService::serverAddress(const std::string& model, + const std::string& preferred) const { auto mit = models_.find(model); if (mit == models_.end()) throw cms::Exception("MissingModel") << "There are no servers that provide model " << model; @@ -178,8 +179,16 @@ std::pair TritonService::serverAddress(const std::string& mod //todo: use some algorithm to select server rather than just picking arbitrarily const auto& serverInfo(servers_.find(serverName)->second); - bool isFallbackCPU = serverInfo.isFallback and !fallbackOpts_.useGPU; - return std::make_pair(serverInfo.url, isFallbackCPU); + auto serverType = TritonServerType::Remote; + if (serverInfo.isFallback) { + 
if (!fallbackOpts_.useGPU) + serverType = TritonServerType::LocalCPU; +#ifdef TRITON_ENABLE_GPU + else + serverType = TritonServerType::LocalGPU; +#endif + } + return std::make_pair(serverInfo.url, serverType); } void TritonService::preBeginJob(edm::PathsAndConsumesOfModulesBase const&, edm::ProcessContext const&) { @@ -208,7 +217,7 @@ void TritonService::preBeginJob(edm::PathsAndConsumesOfModulesBase const&, edm:: } //assemble server start command - std::string command("cmsTriton -P -1 -p " + std::to_string(::getpid())); + std::string command("cmsTriton -P -1 -p " + pid_); if (fallbackOpts_.debug) command += " -c"; if (fallbackOpts_.verbose) diff --git a/HeterogeneousCore/SonicTriton/src/triton_utils.cc b/HeterogeneousCore/SonicTriton/src/triton_utils.cc index eeb56a6b8895f..e4460feebef92 100644 --- a/HeterogeneousCore/SonicTriton/src/triton_utils.cc +++ b/HeterogeneousCore/SonicTriton/src/triton_utils.cc @@ -1,4 +1,5 @@ #include "HeterogeneousCore/SonicTriton/interface/triton_utils.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonException.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/Utilities/interface/Exception.h" @@ -19,15 +20,8 @@ namespace triton_utils { void throwIfError(const Error& err, std::string_view msg) { if (!err.IsOk()) - throw cms::Exception("TritonServerFailure") << msg << ": " << err; + throw TritonException("TritonFailure") << msg << (err.Message().empty() ? "" : ": " + err.Message()); } - - bool warnIfError(const Error& err, std::string_view msg) { - if (!err.IsOk()) - edm::LogWarning("TritonServerWarning") << msg << ": " << err; - return err.IsOk(); - } - } // namespace triton_utils template std::string triton_utils::printColl(const edm::Span::const_iterator>& coll, diff --git a/HeterogeneousCore/SonicTriton/test/BuildFile.xml b/HeterogeneousCore/SonicTriton/test/BuildFile.xml index 63ff99aeaa9ca..1a02afd61be06 100644 --- a/HeterogeneousCore/SonicTriton/test/BuildFile.xml +++ b/HeterogeneousCore/SonicTriton/test/BuildFile.xml @@ -1,5 +1,6 @@ - + + diff --git a/HeterogeneousCore/SonicTriton/test/TritonGraphModules.cc b/HeterogeneousCore/SonicTriton/test/TritonGraphModules.cc index 46ced5e330291..8e85a139cf6b9 100644 --- a/HeterogeneousCore/SonicTriton/test/TritonGraphModules.cc +++ b/HeterogeneousCore/SonicTriton/test/TritonGraphModules.cc @@ -15,7 +15,8 @@ class TritonGraphHelper { : nodeMin_(cfg.getParameter("nodeMin")), nodeMax_(cfg.getParameter("nodeMax")), edgeMin_(cfg.getParameter("edgeMin")), - edgeMax_(cfg.getParameter("edgeMax")) {} + edgeMax_(cfg.getParameter("edgeMax")), + brief_(cfg.getParameter("brief")) {} void makeInput(edm::Event const& iEvent, TritonInputMap& iInput) const { //get event-based seed for RNG unsigned int runNum_uint = static_cast(iEvent.id().run()); @@ -32,15 +33,13 @@ class TritonGraphHelper { //set shapes auto& input1 = iInput.at("x__0"); input1.setShape(0, nnodes); - auto data1 = std::make_shared>(1); + auto data1 = input1.allocate(); auto& vdata1 = (*data1)[0]; - vdata1.reserve(input1.sizeShape()); auto& input2 = iInput.at("edgeindex__1"); input2.setShape(1, nedges); - auto data2 = std::make_shared>(1); + auto data2 = input2.allocate(); auto& vdata2 = (*data2)[0]; - vdata2.reserve(input2.sizeShape()); //fill std::normal_distribution randx(-10, 4); @@ -62,27 +61,32 @@ class TritonGraphHelper { const auto& output1 = iOutput.begin()->second; // convert from server format const auto& tmp = output1.fromServer(); - std::stringstream msg; - for (int i = 0; i < output1.shape()[0]; ++i) { - msg << 
"output " << i << ": "; - for (int j = 0; j < output1.shape()[1]; ++j) { - msg << tmp[0][output1.shape()[1] * i + j] << " "; + if (brief_) + edm::LogInfo(debugName) << "output shape: " << output1.shape()[0] << ", " << output1.shape()[1]; + else { + edm::LogInfo msg(debugName); + for (int i = 0; i < output1.shape()[0]; ++i) { + msg << "output " << i << ": "; + for (int j = 0; j < output1.shape()[1]; ++j) { + msg << tmp[0][output1.shape()[1] * i + j] << " "; + } + msg << "\n"; } - msg << "\n"; } - edm::LogInfo(debugName) << msg.str(); } static void fillPSetDescription(edm::ParameterSetDescription& desc) { desc.add("nodeMin", 100); desc.add("nodeMax", 4000); desc.add("edgeMin", 8000); desc.add("edgeMax", 15000); + desc.add("brief", false); } private: //members unsigned nodeMin_, nodeMax_; unsigned edgeMin_, edgeMax_; + bool brief_; }; class TritonGraphProducer : public TritonEDProducer<> { diff --git a/HeterogeneousCore/SonicTriton/test/TritonImageProducer.cc b/HeterogeneousCore/SonicTriton/test/TritonImageProducer.cc index 4c602c659ea75..87fd2e9fad9df 100644 --- a/HeterogeneousCore/SonicTriton/test/TritonImageProducer.cc +++ b/HeterogeneousCore/SonicTriton/test/TritonImageProducer.cc @@ -10,12 +10,14 @@ #include #include #include +#include +#include class TritonImageProducer : public TritonEDProducer<> { public: explicit TritonImageProducer(edm::ParameterSet const& cfg) : TritonEDProducer<>(cfg, "TritonImageProducer"), - batchSize_(cfg.getParameter("batchSize")), + batchSize_(cfg.getParameter("batchSize")), topN_(cfg.getParameter("topN")) { //load score list std::string imageListFile(cfg.getParameter("imageList").fullPath()); @@ -30,14 +32,26 @@ class TritonImageProducer : public TritonEDProducer<> { } } void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, Input& iInput) override { - client_->setBatchSize(batchSize_); + int actualBatchSize = batchSize_; + //negative batch = generate random batch size from 1 to abs(batch) + if (batchSize_ < 0) { + //get event-based seed for RNG + unsigned int runNum_uint = static_cast(iEvent.id().run()); + unsigned int lumiNum_uint = static_cast(iEvent.id().luminosityBlock()); + unsigned int evNum_uint = static_cast(iEvent.id().event()); + std::uint32_t seed = (lumiNum_uint << 10) + (runNum_uint << 20) + evNum_uint; + std::mt19937 rng(seed); + std::uniform_int_distribution randint(1, std::abs(batchSize_)); + actualBatchSize = randint(rng); + } + + client_->setBatchSize(actualBatchSize); // create an npix x npix x ncol image w/ arbitrary color value // model only has one input, so just pick begin() auto& input1 = iInput.begin()->second; - auto data1 = std::make_shared>(); - data1->reserve(batchSize_); - for (unsigned i = 0; i < batchSize_; ++i) { - data1->emplace_back(input1.sizeDims(), 0.5f); + auto data1 = input1.allocate(); + for (auto& vdata1 : *data1) { + vdata1.assign(input1.sizeDims(), 0.5f); } // convert to server format input1.toServer(data1); @@ -51,7 +65,7 @@ class TritonImageProducer : public TritonEDProducer<> { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; TritonClient::fillPSetDescription(desc); - desc.add("batchSize", 1); + desc.add("batchSize", 1); desc.add("topN", 5); desc.add("imageList"); //to ensure distinct cfi names @@ -82,7 +96,7 @@ class TritonImageProducer : public TritonEDProducer<> { } } - unsigned batchSize_; + int batchSize_; unsigned topN_; std::vector imageList_; }; diff --git a/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py 
b/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py index b9683cdb9439d..1d669d72aca73 100644 --- a/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py +++ b/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py @@ -14,7 +14,10 @@ options.register("modelName","resnet50_netdef", VarParsing.multiplicity.singleton, VarParsing.varType.string) options.register("mode","Async", VarParsing.multiplicity.singleton, VarParsing.varType.string) options.register("verbose", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool) +options.register("brief", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool) options.register("unittest", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool) +options.register("testother", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool) +options.register("shm", True, VarParsing.multiplicity.singleton, VarParsing.varType.bool) options.register("device","auto", VarParsing.multiplicity.singleton, VarParsing.varType.string) options.register("docker", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool) options.register("tries", 0, VarParsing.multiplicity.singleton, VarParsing.varType.int) @@ -89,12 +92,13 @@ modelConfigPath = cms.FileInPath("HeterogeneousCore/SonicTriton/data/models/{}/config.pbtxt".format(models[module])), verbose = cms.untracked.bool(options.verbose), allowedTries = cms.untracked.uint32(options.tries), + useSharedMemory = cms.untracked.bool(options.shm), ) ) ) processModule = getattr(process, module) if module=="TritonImageProducer": - processModule.batchSize = cms.uint32(1) + processModule.batchSize = cms.int32(1) processModule.topN = cms.uint32(5) processModule.imageList = cms.FileInPath("HeterogeneousCore/SonicTriton/data/models/resnet50_netdef/resnet50_labels.txt") elif "TritonGraph" in module: @@ -109,8 +113,19 @@ processModule.nodeMax = cms.uint32(4000) processModule.edgeMin = cms.uint32(8000) processModule.edgeMax = cms.uint32(15000) + processModule.brief = cms.bool(options.brief) process.p += processModule keepMsgs.extend([module,module+':TritonClient']) + if options.testother: + # clone modules to test both gRPC and shared memory + _module2 = module+"GRPC" if processModule.Client.useSharedMemory else "SHM" + setattr(process, _module2, + processModule.clone( + Client = dict(useSharedMemory = not processModule.Client.useSharedMemory) + ) + ) + processModule2 = getattr(process, _module2) + process.p += processModule2 process.load('FWCore/MessageService/MessageLogger_cfi') process.MessageLogger.cerr.FwkReport.reportEvery = 500 diff --git a/HeterogeneousCore/SonicTriton/test/unittest.sh b/HeterogeneousCore/SonicTriton/test/unittest.sh index a428a98a723d8..7ae19df174da1 100755 --- a/HeterogeneousCore/SonicTriton/test/unittest.sh +++ b/HeterogeneousCore/SonicTriton/test/unittest.sh @@ -1,18 +1,28 @@ #!/bin/bash LOCALTOP=$1 +DEVICE=$2 # the test is not possible if: -# 1. avx instructions not supported (needed for singularity on CPU) +# 1. GPU not available (only if GPU test requested) / avx instructions not supported (needed for singularity on CPU) # 2. singularity not found or not usable # 3. 
inside singularity container w/o unprivileged user namespace enabled (needed for singularity-in-singularity) # so just return true in those cases -if grep -q avx /proc/cpuinfo; then - echo "has avx" +if [ "$DEVICE" = "GPU" ]; then + if nvidia-smi -L; then + echo "has GPU" + else + echo "missing GPU" + exit 0 + fi else - echo "missing avx" - exit 0 + if grep -q avx /proc/cpuinfo; then + echo "has avx" + else + echo "missing avx" + exit 0 + fi fi if type singularity >& /dev/null; then @@ -32,7 +42,7 @@ if [ -n "$SINGULARITY_CONTAINER" ]; then fi tmpFile=$(mktemp -p ${LOCALTOP} SonicTritonTestXXXXXXXX.log) -cmsRun ${LOCALTOP}/src/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py modules=TritonGraphProducer,TritonGraphFilter,TritonGraphAnalyzer maxEvents=1 unittest=1 verbose=1 >& $tmpFile +cmsRun ${LOCALTOP}/src/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py modules=TritonGraphProducer,TritonGraphFilter,TritonGraphAnalyzer maxEvents=2 unittest=1 verbose=1 device=${DEVICE} testother=1 >& $tmpFile CMSEXIT=$? cat $tmpFile
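For reference, the `acquire()`/`produce()` flow summarized in the README section above looks roughly like the sketch below. This is an illustrative example rather than part of the changeset: it assumes the `TritonEDProducer<>` base class and header path, the single-input/single-output pattern and the `"x__0"` input name used by the TritonGraph test modules in this PR, and the `allocate<float>()`/`fromServer<float>()` template calls implied by the interfaces added here; error handling and `fillDescriptions()` are omitted for brevity.

```cpp
#include "HeterogeneousCore/SonicTriton/interface/TritonEDProducer.h"
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/Framework/interface/MakerMacros.h"

class TritonSketchProducer : public TritonEDProducer<> {
public:
  explicit TritonSketchProducer(edm::ParameterSet const& cfg) : TritonEDProducer<>(cfg, "TritonSketchProducer") {}

  void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, Input& iInput) override {
    //set the batch size before allocating (one entry here)
    client_->setBatchSize(1);

    //access the input object and set its variable dimension (e.g. 10 nodes)
    auto& input1 = iInput.at("x__0");
    input1.setShape(0, 10);

    //allocate one inner vector per batch entry, sized to the concrete shape, and fill it
    auto data1 = input1.allocate<float>();
    auto& vdata1 = (*data1)[0];
    vdata1.assign(input1.sizeShape(), 0.5f);

    //convert to server format (shared memory is used automatically for local servers, if enabled)
    input1.toServer(data1);
  }

  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, Output const& iOutput) override {
    //access the (single) output object and convert from server format
    const auto& output1 = iOutput.begin()->second;
    const auto& tmp = output1.fromServer<float>();
    edm::LogInfo("TritonSketchProducer") << "first output value: " << tmp[0][0];
  }
};

DEFINE_FWK_MODULE(TritonSketchProducer);
```

As in `tritonTest_cfg.py`, shared memory can be toggled per module via the client's `useSharedMemory` parameter without any change to producer code like this.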