diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h index 71959994e0044..5546e51ae6400 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h @@ -5,6 +5,8 @@ #include #include #include +#include +#include // CUDA headers #include @@ -21,19 +23,22 @@ namespace cms { const char* cmd, const char* error, const char* message, - const char* description = nullptr) { + std::string_view description = std::string_view()) { std::ostringstream out; out << "\n"; out << file << ", line " << line << ":\n"; out << "cudaCheck(" << cmd << ");\n"; out << error << ": " << message << "\n"; - if (description) + if (!description.empty()) out << description << "\n"; throw std::runtime_error(out.str()); } - inline bool cudaCheck_( - const char* file, int line, const char* cmd, CUresult result, const char* description = nullptr) { + inline bool cudaCheck_(const char* file, + int line, + const char* cmd, + CUresult result, + std::string_view description = std::string_view()) { if (LIKELY(result == CUDA_SUCCESS)) return true; @@ -45,8 +50,11 @@ namespace cms { return false; } - inline bool cudaCheck_( - const char* file, int line, const char* cmd, cudaError_t result, const char* description = nullptr) { + inline bool cudaCheck_(const char* file, + int line, + const char* cmd, + cudaError_t result, + std::string_view description = std::string_view()) { if (LIKELY(result == cudaSuccess)) return true; @@ -55,7 +63,6 @@ namespace cms { abortOnCudaError(file, line, cmd, error, message, description); return false; } - } // namespace cuda } // namespace cms diff --git a/HeterogeneousCore/SonicCore/src/SonicClientBase.cc b/HeterogeneousCore/SonicCore/src/SonicClientBase.cc index bc8a551453334..42b64fb7e100a 100644 --- a/HeterogeneousCore/SonicCore/src/SonicClientBase.cc +++ b/HeterogeneousCore/SonicCore/src/SonicClientBase.cc @@ -73,6 +73,10 @@ void SonicClientBase::finish(bool success, std::exception_ptr eptr) { holder_.reset(); } else if (eptr) std::rethrow_exception(eptr); + + //reset client data now (usually done at end of produce()) + if (eptr) + reset(); } void SonicClientBase::fillBasePSetDescription(edm::ParameterSetDescription& desc, bool allowRetry) { diff --git a/HeterogeneousCore/SonicTriton/BuildFile.xml b/HeterogeneousCore/SonicTriton/BuildFile.xml index 8efadafbe2869..16052f4119daa 100644 --- a/HeterogeneousCore/SonicTriton/BuildFile.xml +++ b/HeterogeneousCore/SonicTriton/BuildFile.xml @@ -4,8 +4,12 @@ + + + + diff --git a/HeterogeneousCore/SonicTriton/README.md b/HeterogeneousCore/SonicTriton/README.md index 6dbbb635caaab..f4f6af7f14a76 100644 --- a/HeterogeneousCore/SonicTriton/README.md +++ b/HeterogeneousCore/SonicTriton/README.md @@ -30,6 +30,12 @@ The model information from the server can be printed by enabling `verbose` outpu * `preferredServer`: name of preferred server, for testing (see [Services](#services) below) * `timeout`: maximum allowed time for a request * `outputs`: optional, specify which output(s) the server should send +* `verbose`: enable verbose printouts (default: false) +* `useSharedMemory`: enable use of shared memory (see [below](#shared-memory)) with local servers (default: true) + +The batch size should be set using the client accessor, in order to ensure a consistent value across all inputs: +* `setBatchSize()`: set a new batch size + * some models may not support batching Useful `TritonData` accessors include: * `variableDims()`: return 
true if any variable dimensions
@@ -39,8 +45,6 @@ Useful `TritonData` accessors include:
 * `byteSize()`: return number of bytes for data type
 * `dname()`: return name of data type
 * `batchSize()`: return current batch size
-* `setBatchSize()`: set a new batch size
-  * some models may not support batching
 
 To update the `TritonData` shape in the variable-dimension case:
 * `setShape(const std::vector<int64_t>& newShape)`: update all (variable) dimensions with values provided in `newShape`
@@ -49,9 +53,28 @@ To update the `TritonData` shape in the variable-dimension case:
 
 There are specific local input and output containers that should be used in producers.
 Here, `T` is a primitive type, and the two aliases listed below are passed to `TritonInputData::toServer()`
 and returned by `TritonOutputData::fromServer()`, respectively:
-* `TritonInput<T> = std::vector<std::vector<T>>`
+* `TritonInputContainer<T> = std::shared_ptr<TritonInput<T>> = std::shared_ptr<std::vector<std::vector<T>>>`
 * `TritonOutput<T> = std::vector<edm::Span<const T*>>`
+
+The `TritonInputContainer` object should be created using the helper function described below.
+It expects one vector per batch entry (i.e. the size of the outer vector is the batch size).
+Therefore, it is best to call `TritonClient::setBatchSize()`, if necessary, before calling the helper.
+It will also reserve the expected size of the input in each inner vector (by default),
+if the concrete shape is available (i.e. `setShape()` was already called, if the input has variable dimensions).
+* `allocate<T>()`: return a `TritonInputContainer` properly allocated for the batch and input sizes
+
+### Shared memory
+
+If the local fallback server (see [Services](#services) below) is in use,
+input and output data can be transferred via shared memory rather than gRPC.
+Both CPU and GPU (CUDA) shared memory are supported.
+This is more efficient for some algorithms;
+if shared memory is not more efficient for an algorithm, it can be disabled in the Python configuration for the client.
+
+For outputs, shared memory can only be used if the batch size and concrete shape are known in advance,
+because the shared memory region for the output must be registered before the inference call is made.
+As with the inputs, this is handled automatically, and the use of shared memory can be disabled if desired.
+
 ## Modules
 
 SONIC Triton supports producers, filters, and analyzers.
@@ -71,7 +94,7 @@ If an `edm::GlobalCache` of type `T` is needed, there are two changes:
 In a SONIC Triton producer, the basic flow should follow this pattern:
 1. `acquire()`:
     a. access input object(s) from `TritonInputMap`
-    b. allocate input data using `std::make_shared<TritonInput<T>>()`
+    b. allocate input data using `allocate<T>()`
     c. fill input data
     d. set input shape(s) (optional, only if any variable dimensions)
     e. 
convert using `toServer()` function of input object(s) diff --git a/HeterogeneousCore/SonicTriton/interface/TritonClient.h b/HeterogeneousCore/SonicTriton/interface/TritonClient.h index ccda4a3c6ec6a..61f27bddd808c 100644 --- a/HeterogeneousCore/SonicTriton/interface/TritonClient.h +++ b/HeterogeneousCore/SonicTriton/interface/TritonClient.h @@ -5,6 +5,7 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "HeterogeneousCore/SonicCore/interface/SonicClient.h" #include "HeterogeneousCore/SonicTriton/interface/TritonData.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonService.h" #include #include @@ -31,20 +32,28 @@ class TritonClient : public SonicClient { //constructor TritonClient(const edm::ParameterSet& params, const std::string& debugName); + //destructor + ~TritonClient() override; + //accessors unsigned batchSize() const { return batchSize_; } bool verbose() const { return verbose_; } + bool useSharedMemory() const { return useSharedMemory_; } + void setUseSharedMemory(bool useShm) { useSharedMemory_ = useShm; } bool setBatchSize(unsigned bsize); void reset() override; + bool noBatch() const { return noBatch_; } + TritonServerType serverType() const { return serverType_; } //for fillDescriptions static void fillPSetDescription(edm::ParameterSetDescription& iDesc); protected: - //helper - bool getResults(std::shared_ptr results); - + //helpers + void getResults(std::shared_ptr results); void evaluate() override; + template + bool handle_exception(F&& call); void reportServerSideStats(const ServerSideStats& stats) const; ServerSideStats summarizeServerStats(const inference::ModelStatistics& start_status, @@ -57,6 +66,8 @@ class TritonClient : public SonicClient { unsigned batchSize_; bool noBatch_; bool verbose_; + bool useSharedMemory_; + TritonServerType serverType_; //IO pointers for triton std::vector inputsTriton_; @@ -65,6 +76,13 @@ class TritonClient : public SonicClient { std::unique_ptr client_; //stores timeout, model name and version nvidia::inferenceserver::client::InferOptions options_; + +private: + friend TritonInputData; + friend TritonOutputData; + + //private accessors only used by data + auto client() { return client_.get(); } }; #endif diff --git a/HeterogeneousCore/SonicTriton/interface/TritonData.h b/HeterogeneousCore/SonicTriton/interface/TritonData.h index 50808de4a1216..2f5f6d8760bc6 100644 --- a/HeterogeneousCore/SonicTriton/interface/TritonData.h +++ b/HeterogeneousCore/SonicTriton/interface/TritonData.h @@ -10,13 +10,23 @@ #include #include #include -#include +#include #include "grpc_client.h" #include "grpc_service.pb.h" //forward declaration class TritonClient; +template +class TritonMemResource; +template +class TritonHeapResource; +template +class TritonCpuShmResource; +#ifdef TRITON_ENABLE_GPU +template +class TritonGpuShmResource; +#endif //aliases for local input and output types template @@ -24,6 +34,10 @@ using TritonInput = std::vector>; template using TritonOutput = std::vector>; +//other useful typdefs +template +using TritonInputContainer = std::shared_ptr>; + //store all the info needed for triton input and output template class TritonData { @@ -34,15 +48,18 @@ class TritonData { using ShapeView = edm::Span; //constructor - TritonData(const std::string& name, const TensorMetadata& model_info, bool noBatch); + TritonData(const std::string& name, const TensorMetadata& model_info, TritonClient* client, const std::string& pid); //some members can be modified - bool setShape(const ShapeType& newShape) { return 
setShape(newShape, true); } - bool setShape(unsigned loc, int64_t val) { return setShape(loc, val, true); } + void setShape(const ShapeType& newShape); + void setShape(unsigned loc, int64_t val); //io accessors template - void toServer(std::shared_ptr> ptr); + TritonInputContainer
allocate(bool reserve = true); + template + void toServer(TritonInputContainer
ptr); + void prepare(); template TritonOutput
fromServer() const; @@ -60,14 +77,23 @@ class TritonData { private: friend class TritonClient; + friend class TritonMemResource; + friend class TritonHeapResource; + friend class TritonCpuShmResource; +#ifdef TRITON_ENABLE_GPU + friend class TritonGpuShmResource; +#endif - //private accessors only used by client - bool setShape(const ShapeType& newShape, bool canThrow); - bool setShape(unsigned loc, int64_t val, bool canThrow); + //private accessors only used internally or by client + unsigned fullLoc(unsigned loc) const { return loc + (noBatch_ ? 0 : 1); } void setBatchSize(unsigned bsize); void reset(); void setResult(std::shared_ptr result) { result_ = result; } IO* data() { return data_.get(); } + void updateMem(size_t size); + void computeSizes(); + void resetSizes(); + nvidia::inferenceserver::client::InferenceServerGrpcClient* client(); //helpers bool anyNeg(const ShapeView& vec) const { @@ -76,11 +102,20 @@ class TritonData { int64_t dimProduct(const ShapeView& vec) const { return std::accumulate(vec.begin(), vec.end(), 1, std::multiplies()); } - void createObject(IO** ioptr) const; + void createObject(IO** ioptr); + //generates a unique id number for each instance of the class + unsigned uid() const { + static std::atomic uid{0}; + return ++uid; + } + std::string xput() const; //members std::string name_; std::shared_ptr data_; + TritonClient* client_; + bool useShm_; + std::string shmName_; const ShapeType dims_; bool noBatch_; unsigned batchSize_; @@ -91,7 +126,11 @@ class TritonData { std::string dname_; inference::DataType dtype_; int64_t byteSize_; - std::any holder_; + size_t sizeShape_; + size_t byteSizePerBatch_; + size_t totalByteSize_; + std::shared_ptr holder_; + std::shared_ptr> memResource_; std::shared_ptr result_; }; @@ -102,9 +141,18 @@ using TritonOutputMap = std::unordered_map; //avoid "explicit specialization after instantiation" error template <> +std::string TritonInputData::xput() const; +template <> +std::string TritonOutputData::xput() const; +template <> +template +TritonInputContainer
TritonInputData::allocate(bool reserve); +template <> template void TritonInputData::toServer(std::shared_ptr> ptr); template <> +void TritonOutputData::prepare(); +template <> template TritonOutput
TritonOutputData::fromServer() const; template <> @@ -112,9 +160,9 @@ void TritonInputData::reset(); template <> void TritonOutputData::reset(); template <> -void TritonInputData::createObject(nvidia::inferenceserver::client::InferInput** ioptr) const; +void TritonInputData::createObject(nvidia::inferenceserver::client::InferInput** ioptr); template <> -void TritonOutputData::createObject(nvidia::inferenceserver::client::InferRequestedOutput** ioptr) const; +void TritonOutputData::createObject(nvidia::inferenceserver::client::InferRequestedOutput** ioptr); //explicit template instantiation declarations extern template class TritonData; diff --git a/HeterogeneousCore/SonicTriton/interface/TritonException.h b/HeterogeneousCore/SonicTriton/interface/TritonException.h new file mode 100644 index 0000000000000..418a98fc1bd8e --- /dev/null +++ b/HeterogeneousCore/SonicTriton/interface/TritonException.h @@ -0,0 +1,14 @@ +#ifndef HeterogeneousCore_SonicTriton_TritonException +#define HeterogeneousCore_SonicTriton_TritonException + +#include "FWCore/Utilities/interface/Exception.h" + +#include + +class TritonException : public cms::Exception { +public: + explicit TritonException(std::string const& aCategory); + void convertToWarning() const; +}; + +#endif diff --git a/HeterogeneousCore/SonicTriton/interface/TritonMemResource.h b/HeterogeneousCore/SonicTriton/interface/TritonMemResource.h new file mode 100644 index 0000000000000..80c810752bfb6 --- /dev/null +++ b/HeterogeneousCore/SonicTriton/interface/TritonMemResource.h @@ -0,0 +1,99 @@ +#ifndef HeterogeneousCore_SonicTriton_TritonMemResource +#define HeterogeneousCore_SonicTriton_TritonMemResource + +#include +#include + +#include "grpc_client.h" + +//forward declaration +template +class TritonData; + +//base class for memory operations +template +class TritonMemResource { +public: + TritonMemResource(TritonData* data, const std::string& name, size_t size); + virtual ~TritonMemResource() {} + uint8_t* addr() { return addr_; } + size_t size() const { return size_; } + virtual void close() {} + //used for input + virtual void copyInput(const void* values, size_t offset) {} + //used for output + virtual const uint8_t* copyOutput() { return nullptr; } + virtual void set(); + +protected: + //member variables + TritonData* data_; + std::string name_; + size_t size_; + uint8_t* addr_; + bool closed_; +}; + +template +class TritonHeapResource : public TritonMemResource { +public: + TritonHeapResource(TritonData* data, const std::string& name, size_t size); + ~TritonHeapResource() override {} + void copyInput(const void* values, size_t offset) override {} + const uint8_t* copyOutput() override { return nullptr; } + void set() override {} +}; + +template +class TritonCpuShmResource : public TritonMemResource { +public: + TritonCpuShmResource(TritonData* data, const std::string& name, size_t size); + ~TritonCpuShmResource() override; + void close() override; + void copyInput(const void* values, size_t offset) override {} + const uint8_t* copyOutput() override { return nullptr; } +}; + +using TritonInputHeapResource = TritonHeapResource; +using TritonInputCpuShmResource = TritonCpuShmResource; +using TritonOutputHeapResource = TritonHeapResource; +using TritonOutputCpuShmResource = TritonCpuShmResource; + +//avoid "explicit specialization after instantiation" error +template <> +void TritonInputHeapResource::copyInput(const void* values, size_t offset); +template <> +void TritonInputCpuShmResource::copyInput(const void* values, size_t offset); +template <> 
+const uint8_t* TritonOutputHeapResource::copyOutput(); +template <> +const uint8_t* TritonOutputCpuShmResource::copyOutput(); + +#ifdef TRITON_ENABLE_GPU +#include "cuda_runtime_api.h" + +template +class TritonGpuShmResource : public TritonMemResource { +public: + TritonGpuShmResource(TritonData* data, const std::string& name, size_t size); + ~TritonGpuShmResource() override; + void close() override; + void copyInput(const void* values, size_t offset) override {} + const uint8_t* copyOutput() override { return nullptr; } + +protected: + int deviceId_; + std::shared_ptr handle_; +}; + +using TritonInputGpuShmResource = TritonGpuShmResource; +using TritonOutputGpuShmResource = TritonGpuShmResource; + +//avoid "explicit specialization after instantiation" error +template <> +void TritonInputGpuShmResource::copyInput(const void* values, size_t offset); +template <> +const uint8_t* TritonOutputGpuShmResource::copyOutput(); +#endif + +#endif diff --git a/HeterogeneousCore/SonicTriton/interface/TritonService.h b/HeterogeneousCore/SonicTriton/interface/TritonService.h index 24232b6894ae1..9adc8d634c7d2 100644 --- a/HeterogeneousCore/SonicTriton/interface/TritonService.h +++ b/HeterogeneousCore/SonicTriton/interface/TritonService.h @@ -19,6 +19,8 @@ namespace edm { class ModuleDescription; } // namespace edm +enum class TritonServerType { Remote = 0, LocalCPU = 1, LocalGPU = 2 }; + class TritonService { public: //classes and defs @@ -79,7 +81,9 @@ class TritonService { //accessors void addModel(const std::string& modelName, const std::string& path); - std::pair serverAddress(const std::string& model, const std::string& preferred = "") const; + std::pair serverAddress(const std::string& model, + const std::string& preferred = "") const; + const std::string& pid() const { return pid_; } static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -94,6 +98,7 @@ class TritonService { unsigned currentModuleId_; bool allowAddModel_; bool startedFallback_; + std::string pid_; std::unordered_map unservedModels_; //this represents a many:many:many map std::unordered_map servers_; diff --git a/HeterogeneousCore/SonicTriton/interface/triton_utils.h b/HeterogeneousCore/SonicTriton/interface/triton_utils.h index 1be6bea4e18eb..05f0a74dec286 100644 --- a/HeterogeneousCore/SonicTriton/interface/triton_utils.h +++ b/HeterogeneousCore/SonicTriton/interface/triton_utils.h @@ -19,10 +19,6 @@ namespace triton_utils { //helper to turn triton error into exception void throwIfError(const Error& err, std::string_view msg); - - //helper to turn triton error into warning - bool warnIfError(const Error& err, std::string_view msg); - } // namespace triton_utils extern template std::string triton_utils::printColl(const edm::Span::const_iterator>& coll, diff --git a/HeterogeneousCore/SonicTriton/scripts/cmsTriton b/HeterogeneousCore/SonicTriton/scripts/cmsTriton index d9a82a1be9c49..54a5310fb8c90 100755 --- a/HeterogeneousCore/SonicTriton/scripts/cmsTriton +++ b/HeterogeneousCore/SonicTriton/scripts/cmsTriton @@ -4,7 +4,7 @@ USEDOCKER="" GPU="" VERBOSE="" -VERBOSE_ARGS="--log-verbose=1 --log-error=1 --log-info=1" +VERBOSE_ARGS="--log-verbose=1 --log-error=1 --log-warning=1 --log-info=1" WTIME=120 SERVER=triton_server_instance RETRIES=3 @@ -177,7 +177,7 @@ start_docker(){ done $DRYRUN $DOCKER run -d --name ${SERVER} \ - --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ + --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --ipc="host" --cap-add=IPC_OWNER \ -p${HTTPPORT}:${HTTPPORT} 
-p${GRPCPORT}:${GRPCPORT} -p${METRPORT}:${METRPORT} $EXTRA $MOUNTARGS \ ${IMAGE} tritonserver $PORTARGS $REPOARGS $VERBOSE } diff --git a/HeterogeneousCore/SonicTriton/src/TritonClient.cc b/HeterogeneousCore/SonicTriton/src/TritonClient.cc index f26a09392ba81..655af4ef3cdba 100644 --- a/HeterogeneousCore/SonicTriton/src/TritonClient.cc +++ b/HeterogeneousCore/SonicTriton/src/TritonClient.cc @@ -3,6 +3,7 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "FWCore/Utilities/interface/Exception.h" #include "HeterogeneousCore/SonicTriton/interface/TritonClient.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonException.h" #include "HeterogeneousCore/SonicTriton/interface/TritonService.h" #include "HeterogeneousCore/SonicTriton/interface/triton_utils.h" @@ -26,16 +27,18 @@ namespace nic = ni::client; TritonClient::TritonClient(const edm::ParameterSet& params, const std::string& debugName) : SonicClient(params, debugName, "TritonClient"), verbose_(params.getUntrackedParameter("verbose")), + useSharedMemory_(params.getUntrackedParameter("useSharedMemory")), options_(params.getParameter("modelName")) { //get appropriate server for this model edm::Service ts; - const auto& [url, isFallbackCPU] = + const auto& [url, serverType] = ts->serverAddress(options_.model_name_, params.getUntrackedParameter("preferredServer")); + serverType_ = serverType; if (verbose_) edm::LogInfo(fullDebugName_) << "Using server: " << url; //enforce sync mode for fallback CPU server to avoid contention //todo: could enforce async mode otherwise (unless mode was specified by user?) - if (isFallbackCPU) + if (serverType_ == TritonServerType::LocalCPU) setMode(SonicMode::Sync); //connect to the server @@ -95,8 +98,9 @@ TritonClient::TritonClient(const edm::ParameterSet& params, const std::string& d inputsTriton_.reserve(nicInputs.size()); for (const auto& nicInput : nicInputs) { const auto& iname = nicInput.name(); - auto [curr_itr, success] = input_.emplace( - std::piecewise_construct, std::forward_as_tuple(iname), std::forward_as_tuple(iname, nicInput, noBatch_)); + auto [curr_itr, success] = input_.emplace(std::piecewise_construct, + std::forward_as_tuple(iname), + std::forward_as_tuple(iname, nicInput, this, ts->pid())); auto& curr_input = curr_itr->second; inputsTriton_.push_back(curr_input.data()); if (verbose_) { @@ -118,8 +122,9 @@ TritonClient::TritonClient(const edm::ParameterSet& params, const std::string& d const auto& oname = nicOutput.name(); if (!s_outputs.empty() and s_outputs.find(oname) == s_outputs.end()) continue; - auto [curr_itr, success] = output_.emplace( - std::piecewise_construct, std::forward_as_tuple(oname), std::forward_as_tuple(oname, nicOutput, noBatch_)); + auto [curr_itr, success] = output_.emplace(std::piecewise_construct, + std::forward_as_tuple(oname), + std::forward_as_tuple(oname, nicOutput, this, ts->pid())); auto& curr_output = curr_itr->second; outputsTriton_.push_back(curr_output.data()); if (verbose_) { @@ -148,6 +153,15 @@ TritonClient::TritonClient(const edm::ParameterSet& params, const std::string& d } } +TritonClient::~TritonClient() { + //by default: members of this class destroyed before members of base class + //in shared memory case, TritonMemResource (member of TritonData) unregisters from client_ in its destructor + //but input/output objects are member of base class, so destroyed after client_ (member of this class) + //therefore, clear the maps here + input_.clear(); + output_.clear(); +} + bool TritonClient::setBatchSize(unsigned bsize) { if (bsize 
> maxBatchSize_) { edm::LogWarning(fullDebugName_) << "Requested batch size " << bsize << " exceeds server-specified max batch size " @@ -175,22 +189,39 @@ void TritonClient::reset() { } } -bool TritonClient::getResults(std::shared_ptr results) { +template +bool TritonClient::handle_exception(F&& call) { + //caught exceptions will be propagated to edm::WaitingTaskWithArenaHolder + CMS_SA_ALLOW try { + call(); + return true; + } + //TritonExceptions are intended/expected to be recoverable, i.e. retries should be allowed + catch (TritonException& e) { + e.convertToWarning(); + finish(false); + return false; + } + //other exceptions are not: execution should stop if they are encountered + catch (...) { + finish(false, std::current_exception()); + return false; + } +} + +void TritonClient::getResults(std::shared_ptr results) { for (auto& [oname, output] : output_) { //set shape here before output becomes const if (output.variableDims()) { std::vector tmp_shape; - bool status = triton_utils::warnIfError(results->Shape(oname, &tmp_shape), - "getResults(): unable to get output shape for " + oname); - if (!status) - return status; - output.setShape(tmp_shape, false); + triton_utils::throwIfError(results->Shape(oname, &tmp_shape), + "getResults(): unable to get output shape for " + oname); + output.setShape(tmp_shape); + output.computeSizes(); } //extend lifetime output.setResult(results); } - - return true; } //default case for sync and pseudo async @@ -201,76 +232,101 @@ void TritonClient::evaluate() { return; } + //set up shared memory for output + auto success = handle_exception([&]() { + for (auto& element : output_) { + element.second.prepare(); + } + }); + if (!success) + return; + // Get the status of the server prior to the request being made. - const auto& start_status = getServerSideStatus(); + inference::ModelStatistics start_status; + success = handle_exception([&]() { + if (verbose()) + start_status = getServerSideStatus(); + }); + if (!success) + return; if (mode_ == SonicMode::Async) { //non-blocking call auto t1 = std::chrono::high_resolution_clock::now(); - bool status = triton_utils::warnIfError( - client_->AsyncInfer( - [t1, start_status, this](nic::InferResult* results) { - //get results - std::shared_ptr results_ptr(results); - bool status = triton_utils::warnIfError(results_ptr->RequestStatus(), "evaluate(): unable to get result"); - if (!status) { - finish(false); - return; - } - auto t2 = std::chrono::high_resolution_clock::now(); - - if (!debugName_.empty()) - edm::LogInfo(fullDebugName_) - << "Remote time: " << std::chrono::duration_cast(t2 - t1).count(); - - const auto& end_status = getServerSideStatus(); - - if (verbose()) { - const auto& stats = summarizeServerStats(start_status, end_status); - reportServerSideStats(stats); - } - - //check result - status = getResults(results_ptr); - - //finish - finish(status); - }, - options_, - inputsTriton_, - outputsTriton_), - "evaluate(): unable to launch async run"); - - //if AsyncRun failed, finish() wasn't called - if (!status) - finish(false); + success = handle_exception([&]() { + triton_utils::throwIfError( + client_->AsyncInfer( + [t1, start_status, this](nic::InferResult* results) { + //get results + std::shared_ptr results_ptr(results); + auto success = handle_exception([&]() { + triton_utils::throwIfError(results_ptr->RequestStatus(), "evaluate(): unable to get result"); + }); + if (!success) + return; + auto t2 = std::chrono::high_resolution_clock::now(); + + if (!debugName_.empty()) + edm::LogInfo(fullDebugName_) + 
<< "Remote time: " << std::chrono::duration_cast(t2 - t1).count(); + + if (verbose()) { + inference::ModelStatistics end_status; + success = handle_exception([&]() { end_status = getServerSideStatus(); }); + if (!success) + return; + + const auto& stats = summarizeServerStats(start_status, end_status); + reportServerSideStats(stats); + } + + //check result + success = handle_exception([&]() { getResults(results_ptr); }); + if (!success) + return; + + //finish + finish(true); + }, + options_, + inputsTriton_, + outputsTriton_), + "evaluate(): unable to launch async run"); + }); + if (!success) + return; } else { //blocking call auto t1 = std::chrono::high_resolution_clock::now(); nic::InferResult* results; - bool status = triton_utils::warnIfError(client_->Infer(&results, options_, inputsTriton_, outputsTriton_), - "evaluate(): unable to run and/or get result"); - if (!status) { - finish(false); + success = handle_exception([&]() { + triton_utils::throwIfError(client_->Infer(&results, options_, inputsTriton_, outputsTriton_), + "evaluate(): unable to run and/or get result"); + }); + if (!success) return; - } auto t2 = std::chrono::high_resolution_clock::now(); if (!debugName_.empty()) edm::LogInfo(fullDebugName_) << "Remote time: " << std::chrono::duration_cast(t2 - t1).count(); - const auto& end_status = getServerSideStatus(); - if (verbose()) { + inference::ModelStatistics end_status; + success = handle_exception([&]() { end_status = getServerSideStatus(); }); + if (!success) + return; + const auto& stats = summarizeServerStats(start_status, end_status); reportServerSideStats(stats); } std::shared_ptr results_ptr(results); - status = getResults(results_ptr); + success = handle_exception([&]() { getResults(results_ptr); }); + if (!success) + return; - finish(status); + finish(true); } } @@ -335,11 +391,9 @@ TritonClient::ServerSideStats TritonClient::summarizeServerStats(const inference inference::ModelStatistics TritonClient::getServerSideStatus() const { if (verbose_) { inference::ModelStatisticsResponse resp; - bool success = triton_utils::warnIfError( - client_->ModelInferenceStatistics(&resp, options_.model_name_, options_.model_version_), - "getServerSideStatus(): unable to get model statistics"); - if (success) - return *(resp.model_stats().begin()); + triton_utils::throwIfError(client_->ModelInferenceStatistics(&resp, options_.model_name_, options_.model_version_), + "getServerSideStatus(): unable to get model statistics"); + return *(resp.model_stats().begin()); } return inference::ModelStatistics{}; } @@ -355,6 +409,7 @@ void TritonClient::fillPSetDescription(edm::ParameterSetDescription& iDesc) { descClient.addUntracked("preferredServer", ""); descClient.addUntracked("timeout"); descClient.addUntracked("verbose", false); + descClient.addUntracked("useSharedMemory", true); descClient.addUntracked>("outputs", {}); iDesc.add("Client", descClient); } diff --git a/HeterogeneousCore/SonicTriton/src/TritonData.cc b/HeterogeneousCore/SonicTriton/src/TritonData.cc index 258671be07691..5cd2767123da8 100644 --- a/HeterogeneousCore/SonicTriton/src/TritonData.cc +++ b/HeterogeneousCore/SonicTriton/src/TritonData.cc @@ -1,10 +1,11 @@ #include "HeterogeneousCore/SonicTriton/interface/TritonData.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonClient.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonMemResource.h" #include "HeterogeneousCore/SonicTriton/interface/triton_utils.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "model_config.pb.h" -#include 
#include namespace ni = nvidia::inferenceserver; @@ -22,10 +23,17 @@ namespace nvidia { //fullShape: if batching is enabled, first entry is batch size; values can be modified //shape: view into fullShape, excluding batch size entry template -TritonData::TritonData(const std::string& name, const TritonData::TensorMetadata& model_info, bool noBatch) +TritonData::TritonData(const std::string& name, + const TritonData::TensorMetadata& model_info, + TritonClient* client, + const std::string& pid) : name_(name), + client_(client), + useShm_(client_->useSharedMemory()), + //ensure unique name for shared memory region + shmName_(useShm_ ? pid + "_" + xput() + std::to_string(uid()) : ""), dims_(model_info.shape().begin(), model_info.shape().end()), - noBatch_(noBatch), + noBatch_(client_->noBatch()), batchSize_(0), fullShape_(dims_), shape_(fullShape_.begin() + (noBatch_ ? 0 : 1), fullShape_.end()), @@ -33,7 +41,8 @@ TritonData::TritonData(const std::string& name, const TritonData::Tensor productDims_(variableDims_ ? -1 : dimProduct(shape_)), dname_(model_info.datatype()), dtype_(ni::ProtocolStringToDataType(dname_)), - byteSize_(ni::GetDataTypeByteSize(dtype_)) { + byteSize_(ni::GetDataTypeByteSize(dtype_)), + totalByteSize_(0) { //create input or output object IO* iotmp; createObject(&iotmp); @@ -41,57 +50,56 @@ TritonData::TritonData(const std::string& name, const TritonData::Tensor } template <> -void TritonInputData::createObject(nic::InferInput** ioptr) const { +void TritonInputData::createObject(nic::InferInput** ioptr) { nic::InferInput::Create(ioptr, name_, fullShape_, dname_); } template <> -void TritonOutputData::createObject(nic::InferRequestedOutput** ioptr) const { +void TritonOutputData::createObject(nic::InferRequestedOutput** ioptr) { nic::InferRequestedOutput::Create(ioptr, name_); + //another specialization for output: can't use shared memory if output size is not known + useShm_ &= !variableDims_; +} + +template <> +std::string TritonInputData::xput() const { + return "input"; +} + +template <> +std::string TritonOutputData::xput() const { + return "output"; +} + +template +nic::InferenceServerGrpcClient* TritonData::client() { + return client_->client(); } //setters template -bool TritonData::setShape(const TritonData::ShapeType& newShape, bool canThrow) { - bool result = true; +void TritonData::setShape(const TritonData::ShapeType& newShape) { for (unsigned i = 0; i < newShape.size(); ++i) { - result &= setShape(i, newShape[i], canThrow); + setShape(i, newShape[i]); } - return result; } template -bool TritonData::setShape(unsigned loc, int64_t val, bool canThrow) { - std::stringstream msg; - unsigned full_loc = loc + (noBatch_ ? 
0 : 1); +void TritonData::setShape(unsigned loc, int64_t val) { + unsigned locFull = fullLoc(loc); //check boundary - if (full_loc >= fullShape_.size()) { - msg << name_ << " setShape(): dimension " << full_loc << " out of bounds (" << fullShape_.size() << ")"; - if (canThrow) - throw cms::Exception("TritonDataError") << msg.str(); - else { - edm::LogWarning("TritonDataWarning") << msg.str(); - return false; - } - } + if (locFull >= fullShape_.size()) + throw cms::Exception("TritonDataError") + << name_ << " setShape(): dimension " << locFull << " out of bounds (" << fullShape_.size() << ")"; - if (val != fullShape_[full_loc]) { - if (dims_[full_loc] == -1) { - fullShape_[full_loc] = val; - return true; - } else { - msg << name_ << " setShape(): attempt to change value of non-variable shape dimension " << loc; - if (canThrow) - throw cms::Exception("TritonDataError") << msg.str(); - else { - edm::LogWarning("TritonDataError") << msg.str(); - return false; - } - } + if (val != fullShape_[locFull]) { + if (dims_[locFull] == -1) + fullShape_[locFull] = val; + else + throw cms::Exception("TritonDataError") + << name_ << " setShape(): attempt to change value of non-variable shape dimension " << loc; } - - return true; } template @@ -101,15 +109,71 @@ void TritonData::setBatchSize(unsigned bsize) { fullShape_[0] = batchSize_; } +template +void TritonData::computeSizes() { + sizeShape_ = sizeShape(); + byteSizePerBatch_ = byteSize_ * sizeShape_; + totalByteSize_ = byteSizePerBatch_ * batchSize_; +} +template +void TritonData::resetSizes() { + sizeShape_ = 0; + byteSizePerBatch_ = 0; + totalByteSize_ = 0; +} + +//create a memory resource if none exists; +//otherwise, reuse the memory resource, resizing it if necessary +template +void TritonData::updateMem(size_t size) { + if (!memResource_ or size > memResource_->size()) { + if (useShm_ and client_->serverType() == TritonServerType::LocalCPU) { + //avoid unnecessarily throwing in destructor + if (memResource_) + memResource_->close(); + //need to destroy before constructing new instance because shared memory key will be reused + memResource_.reset(); + memResource_ = std::make_shared>(this, shmName_, size); + } +#ifdef TRITON_ENABLE_GPU + else if (useShm_ and client_->serverType() == TritonServerType::LocalGPU) { + //avoid unnecessarily throwing in destructor + if (memResource_) + memResource_->close(); + //need to destroy before constructing new instance because shared memory key will be reused + memResource_.reset(); + memResource_ = std::make_shared>(this, shmName_, size); + } +#endif + //for remote/heap, size increases don't matter + else if (!memResource_) + memResource_ = std::make_shared>(this, shmName_, size); + } +} + //io accessors template <> template -void TritonInputData::toServer(std::shared_ptr> ptr) { +TritonInputContainer
TritonInputData::allocate(bool reserve) { + //automatically creates a vector for each batch entry (if batch size known) + auto ptr = std::make_shared>(batchSize_); + if (reserve and !anyNeg(shape_)) { + computeSizes(); + for (auto& vec : *ptr) { + vec.reserve(sizeShape_); + } + } + return ptr; +} + +template <> +template +void TritonInputData::toServer(TritonInputContainer
ptr) { const auto& data_in = *ptr; //check batch size if (data_in.size() != batchSize_) { - throw cms::Exception("TritonDataError") << name_ << " input(): input vector has size " << data_in.size() + throw cms::Exception("TritonDataError") << name_ << " toServer(): input vector has size " << data_in.size() << " but specified batch size is " << batchSize_; } @@ -117,48 +181,48 @@ void TritonInputData::toServer(std::shared_ptr> ptr) { data_->SetShape(fullShape_); if (byteSize_ != sizeof(DT)) - throw cms::Exception("TritonDataError") << name_ << " input(): inconsistent byte size " << sizeof(DT) + throw cms::Exception("TritonDataError") << name_ << " toServer(): inconsistent byte size " << sizeof(DT) << " (should be " << byteSize_ << " for " << dname_ << ")"; - int64_t nInput = sizeShape(); + computeSizes(); + updateMem(totalByteSize_); for (unsigned i0 = 0; i0 < batchSize_; ++i0) { - const DT* arr = data_in[i0].data(); - triton_utils::throwIfError(data_->AppendRaw(reinterpret_cast(arr), nInput * byteSize_), - name_ + " input(): unable to set data for batch entry " + std::to_string(i0)); + memResource_->copyInput(data_in[i0].data(), i0 * byteSizePerBatch_); } + memResource_->set(); //keep input data in scope - holder_ = std::move(ptr); + holder_ = ptr; +} + +//sets up shared memory for outputs, if possible +template <> +void TritonOutputData::prepare() { + computeSizes(); + updateMem(totalByteSize_); + memResource_->set(); } template <> template TritonOutput
TritonOutputData::fromServer() const { if (!result_) { - throw cms::Exception("TritonDataError") << name_ << " output(): missing result"; + throw cms::Exception("TritonDataError") << name_ << " fromServer(): missing result"; } if (byteSize_ != sizeof(DT)) { - throw cms::Exception("TritonDataError") << name_ << " output(): inconsistent byte size " << sizeof(DT) + throw cms::Exception("TritonDataError") << name_ << " fromServer(): inconsistent byte size " << sizeof(DT) << " (should be " << byteSize_ << " for " << dname_ << ")"; } - uint64_t nOutput = sizeShape(); - TritonOutput
dataOut; - const uint8_t* r0; - size_t contentByteSize; - size_t expectedContentByteSize = nOutput * byteSize_ * batchSize_; - triton_utils::throwIfError(result_->RawData(name_, &r0, &contentByteSize), "output(): unable to get raw"); - if (contentByteSize != expectedContentByteSize) { - throw cms::Exception("TritonDataError") << name_ << " output(): unexpected content byte size " << contentByteSize - << " (expected " << expectedContentByteSize << ")"; - } - + const uint8_t* r0 = memResource_->copyOutput(); const DT* r1 = reinterpret_cast(r0); + + TritonOutput
dataOut; dataOut.reserve(batchSize_); for (unsigned i0 = 0; i0 < batchSize_; ++i0) { - auto offset = i0 * nOutput; - dataOut.emplace_back(r1 + offset, r1 + offset + nOutput); + auto offset = i0 * sizeShape_; + dataOut.emplace_back(r1 + offset, r1 + offset + sizeShape_); } return dataOut; @@ -166,20 +230,33 @@ TritonOutput
TritonOutputData::fromServer() const { template <> void TritonInputData::reset() { - data_->Reset(); holder_.reset(); + data_->Reset(); + //reset shape + if (variableDims_) { + for (unsigned i = 0; i < shape_.size(); ++i) { + unsigned locFull = fullLoc(i); + fullShape_[locFull] = dims_[locFull]; + } + } + resetSizes(); } template <> void TritonOutputData::reset() { result_.reset(); + holder_.reset(); + resetSizes(); } //explicit template instantiation declarations template class TritonData; template class TritonData; -template void TritonInputData::toServer(std::shared_ptr> data_in); -template void TritonInputData::toServer(std::shared_ptr> data_in); +template TritonInputContainer TritonInputData::allocate(bool reserve); +template TritonInputContainer TritonInputData::allocate(bool reserve); + +template void TritonInputData::toServer(TritonInputContainer data_in); +template void TritonInputData::toServer(TritonInputContainer data_in); template TritonOutput TritonOutputData::fromServer() const; diff --git a/HeterogeneousCore/SonicTriton/src/TritonException.cc b/HeterogeneousCore/SonicTriton/src/TritonException.cc new file mode 100644 index 0000000000000..ee160ffe957c2 --- /dev/null +++ b/HeterogeneousCore/SonicTriton/src/TritonException.cc @@ -0,0 +1,6 @@ +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonException.h" + +TritonException::TritonException(std::string const& aCategory) : cms::Exception(aCategory) {} + +void TritonException::convertToWarning() const { edm::LogWarning(category()) << explainSelf(); } diff --git a/HeterogeneousCore/SonicTriton/src/TritonMemResource.cc b/HeterogeneousCore/SonicTriton/src/TritonMemResource.cc new file mode 100644 index 0000000000000..7f1bff1f1db34 --- /dev/null +++ b/HeterogeneousCore/SonicTriton/src/TritonMemResource.cc @@ -0,0 +1,172 @@ +#include "HeterogeneousCore/SonicTriton/interface/TritonData.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonClient.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonMemResource.h" +#include "HeterogeneousCore/SonicTriton/interface/triton_utils.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include +#include +#include +#include + +namespace ni = nvidia::inferenceserver; +namespace nic = ni::client; + +template +TritonMemResource::TritonMemResource(TritonData* data, const std::string& name, size_t size) + : data_(data), name_(name), size_(size), addr_(nullptr), closed_(false) {} + +template +void TritonMemResource::set() { + triton_utils::throwIfError(data_->data_->SetSharedMemory(name_, data_->totalByteSize_, 0), + "unable to set shared memory (" + name_ + ")"); +} + +template +TritonHeapResource::TritonHeapResource(TritonData* data, const std::string& name, size_t size) + : TritonMemResource(data, name, size) {} + +template <> +void TritonInputHeapResource::copyInput(const void* values, size_t offset) { + triton_utils::throwIfError( + data_->data_->AppendRaw(reinterpret_cast(values), data_->byteSizePerBatch_), + data_->name_ + " toServer(): unable to set data for batch entry " + + std::to_string(offset / data_->byteSizePerBatch_)); +} + +template <> +const uint8_t* TritonOutputHeapResource::copyOutput() { + size_t contentByteSize; + const uint8_t* values; + triton_utils::throwIfError(data_->result_->RawData(data_->name_, &values, &contentByteSize), + data_->name_ + " fromServer(): unable to get raw"); + if (contentByteSize != data_->totalByteSize_) { + throw cms::Exception("TritonDataError") << 
data_->name_ << " fromServer(): unexpected content byte size " + << contentByteSize << " (expected " << data_->totalByteSize_ << ")"; + } + return values; +} + +//shared memory helpers based on: +// https://github.com/triton-inference-server/server/blob/v2.3.0/src/clients/c++/examples/shm_utils.cc (cpu) +// https://github.com/triton-inference-server/server/blob/v2.3.0/src/clients/c++/examples/simple_grpc_cudashm_client.cc (gpu) + +template +TritonCpuShmResource::TritonCpuShmResource(TritonData* data, const std::string& name, size_t size) + : TritonMemResource(data, name, size) { + //get shared memory region descriptor + int shm_fd = shm_open(this->name_.c_str(), O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + if (shm_fd == -1) + throw cms::Exception("TritonError") << "unable to get shared memory descriptor for key: " + this->name_; + + //extend shared memory object + int res = ftruncate(shm_fd, this->size_); + if (res == -1) + throw cms::Exception("TritonError") << "unable to initialize shared memory key " + this->name_ + + " to requested size: " + std::to_string(this->size_); + + //map to process address space + constexpr size_t offset(0); + this->addr_ = (uint8_t*)mmap(nullptr, this->size_, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, offset); + if (this->addr_ == MAP_FAILED) + throw cms::Exception("TritonError") << "unable to map to process address space for shared memory key: " + + this->name_; + + //close descriptor + if (::close(shm_fd) == -1) + throw cms::Exception("TritonError") << "unable to close descriptor for shared memory key: " + this->name_; + + triton_utils::throwIfError(this->data_->client()->RegisterSystemSharedMemory(this->name_, this->name_, this->size_), + "unable to register shared memory region: " + this->name_); +} + +template +TritonCpuShmResource::~TritonCpuShmResource() { + close(); +} + +template +void TritonCpuShmResource::close() { + if (this->closed_) + return; + + triton_utils::throwIfError(this->data_->client()->UnregisterSystemSharedMemory(this->name_), + "unable to unregister shared memory region: " + this->name_); + + //unmap + int tmp_fd = munmap(this->addr_, this->size_); + if (tmp_fd == -1) + throw cms::Exception("TritonError") << "unable to munmap for shared memory key: " << this->name_; + + //unlink + int shm_fd = shm_unlink(this->name_.c_str()); + if (shm_fd == -1) + throw cms::Exception("TritonError") << "unable to unlink for shared memory key: " << this->name_; + + this->closed_ = true; +} + +template <> +void TritonInputCpuShmResource::copyInput(const void* values, size_t offset) { + std::memcpy(addr_ + offset, values, data_->byteSizePerBatch_); +} + +template <> +const uint8_t* TritonOutputCpuShmResource::copyOutput() { + return addr_; +} + +template class TritonHeapResource; +template class TritonCpuShmResource; +template class TritonHeapResource; +template class TritonCpuShmResource; + +#ifdef TRITON_ENABLE_GPU +template +TritonGpuShmResource::TritonGpuShmResource(TritonData* data, const std::string& name, size_t size) + : TritonMemResource(data, name, size), deviceId_(0), handle_(std::make_shared()) { + //todo: get server device id somehow? 
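+  // the calls below follow the standard CUDA IPC shared-memory pattern
+  // (as in the simple_grpc_cudashm_client.cc example referenced above):
+  // select the device, allocate a device buffer of the requested size,
+  // export an IPC handle for it, and register that handle with the server
+  // so the same GPU allocation can be accessed by the Triton process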
+ cudaCheck(cudaSetDevice(deviceId_), "unable to set device ID to " + std::to_string(deviceId_)); + cudaCheck(cudaMalloc((void**)&this->addr_, this->size_), "unable to allocate GPU memory for key: " + this->name_); + cudaCheck(cudaIpcGetMemHandle(handle_.get(), this->addr_), "unable to get IPC handle for key: " + this->name_); + triton_utils::throwIfError( + this->data_->client()->RegisterCudaSharedMemory(this->name_, *handle_, deviceId_, this->size_), + "unable to register CUDA shared memory region: " + this->name_); +} + +template +TritonGpuShmResource::~TritonGpuShmResource() { + close(); +} + +template +void TritonGpuShmResource::close() { + if (this->closed_) + return; + triton_utils::throwIfError(this->data_->client()->UnregisterCudaSharedMemory(this->name_), + "unable to unregister CUDA shared memory region: " + this->name_); + cudaCheck(cudaFree(this->addr_), "unable to free GPU memory for key: " + this->name_); + this->closed_ = true; +} + +template <> +void TritonInputGpuShmResource::copyInput(const void* values, size_t offset) { + cudaCheck( + cudaMemcpy(addr_ + offset, values, data_->byteSizePerBatch_, cudaMemcpyHostToDevice), + data_->name_ + " toServer(): unable to memcpy " + std::to_string(data_->byteSizePerBatch_) + " bytes to GPU"); +} + +template <> +const uint8_t* TritonOutputGpuShmResource::copyOutput() { + //copy back from gpu, keep in scope + auto ptr = std::make_shared>(data_->totalByteSize_); + cudaCheck( + cudaMemcpy(ptr->data(), addr_, data_->totalByteSize_, cudaMemcpyDeviceToHost), + data_->name_ + " fromServer(): unable to memcpy " + std::to_string(data_->totalByteSize_) + " bytes from GPU"); + data_->holder_ = ptr; + return ptr->data(); +} + +template class TritonGpuShmResource; +template class TritonGpuShmResource; +#endif diff --git a/HeterogeneousCore/SonicTriton/src/TritonService.cc b/HeterogeneousCore/SonicTriton/src/TritonService.cc index 7ad47b4eded3f..9c3c0a31f100b 100644 --- a/HeterogeneousCore/SonicTriton/src/TritonService.cc +++ b/HeterogeneousCore/SonicTriton/src/TritonService.cc @@ -58,7 +58,8 @@ TritonService::TritonService(const edm::ParameterSet& pset, edm::ActivityRegistr fallbackOpts_(pset.getParameterSet("fallback")), currentModuleId_(0), allowAddModel_(false), - startedFallback_(false) { + startedFallback_(false), + pid_(std::to_string(::getpid())) { //module construction is assumed to be serial (correct at the time this code was written) areg.watchPreModuleConstruction(this, &TritonService::preModuleConstruction); areg.watchPostModuleConstruction(this, &TritonService::postModuleConstruction); @@ -158,8 +159,8 @@ void TritonService::preModuleDestruction(edm::ModuleDescription const& desc) { } //second return value is only true if fallback CPU server is being used -std::pair TritonService::serverAddress(const std::string& model, - const std::string& preferred) const { +std::pair TritonService::serverAddress(const std::string& model, + const std::string& preferred) const { auto mit = models_.find(model); if (mit == models_.end()) throw cms::Exception("MissingModel") << "There are no servers that provide model " << model; @@ -178,8 +179,16 @@ std::pair TritonService::serverAddress(const std::string& mod //todo: use some algorithm to select server rather than just picking arbitrarily const auto& serverInfo(servers_.find(serverName)->second); - bool isFallbackCPU = serverInfo.isFallback and !fallbackOpts_.useGPU; - return std::make_pair(serverInfo.url, isFallbackCPU); + auto serverType = TritonServerType::Remote; + if (serverInfo.isFallback) { + 
if (!fallbackOpts_.useGPU) + serverType = TritonServerType::LocalCPU; +#ifdef TRITON_ENABLE_GPU + else + serverType = TritonServerType::LocalGPU; +#endif + } + return std::make_pair(serverInfo.url, serverType); } void TritonService::preBeginJob(edm::PathsAndConsumesOfModulesBase const&, edm::ProcessContext const&) { @@ -208,7 +217,7 @@ void TritonService::preBeginJob(edm::PathsAndConsumesOfModulesBase const&, edm:: } //assemble server start command - std::string command("cmsTriton -P -1 -p " + std::to_string(::getpid())); + std::string command("cmsTriton -P -1 -p " + pid_); if (fallbackOpts_.debug) command += " -c"; if (fallbackOpts_.verbose) diff --git a/HeterogeneousCore/SonicTriton/src/triton_utils.cc b/HeterogeneousCore/SonicTriton/src/triton_utils.cc index eeb56a6b8895f..e4460feebef92 100644 --- a/HeterogeneousCore/SonicTriton/src/triton_utils.cc +++ b/HeterogeneousCore/SonicTriton/src/triton_utils.cc @@ -1,4 +1,5 @@ #include "HeterogeneousCore/SonicTriton/interface/triton_utils.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonException.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/Utilities/interface/Exception.h" @@ -19,15 +20,8 @@ namespace triton_utils { void throwIfError(const Error& err, std::string_view msg) { if (!err.IsOk()) - throw cms::Exception("TritonServerFailure") << msg << ": " << err; + throw TritonException("TritonFailure") << msg << (err.Message().empty() ? "" : ": " + err.Message()); } - - bool warnIfError(const Error& err, std::string_view msg) { - if (!err.IsOk()) - edm::LogWarning("TritonServerWarning") << msg << ": " << err; - return err.IsOk(); - } - } // namespace triton_utils template std::string triton_utils::printColl(const edm::Span::const_iterator>& coll, diff --git a/HeterogeneousCore/SonicTriton/test/BuildFile.xml b/HeterogeneousCore/SonicTriton/test/BuildFile.xml index 63ff99aeaa9ca..1a02afd61be06 100644 --- a/HeterogeneousCore/SonicTriton/test/BuildFile.xml +++ b/HeterogeneousCore/SonicTriton/test/BuildFile.xml @@ -1,5 +1,6 @@ - + + diff --git a/HeterogeneousCore/SonicTriton/test/TritonGraphModules.cc b/HeterogeneousCore/SonicTriton/test/TritonGraphModules.cc index 46ced5e330291..8e85a139cf6b9 100644 --- a/HeterogeneousCore/SonicTriton/test/TritonGraphModules.cc +++ b/HeterogeneousCore/SonicTriton/test/TritonGraphModules.cc @@ -15,7 +15,8 @@ class TritonGraphHelper { : nodeMin_(cfg.getParameter("nodeMin")), nodeMax_(cfg.getParameter("nodeMax")), edgeMin_(cfg.getParameter("edgeMin")), - edgeMax_(cfg.getParameter("edgeMax")) {} + edgeMax_(cfg.getParameter("edgeMax")), + brief_(cfg.getParameter("brief")) {} void makeInput(edm::Event const& iEvent, TritonInputMap& iInput) const { //get event-based seed for RNG unsigned int runNum_uint = static_cast(iEvent.id().run()); @@ -32,15 +33,13 @@ class TritonGraphHelper { //set shapes auto& input1 = iInput.at("x__0"); input1.setShape(0, nnodes); - auto data1 = std::make_shared>(1); + auto data1 = input1.allocate(); auto& vdata1 = (*data1)[0]; - vdata1.reserve(input1.sizeShape()); auto& input2 = iInput.at("edgeindex__1"); input2.setShape(1, nedges); - auto data2 = std::make_shared>(1); + auto data2 = input2.allocate(); auto& vdata2 = (*data2)[0]; - vdata2.reserve(input2.sizeShape()); //fill std::normal_distribution randx(-10, 4); @@ -62,27 +61,32 @@ class TritonGraphHelper { const auto& output1 = iOutput.begin()->second; // convert from server format const auto& tmp = output1.fromServer(); - std::stringstream msg; - for (int i = 0; i < output1.shape()[0]; ++i) { - msg << 
"output " << i << ": "; - for (int j = 0; j < output1.shape()[1]; ++j) { - msg << tmp[0][output1.shape()[1] * i + j] << " "; + if (brief_) + edm::LogInfo(debugName) << "output shape: " << output1.shape()[0] << ", " << output1.shape()[1]; + else { + edm::LogInfo msg(debugName); + for (int i = 0; i < output1.shape()[0]; ++i) { + msg << "output " << i << ": "; + for (int j = 0; j < output1.shape()[1]; ++j) { + msg << tmp[0][output1.shape()[1] * i + j] << " "; + } + msg << "\n"; } - msg << "\n"; } - edm::LogInfo(debugName) << msg.str(); } static void fillPSetDescription(edm::ParameterSetDescription& desc) { desc.add("nodeMin", 100); desc.add("nodeMax", 4000); desc.add("edgeMin", 8000); desc.add("edgeMax", 15000); + desc.add("brief", false); } private: //members unsigned nodeMin_, nodeMax_; unsigned edgeMin_, edgeMax_; + bool brief_; }; class TritonGraphProducer : public TritonEDProducer<> { diff --git a/HeterogeneousCore/SonicTriton/test/TritonImageProducer.cc b/HeterogeneousCore/SonicTriton/test/TritonImageProducer.cc index 4c602c659ea75..87fd2e9fad9df 100644 --- a/HeterogeneousCore/SonicTriton/test/TritonImageProducer.cc +++ b/HeterogeneousCore/SonicTriton/test/TritonImageProducer.cc @@ -10,12 +10,14 @@ #include #include #include +#include +#include class TritonImageProducer : public TritonEDProducer<> { public: explicit TritonImageProducer(edm::ParameterSet const& cfg) : TritonEDProducer<>(cfg, "TritonImageProducer"), - batchSize_(cfg.getParameter("batchSize")), + batchSize_(cfg.getParameter("batchSize")), topN_(cfg.getParameter("topN")) { //load score list std::string imageListFile(cfg.getParameter("imageList").fullPath()); @@ -30,14 +32,26 @@ class TritonImageProducer : public TritonEDProducer<> { } } void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, Input& iInput) override { - client_->setBatchSize(batchSize_); + int actualBatchSize = batchSize_; + //negative batch = generate random batch size from 1 to abs(batch) + if (batchSize_ < 0) { + //get event-based seed for RNG + unsigned int runNum_uint = static_cast(iEvent.id().run()); + unsigned int lumiNum_uint = static_cast(iEvent.id().luminosityBlock()); + unsigned int evNum_uint = static_cast(iEvent.id().event()); + std::uint32_t seed = (lumiNum_uint << 10) + (runNum_uint << 20) + evNum_uint; + std::mt19937 rng(seed); + std::uniform_int_distribution randint(1, std::abs(batchSize_)); + actualBatchSize = randint(rng); + } + + client_->setBatchSize(actualBatchSize); // create an npix x npix x ncol image w/ arbitrary color value // model only has one input, so just pick begin() auto& input1 = iInput.begin()->second; - auto data1 = std::make_shared>(); - data1->reserve(batchSize_); - for (unsigned i = 0; i < batchSize_; ++i) { - data1->emplace_back(input1.sizeDims(), 0.5f); + auto data1 = input1.allocate(); + for (auto& vdata1 : *data1) { + vdata1.assign(input1.sizeDims(), 0.5f); } // convert to server format input1.toServer(data1); @@ -51,7 +65,7 @@ class TritonImageProducer : public TritonEDProducer<> { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; TritonClient::fillPSetDescription(desc); - desc.add("batchSize", 1); + desc.add("batchSize", 1); desc.add("topN", 5); desc.add("imageList"); //to ensure distinct cfi names @@ -82,7 +96,7 @@ class TritonImageProducer : public TritonEDProducer<> { } } - unsigned batchSize_; + int batchSize_; unsigned topN_; std::vector imageList_; }; diff --git a/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py 
b/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py index b9683cdb9439d..1d669d72aca73 100644 --- a/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py +++ b/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py @@ -14,7 +14,10 @@ options.register("modelName","resnet50_netdef", VarParsing.multiplicity.singleton, VarParsing.varType.string) options.register("mode","Async", VarParsing.multiplicity.singleton, VarParsing.varType.string) options.register("verbose", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool) +options.register("brief", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool) options.register("unittest", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool) +options.register("testother", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool) +options.register("shm", True, VarParsing.multiplicity.singleton, VarParsing.varType.bool) options.register("device","auto", VarParsing.multiplicity.singleton, VarParsing.varType.string) options.register("docker", False, VarParsing.multiplicity.singleton, VarParsing.varType.bool) options.register("tries", 0, VarParsing.multiplicity.singleton, VarParsing.varType.int) @@ -89,12 +92,13 @@ modelConfigPath = cms.FileInPath("HeterogeneousCore/SonicTriton/data/models/{}/config.pbtxt".format(models[module])), verbose = cms.untracked.bool(options.verbose), allowedTries = cms.untracked.uint32(options.tries), + useSharedMemory = cms.untracked.bool(options.shm), ) ) ) processModule = getattr(process, module) if module=="TritonImageProducer": - processModule.batchSize = cms.uint32(1) + processModule.batchSize = cms.int32(1) processModule.topN = cms.uint32(5) processModule.imageList = cms.FileInPath("HeterogeneousCore/SonicTriton/data/models/resnet50_netdef/resnet50_labels.txt") elif "TritonGraph" in module: @@ -109,8 +113,19 @@ processModule.nodeMax = cms.uint32(4000) processModule.edgeMin = cms.uint32(8000) processModule.edgeMax = cms.uint32(15000) + processModule.brief = cms.bool(options.brief) process.p += processModule keepMsgs.extend([module,module+':TritonClient']) + if options.testother: + # clone modules to test both gRPC and shared memory + _module2 = module+"GRPC" if processModule.Client.useSharedMemory else "SHM" + setattr(process, _module2, + processModule.clone( + Client = dict(useSharedMemory = not processModule.Client.useSharedMemory) + ) + ) + processModule2 = getattr(process, _module2) + process.p += processModule2 process.load('FWCore/MessageService/MessageLogger_cfi') process.MessageLogger.cerr.FwkReport.reportEvery = 500 diff --git a/HeterogeneousCore/SonicTriton/test/unittest.sh b/HeterogeneousCore/SonicTriton/test/unittest.sh index a428a98a723d8..7ae19df174da1 100755 --- a/HeterogeneousCore/SonicTriton/test/unittest.sh +++ b/HeterogeneousCore/SonicTriton/test/unittest.sh @@ -1,18 +1,28 @@ #!/bin/bash LOCALTOP=$1 +DEVICE=$2 # the test is not possible if: -# 1. avx instructions not supported (needed for singularity on CPU) +# 1. GPU not available (only if GPU test requested) / avx instructions not supported (needed for singularity on CPU) # 2. singularity not found or not usable # 3. 
inside singularity container w/o unprivileged user namespace enabled (needed for singularity-in-singularity) # so just return true in those cases -if grep -q avx /proc/cpuinfo; then - echo "has avx" +if [ "$DEVICE" = "GPU" ]; then + if nvidia-smi -L; then + echo "has GPU" + else + echo "missing GPU" + exit 0 + fi else - echo "missing avx" - exit 0 + if grep -q avx /proc/cpuinfo; then + echo "has avx" + else + echo "missing avx" + exit 0 + fi fi if type singularity >& /dev/null; then @@ -32,7 +42,7 @@ if [ -n "$SINGULARITY_CONTAINER" ]; then fi tmpFile=$(mktemp -p ${LOCALTOP} SonicTritonTestXXXXXXXX.log) -cmsRun ${LOCALTOP}/src/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py modules=TritonGraphProducer,TritonGraphFilter,TritonGraphAnalyzer maxEvents=1 unittest=1 verbose=1 >& $tmpFile +cmsRun ${LOCALTOP}/src/HeterogeneousCore/SonicTriton/test/tritonTest_cfg.py modules=TritonGraphProducer,TritonGraphFilter,TritonGraphAnalyzer maxEvents=2 unittest=1 verbose=1 device=${DEVICE} testother=1 >& $tmpFile CMSEXIT=$? cat $tmpFile
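For reference, the `acquire()`/`produce()` flow summarized in the README section above looks roughly like the sketch below. This is an illustrative example rather than part of the changeset: it assumes the `TritonEDProducer<>` base class and header path, the single-input/single-output pattern and the `"x__0"` input name used by the TritonGraph test modules in this PR, and the `allocate<float>()`/`fromServer<float>()` template calls implied by the interfaces added here; error handling and `fillDescriptions()` are omitted for brevity.

```cpp
#include "HeterogeneousCore/SonicTriton/interface/TritonEDProducer.h"
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/Framework/interface/MakerMacros.h"

class TritonSketchProducer : public TritonEDProducer<> {
public:
  explicit TritonSketchProducer(edm::ParameterSet const& cfg) : TritonEDProducer<>(cfg, "TritonSketchProducer") {}

  void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, Input& iInput) override {
    //set the batch size before allocating (one entry here)
    client_->setBatchSize(1);

    //access the input object and set its variable dimension (e.g. 10 nodes)
    auto& input1 = iInput.at("x__0");
    input1.setShape(0, 10);

    //allocate one inner vector per batch entry, sized to the concrete shape, and fill it
    auto data1 = input1.allocate<float>();
    auto& vdata1 = (*data1)[0];
    vdata1.assign(input1.sizeShape(), 0.5f);

    //convert to server format (shared memory is used automatically for local servers, if enabled)
    input1.toServer(data1);
  }

  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, Output const& iOutput) override {
    //access the (single) output object and convert from server format
    const auto& output1 = iOutput.begin()->second;
    const auto& tmp = output1.fromServer<float>();
    edm::LogInfo("TritonSketchProducer") << "first output value: " << tmp[0][0];
  }
};

DEFINE_FWK_MODULE(TritonSketchProducer);
```

As in `tritonTest_cfg.py`, shared memory can be toggled per module via the client's `useSharedMemory` parameter without any change to producer code like this.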