diff --git a/src/esp/gfx/BackgroundRenderer.cpp b/src/esp/gfx/BackgroundRenderer.cpp
index 04e45f4c5a..69c3faa3ed 100644
--- a/src/esp/gfx/BackgroundRenderer.cpp
+++ b/src/esp/gfx/BackgroundRenderer.cpp
@@ -171,7 +171,7 @@ int BackgroundRenderer::threadRender() {
     if (sensorType == sensor::SensorType::Depth)
       sensor.renderTarget().readFrameDepth(view);
 
-    if (sensorType == sensor::SensorType::Semantic)
+    if (sensorType == sensor::SensorType::Semantic || sensorType == sensor::SensorType::Instance)
       sensor.renderTarget().readFrameObjectId(view);
   }
 
diff --git a/src/esp/gfx/PbrDrawable.cpp b/src/esp/gfx/PbrDrawable.cpp
index a6bb93e352..6271b6602b 100644
--- a/src/esp/gfx/PbrDrawable.cpp
+++ b/src/esp/gfx/PbrDrawable.cpp
@@ -195,6 +195,15 @@ void PbrDrawable::draw(const Mn::Matrix4& transformationMatrix,
     Mn::GL::Renderer::setFrontFace(Mn::GL::Renderer::FrontFace::ClockWise);
   }
 
+  // Pick the semantic id or the instance id based on the camera type
+  int specificID = 0;
+  if(static_cast<RenderCamera&>(camera).getIsInstanceId()) {
+    specificID = node_.getInstanceId();
+  }
+  else {
+    specificID = node_.getSemanticId();
+  }
+
   (*shader_)
       // e.g., semantic mesh has its own per vertex annotation, which has
       // been uploaded to GPU so simply pass 0 to the uniform "objectId" in
@@ -204,7 +213,7 @@ void PbrDrawable::draw(const Mn::Matrix4& transformationMatrix,
                        : ((flags_ & PbrShader::Flag::InstancedObjectId) ==
                                   PbrShader::Flag::InstancedObjectId
                               ? 0
-                              : node_.getSemanticId()))
+                              : specificID))
       .setProjectionMatrix(camera.projectionMatrix())
       .setViewMatrix(camera.cameraMatrix())
       .setModelMatrix(modelMatrix)  // NOT modelview matrix!
diff --git a/src/esp/gfx/RenderCamera.h b/src/esp/gfx/RenderCamera.h
index 70195558cc..18cba91732 100644
--- a/src/esp/gfx/RenderCamera.h
+++ b/src/esp/gfx/RenderCamera.h
@@ -215,6 +215,20 @@ class RenderCamera : public MagnumCamera {
   size_t filterTransforms(DrawableTransforms& drawableTransforms,
                           Flags flags = {});
 
+  /**
+   * @brief Set the flag to infer if the camera should pick semantic id or instance id for the object.
+   *
+   * @param isInstanceId true if picking instance id, false if picking semantic id.
+   */
+  void setIsInstanceId(bool isInstanceId) { isInstanceId_ = isInstanceId; }
+
+  /**
+   * @brief Get the flag of whether the camera should pick semantic id or instance id for the object.
+   *
+   * @return true if picking instance id, false if picking semantic id.
+   */
+  bool getIsInstanceId() const { return isInstanceId_; }
+
   /**
    * @brief if the "immediate" following rendering pass is to use drawable ids
    * as the object ids.
@@ -254,6 +268,7 @@ class RenderCamera : public MagnumCamera {
   Mn::Matrix4 invertedProjectionMatrix;
   size_t previousNumVisibleDrawables_ = 0;
   bool useDrawableIds_ = false;
+  bool isInstanceId_ = false;
   ESP_SMART_POINTERS(RenderCamera)
 };
 
diff --git a/src/esp/gfx/Renderer.cpp b/src/esp/gfx/Renderer.cpp
index 7fbda18e0f..c94e074c8a 100644
--- a/src/esp/gfx/Renderer.cpp
+++ b/src/esp/gfx/Renderer.cpp
@@ -87,7 +87,8 @@ struct Renderer::Impl {
   void draw(sensor::VisualSensor& visualSensor, sim::Simulator& sim) {
     acquireGlContext();
     if (visualSensor.specification()->sensorType ==
-        sensor::SensorType::Semantic) {
+        sensor::SensorType::Semantic || visualSensor.specification()->sensorType ==
+        sensor::SensorType::Instance) {
       ESP_CHECK(sim.semanticSceneGraphExists(),
                 "Renderer::Impl::draw(): SemanticSensor observation requested "
                 "but no SemanticSceneGraph is loaded");
@@ -101,7 +102,7 @@ struct Renderer::Impl {
     acquireGlContext();
     sensor::SensorType& type = visualSensor.specification()->sensorType;
     if (type == sensor::SensorType::Depth ||
-        type == sensor::SensorType::Semantic) {
+        type == sensor::SensorType::Semantic || type == sensor::SensorType::Instance) {
       Mn::GL::Renderer::disable(Mn::GL::Renderer::Feature::DepthTest);
       gfx::RenderTarget& tgt = visualSensor.renderTarget();
       if (!mesh_) {
@@ -112,7 +113,7 @@ struct Renderer::Impl {
       esp::gfx::Renderer::Impl::RendererShaderType rendererShaderType =
           esp::gfx::Renderer::Impl::RendererShaderType::DepthTextureVisualizer;
 
-      if (type == sensor::SensorType::Semantic) {
+      if (type == sensor::SensorType::Semantic || type == sensor::SensorType::Instance) {
         rendererShaderType =
             gfx::Renderer::Impl::RendererShaderType::ObjectIdTextureVisualizer;
       }
@@ -160,7 +161,7 @@ struct Renderer::Impl {
         shader->bindDepthTexture(tgt.getDepthTexture());
 #endif
         shader->setDepthUnprojection(*visualSensor.depthUnprojection());
-      } else if (type == sensor::SensorType::Semantic) {
+      } else if (type == sensor::SensorType::Semantic || type == sensor::SensorType::Instance) {
         shader->bindObjectIdTexture(tgt.getObjectIdTexture());
       }
       if ((colorMapOffset >= 0) && (colorMapScale >= 0)) {
@@ -365,6 +366,13 @@ struct Renderer::Impl {
         renderTargetFlags |= RenderTarget::Flag::ObjectIdAttachment;
         break;
 
+      case sensor::SensorType::Instance:
+        if (bindingFlags & Flag::VisualizeTexture) {
+          renderTargetFlags |= RenderTarget::Flag::RgbaAttachment;
+        }
+        renderTargetFlags |= RenderTarget::Flag::ObjectIdAttachment;
+        break;
+
       default:
         // I need this default, since sensor type list is long, and without
         // default clang-tidy will complain
diff --git a/src/esp/metadata/attributes/ObjectAttributes.h b/src/esp/metadata/attributes/ObjectAttributes.h
index 458509596b..da38d46400 100644
--- a/src/esp/metadata/attributes/ObjectAttributes.h
+++ b/src/esp/metadata/attributes/ObjectAttributes.h
@@ -367,6 +367,20 @@ class ObjectAttributes : public AbstractObjectAttributes {
 
   uint32_t getSemanticId() const { return get<int>("semantic_id"); }
 
+  /**
+   * @brief Set the instance id for the object. The instance id should be unique in each scene for the object.
+   *
+   * @param instance_id the id for the object in the scene (can be the implicit index of the object in the scene config)
+   */
+  void setInstanceId(int instance_id) { set("instance_id", instance_id); }
+
+  /**
+   * @brief Get the instance id for the object.
+   *
+   * @return instance_id the id for the object in the scene
+   */
+  uint32_t getInstanceId() const { return get<int>("instance_id"); }
+
  protected:
   /**
    * @brief Write object-specific values to json object
diff --git a/src/esp/physics/PhysicsManager.cpp b/src/esp/physics/PhysicsManager.cpp
index 33ab8a619e..b76beb2b7e 100644
--- a/src/esp/physics/PhysicsManager.cpp
+++ b/src/esp/physics/PhysicsManager.cpp
@@ -129,6 +129,10 @@ int PhysicsManager::addObjectInstance(
                 << "as specified in object instance attributes.";
     return 0;
   }
+
+  // Set the instance id of the object
+  objAttributes->setInstanceId(objInstAttributes->getID());
+
   // check if an object is being set to be not visible for a particular
   // instance.
   int visSet = objInstAttributes->getIsInstanceVisible();
diff --git a/src/esp/physics/RigidBase.h b/src/esp/physics/RigidBase.h
index 3fe13a9d14..7ccf6f9de4 100644
--- a/src/esp/physics/RigidBase.h
+++ b/src/esp/physics/RigidBase.h
@@ -386,6 +386,22 @@ class RigidBase : public esp::physics::PhysicsObjectBase {
     }
   }
 
+  /**
+   * @brief Get the Instance ID for this object.
+   */
+  int getInstanceId() const { return visualNode_->getInstanceId(); }
+
+  /**
+   * @brief Set the @ref esp::scene::SceneNode::InstanceId_ for all visual nodes
+   * belonging to the object.
+   * @param InstanceId The desired Instance id for the object.
+   */
+  void setInstanceId(uint32_t instanceId) {
+    for (auto* node : visualNodes_) {
+      node->setInstanceId(instanceId);
+    }
+  }
+
   /**
    * @brief Get pointers to this rigid's visual SceneNodes.
    * @return vector of pointers to the rigid's visual scene nodes.
diff --git a/src/esp/physics/RigidObject.cpp b/src/esp/physics/RigidObject.cpp
index f9e55093c3..c0e38a9a2b 100644
--- a/src/esp/physics/RigidObject.cpp
+++ b/src/esp/physics/RigidObject.cpp
@@ -49,6 +49,8 @@ bool RigidObject::finalizeObject() {
 
   // set the visualization semantic id
   setSemanticId(ObjectAttributes->getSemanticId());
+  // Set the visualization instance id
+  setInstanceId(ObjectAttributes->getInstanceId());
 
   // finish object by instancing any dynamics library-specific code required
   return finalizeObject_LibSpecific();
diff --git a/src/esp/scene/SceneNode.h b/src/esp/scene/SceneNode.h
index 480ba5ed26..c3068dccb9 100644
--- a/src/esp/scene/SceneNode.h
+++ b/src/esp/scene/SceneNode.h
@@ -127,6 +127,12 @@ class SceneNode : public MagnumObject,
   //! Sets node semanticId
   virtual void setSemanticId(int semanticId) { semanticId_ = semanticId; }
 
+  //! Returns node instanceId
+  virtual int getInstanceId() const { return instanceId_; }
+
+  //! Sets node instanceId
+  virtual void setInstanceId(int instanceId) { instanceId_ = instanceId; }
+
   Magnum::Vector3 absoluteTranslation() const;
 
   Magnum::Vector3 absoluteTranslation();
@@ -234,6 +240,7 @@ class SceneNode : public MagnumObject,
   //! The semantic category of this node. Used to render attached Drawables with
   //! Semantic sensor when no perVertexObjectIds are present.
   uint32_t semanticId_ = 0;
+  uint32_t instanceId_ = 0;
 
   //! the local bounding box for meshes stored at this node
   Magnum::Range3D meshBB_;
diff --git a/src/esp/sensor/CameraSensor.cpp b/src/esp/sensor/CameraSensor.cpp
index ceb073ea2b..e98be311e2 100644
--- a/src/esp/sensor/CameraSensor.cpp
+++ b/src/esp/sensor/CameraSensor.cpp
@@ -134,7 +134,13 @@ bool CameraSensor::drawObservation(sim::Simulator& sim) {
     flags |= gfx::RenderCamera::Flag::FrustumCulling;
   }
 
-  if (cameraSensorSpec_->sensorType == SensorType::Semantic) {
+  if (cameraSensorSpec_->sensorType == SensorType::Semantic || cameraSensorSpec_->sensorType == SensorType::Instance) {
+    // Set the camera instance Flag to infer if the camera should pick semantic id or instance id based on the camera type
+    if (cameraSensorSpec_->sensorType == SensorType::Semantic) {
+      renderCamera_->setIsInstanceId(false);
+    } else {
+      renderCamera_->setIsInstanceId(true);
+    }
     // TODO: check sim has semantic scene graph
     bool twoSceneGraphs =
         (&sim.getActiveSemanticSceneGraph() != &sim.getActiveSceneGraph());
diff --git a/src/esp/sensor/CubeMapSensorBase.cpp b/src/esp/sensor/CubeMapSensorBase.cpp
index 794d88aeef..813312fb20 100644
--- a/src/esp/sensor/CubeMapSensorBase.cpp
+++ b/src/esp/sensor/CubeMapSensorBase.cpp
@@ -57,6 +57,9 @@ CubeMapSensorBase::CubeMapSensorBase(scene::SceneNode& cameraNode,
     case SensorType::Semantic:
       cubeMapFlags |= gfx::CubeMap::Flag::ObjectIdTexture;
       break;
+    case SensorType::Instance:
+      cubeMapFlags |= gfx::CubeMap::Flag::ObjectIdTexture;
+      break;
     default:
       CORRADE_INTERNAL_ASSERT_UNREACHABLE();
       break;
@@ -80,6 +83,9 @@ CubeMapSensorBase::CubeMapSensorBase(scene::SceneNode& cameraNode,
     case SensorType::Semantic:
       cubeMapShaderBaseFlags_ |= gfx::CubeMapShaderBase::Flag::ObjectIdTexture;
       break;
+    case SensorType::Instance:
+      cubeMapShaderBaseFlags_ |= gfx::CubeMapShaderBase::Flag::ObjectIdTexture;
+      break;
     // sensor type list is too long, have to use default
     default:
       CORRADE_INTERNAL_ASSERT_UNREACHABLE();
@@ -121,7 +127,7 @@ bool CubeMapSensorBase::renderToCubemapTexture(sim::Simulator& sim) {
 
   // generate the cubemap texture
   const char* defaultDrawableGroupName = "";
-  if (cubeMapSensorBaseSpec_->sensorType == SensorType::Semantic) {
+  if (cubeMapSensorBaseSpec_->sensorType == SensorType::Semantic || cubeMapSensorBaseSpec_->sensorType == SensorType::Instance) {
     bool twoSceneGraphs =
         (&sim.getActiveSemanticSceneGraph() != &sim.getActiveSceneGraph());
 
@@ -164,7 +170,8 @@ void CubeMapSensorBase::drawWith(gfx::CubeMapShaderBase& shader) {
     shader.bindDepthTexture(
         cubeMap_->getTexture(gfx::CubeMap::TextureType::Depth));
   }
-  if (cubeMapSensorBaseSpec_->sensorType == SensorType::Semantic) {
+  if (cubeMapSensorBaseSpec_->sensorType == SensorType::Semantic ||
+      cubeMapSensorBaseSpec_->sensorType == SensorType::Instance) {
     shader.bindObjectIdTexture(
         cubeMap_->getTexture(gfx::CubeMap::TextureType::ObjectId));
   }
diff --git a/src/esp/sensor/Sensor.h b/src/esp/sensor/Sensor.h
index b64d53add5..b6653aaedf 100644
--- a/src/esp/sensor/Sensor.h
+++ b/src/esp/sensor/Sensor.h
@@ -33,6 +33,7 @@ enum class SensorType : int32_t {
   Tensor,
   Text,
   Audio,
+  Instance,
   SensorTypeCount,  // add new type above this term!!
 };
 
diff --git a/src/esp/sensor/VisualSensor.cpp b/src/esp/sensor/VisualSensor.cpp
index a53e317b1f..12fb1aee14 100644
--- a/src/esp/sensor/VisualSensor.cpp
+++ b/src/esp/sensor/VisualSensor.cpp
@@ -24,7 +24,7 @@ void VisualSensorSpec::sanityCheck() const {
   SensorSpec::sanityCheck();
   bool isVisualSensor =
       (sensorType == SensorType::Color || sensorType == SensorType::Depth ||
-       sensorType == SensorType::Normal || sensorType == SensorType::Semantic);
+       sensorType == SensorType::Normal || sensorType == SensorType::Semantic || sensorType == SensorType::Instance);
   CORRADE_ASSERT(
       isVisualSensor,
       "VisualSensorSpec::sanityCheck(): sensorType must be Color, Depth, "
@@ -90,7 +90,7 @@ bool VisualSensor::getObservationSpace(ObservationSpace& space) {
                  static_cast<size_t>(visualSensorSpec_->resolution[1]),
                  static_cast<size_t>(visualSensorSpec_->channels)};
   space.dataType = core::DataType::DT_UINT8;
-  if (visualSensorSpec_->sensorType == SensorType::Semantic) {
+  if (visualSensorSpec_->sensorType == SensorType::Semantic || visualSensorSpec_->sensorType == SensorType::Instance) {
     space.dataType = core::DataType::DT_UINT32;
   } else if (visualSensorSpec_->sensorType == SensorType::Depth) {
     space.dataType = core::DataType::DT_FLOAT;
@@ -110,7 +110,7 @@ void VisualSensor::readObservation(Observation& obs) {
 
   // TODO: have different classes for the different types of sensors
   // TODO: do we need to flip axis?
-  if (visualSensorSpec_->sensorType == SensorType::Semantic) {
+  if (visualSensorSpec_->sensorType == SensorType::Semantic || visualSensorSpec_->sensorType == SensorType::Instance) {
     renderTarget().readFrameObjectId(Magnum::MutableImageView2D{
         Magnum::PixelFormat::R32UI, renderTarget().framebufferSize(),
         obs.buffer->data});
@@ -153,7 +153,7 @@ VisualSensor::MoveSemanticSensorNodeHelper::MoveSemanticSensorNodeHelper(
     sim::Simulator& sim)
     : visualSensor_(visualSensor), sim_(sim) {
   CORRADE_INTERNAL_ASSERT(visualSensor_.specification()->sensorType ==
-                          SensorType::Semantic);
+                          SensorType::Semantic || visualSensor_.specification()->sensorType == SensorType::Instance);
   scene::SceneNode& node = visualSensor_.node();
   CORRADE_ASSERT(
       !scene::SceneGraph::isRootNode(node),
@@ -189,7 +189,7 @@ VisualSensor::MoveSemanticSensorNodeHelper::MoveSemanticSensorNodeHelper(
 
 VisualSensor::MoveSemanticSensorNodeHelper::~MoveSemanticSensorNodeHelper() {
   CORRADE_INTERNAL_ASSERT(visualSensor_.specification()->sensorType ==
-                          SensorType::Semantic);
+                          SensorType::Semantic || visualSensor_.specification()->sensorType == SensorType::Instance);
 
   scene::SceneNode& node = visualSensor_.node();
   CORRADE_ASSERT(
diff --git a/src/utils/viewer/viewer.cpp b/src/utils/viewer/viewer.cpp
index e9fca198d3..701941c41f 100644
--- a/src/utils/viewer/viewer.cpp
+++ b/src/utils/viewer/viewer.cpp
@@ -645,6 +645,7 @@ Key Commands:
     RGBA = 0,
     Depth,
     Semantic,
+    Instance,
     VisualizeModeCount,
   };
   VisualizeMode visualizeMode_ = VisualizeMode::RGBA;
@@ -701,7 +702,8 @@ void addSensors(esp::agent::AgentConfiguration& agentConfig, bool isOrtho) {
                                   : esp::sensor::SensorSubType::Pinhole;
     spec->sensorType = sensorType;
     if (sensorType == esp::sensor::SensorType::Depth ||
-        sensorType == esp::sensor::SensorType::Semantic) {
+        sensorType == esp::sensor::SensorType::Semantic ||
+        sensorType == esp::sensor::SensorType::Instance) {
       spec->channels = 1;
     }
     spec->position = {0.0f, rgbSensorHeight, 0.0f};
@@ -715,6 +717,8 @@ void addSensors(esp::agent::AgentConfiguration& agentConfig, bool isOrtho) {
   addCameraSensor("depth_camera", esp::sensor::SensorType::Depth);
   // add the camera semantic sensor
   addCameraSensor("semantic_camera", esp::sensor::SensorType::Semantic);
+  // add the camera instance sensor
+  addCameraSensor("instance_camera", esp::sensor::SensorType::Instance);
 
   auto addFisheyeSensor = [&](const std::string& uuid,
                               esp::sensor::SensorType sensorType,
@@ -730,7 +734,8 @@ void addSensors(esp::agent::AgentConfiguration& agentConfig, bool isOrtho) {
     spec->uuid = uuid;
     spec->sensorType = sensorType;
     if (sensorType == esp::sensor::SensorType::Depth ||
-        sensorType == esp::sensor::SensorType::Semantic) {
+        sensorType == esp::sensor::SensorType::Semantic ||
+        sensorType == esp::sensor::SensorType::Instance) {
       spec->channels = 1;
     }
     spec->sensorSubType = esp::sensor::SensorSubType::Fisheye;
@@ -766,6 +771,9 @@ void addSensors(esp::agent::AgentConfiguration& agentConfig, bool isOrtho) {
   // add the fisheye semantic sensor
   addFisheyeSensor("semantic_fisheye", esp::sensor::SensorType::Semantic,
                    esp::sensor::FisheyeSensorModelType::DoubleSphere);
+  // add the fisheye instance sensor
+  addFisheyeSensor("instance_fisheye", esp::sensor::SensorType::Instance,
+                   esp::sensor::FisheyeSensorModelType::DoubleSphere);
 
   auto addEquirectangularSensor = [&](const std::string& uuid,
                                       esp::sensor::SensorType sensorType) {
@@ -776,7 +784,8 @@ void addSensors(esp::agent::AgentConfiguration& agentConfig, bool isOrtho) {
     spec->uuid = uuid;
     spec->sensorType = sensorType;
     if (sensorType == esp::sensor::SensorType::Depth ||
-        sensorType == esp::sensor::SensorType::Semantic) {
+        sensorType == esp::sensor::SensorType::Semantic ||
+        sensorType == esp::sensor::SensorType::Instance) {
       spec->channels = 1;
     }
     spec->sensorSubType = esp::sensor::SensorSubType::Equirectangular;
@@ -791,6 +800,9 @@ void addSensors(esp::agent::AgentConfiguration& agentConfig, bool isOrtho) {
   // add the equirectangular semantic sensor
   addEquirectangularSensor("semantic_equirectangular",
                            esp::sensor::SensorType::Semantic);
+  // add the equirectangular instance sensor
+  addEquirectangularSensor("instance_equirectangular",
+                           esp::sensor::SensorType::Instance);
 
 // add audio sensor
 #ifdef ESP_BUILD_WITH_AUDIO
@@ -1636,6 +1648,10 @@ void Viewer::setSensorVisID() {
       prefix = "semantic";
       break;
     }
+    case VisualizeMode::Instance: {
+      prefix = "instance";
+      break;
+    }
     default:
       CORRADE_INTERNAL_ASSERT_UNREACHABLE();
   }  // switch on visualize mode
@@ -1750,7 +1766,7 @@ void Viewer::drawEvent() {
     }
     sensorRenderTarget->blitRgbaToDefault();
   } else {
-    // Depth Or Semantic, or Non-pinhole RGBA
+    // Depth Or Semantic Or Instance, or Non-pinhole RGBA
     simulator_->drawObservation(defaultAgentId_, sensorVisID_);
 
     esp::gfx::RenderTarget* sensorRenderTarget =
@@ -1763,7 +1779,8 @@ void Viewer::drawEvent() {
       simulator_->visualizeObservation(defaultAgentId_, sensorVisID_,
                                        1.0f / 512.0f,  // colorMapOffset
                                        1.0f / 12.0f);  // colorMapScale
-    } else if (visualizeMode_ == VisualizeMode::Semantic) {
+    } else if (visualizeMode_ == VisualizeMode::Semantic ||
+               visualizeMode_ == VisualizeMode::Instance) {
       Mn::GL::defaultFramebuffer.clear(Mn::GL::FramebufferClear::Color |
                                        Mn::GL::FramebufferClear::Depth);
       simulator_->visualizeObservation(defaultAgentId_, sensorVisID_);
@@ -1925,7 +1942,8 @@ void Viewer::bindRenderTarget() {
       esp::sensor::VisualSensor& visualSensor =
           static_cast<esp::sensor::VisualSensor&>(it.second.get());
       if (visualizeMode_ == VisualizeMode::Depth ||
-          visualizeMode_ == VisualizeMode::Semantic) {
+          visualizeMode_ == VisualizeMode::Semantic ||
+          visualizeMode_ == VisualizeMode::Instance) {
         simulator_->getRenderer()->bindRenderTarget(
             visualSensor, {esp::gfx::Renderer::Flag::VisualizeTexture});
       } else {