diff --git a/wgpu-core/src/command/bundle.rs b/wgpu-core/src/command/bundle.rs
index ffa90d505e..6cec9e2b21 100644
--- a/wgpu-core/src/command/bundle.rs
+++ b/wgpu-core/src/command/bundle.rs
@@ -816,8 +816,8 @@ fn draw_mesh_tasks(
     let pipeline = state.pipeline()?;
     let used_bind_groups = pipeline.used_bind_groups;
 
-    let groups_size_limit = state.device.limits.max_task_workgroups_per_dimension;
-    let max_groups = state.device.limits.max_task_workgroup_total_count;
+    let groups_size_limit = state.device.limits.max_task_mesh_workgroups_per_dimension;
+    let max_groups = state.device.limits.max_task_mesh_workgroup_total_count;
     if group_count_x > groups_size_limit
         || group_count_y > groups_size_limit
         || group_count_z > groups_size_limit
diff --git a/wgpu-core/src/command/render.rs b/wgpu-core/src/command/render.rs
index a60230a461..d5553e9abb 100644
--- a/wgpu-core/src/command/render.rs
+++ b/wgpu-core/src/command/render.rs
@@ -2709,8 +2709,13 @@ fn draw_mesh_tasks(
         .base
         .device
         .limits
-        .max_task_workgroups_per_dimension;
-    let max_groups = state.pass.base.device.limits.max_task_workgroup_total_count;
+        .max_task_mesh_workgroups_per_dimension;
+    let max_groups = state
+        .pass
+        .base
+        .device
+        .limits
+        .max_task_mesh_workgroup_total_count;
     if group_count_x > groups_size_limit
         || group_count_y > groups_size_limit
         || group_count_z > groups_size_limit
diff --git a/wgpu-core/src/device/resource.rs b/wgpu-core/src/device/resource.rs
index bf29520674..b0a1258c49 100644
--- a/wgpu-core/src/device/resource.rs
+++ b/wgpu-core/src/device/resource.rs
@@ -3972,7 +3972,7 @@ impl Device {
                         self.require_features(wgt::Features::VERTEX_ATTRIBUTE_64BIT)?;
                     }
 
-                    let previous = io.insert(
+                    let previous = io.varyings.insert(
                         attribute.shader_location,
                         validation::InterfaceVar::vertex_attribute(attribute.format),
                     );
@@ -4363,20 +4363,18 @@ impl Device {
                     )
                     .map_err(stage_err)?;
 
-                if validated_stages == wgt::ShaderStages::VERTEX {
-                    if let Some(ref interface) = shader_module.interface {
-                        io = interface
-                            .check_stage(
-                                &mut binding_layout_source,
-                                &mut shader_binding_sizes,
-                                &fragment_entry_point_name,
-                                stage,
-                                io,
-                                desc.depth_stencil.as_ref().map(|d| d.depth_compare),
-                            )
-                            .map_err(stage_err)?;
-                        validated_stages |= stage;
-                    }
+                if let Some(ref interface) = shader_module.interface {
+                    io = interface
+                        .check_stage(
+                            &mut binding_layout_source,
+                            &mut shader_binding_sizes,
+                            &fragment_entry_point_name,
+                            stage,
+                            io,
+                            desc.depth_stencil.as_ref().map(|d| d.depth_compare),
+                        )
+                        .map_err(stage_err)?;
+                    validated_stages |= stage;
                 }
 
                 if let Some(ref interface) = shader_module.interface {
@@ -4412,7 +4410,7 @@ impl Device {
         }
 
         if validated_stages.contains(wgt::ShaderStages::FRAGMENT) {
-            for (i, output) in io.iter() {
+            for (i, output) in io.varyings.iter() {
                 match color_targets.get(*i as usize) {
                     Some(Some(state)) => {
                         validation::check_texture_format(state.format, &output.ty).map_err(
diff --git a/wgpu-core/src/validation.rs b/wgpu-core/src/validation.rs
index ffe2c7e757..df8c555699 100644
--- a/wgpu-core/src/validation.rs
+++ b/wgpu-core/src/validation.rs
@@ -129,6 +129,7 @@ pub struct InterfaceVar {
     pub ty: NumericType,
     interpolation: Option<naga::Interpolation>,
     sampling: Option<naga::Sampling>,
+    per_primitive: bool,
 }
 
 impl InterfaceVar {
@@ -137,6 +138,7 @@ impl InterfaceVar {
             ty: NumericType::from_vertex_format(format),
             interpolation: None,
             sampling: None,
+            per_primitive: false,
         }
     }
 }
@@ -164,6 +166,12 @@ struct SpecializationConstant {
     ty: NumericType,
 }
 
+#[derive(Debug)]
+struct EntryPointMeshInfo {
+    max_vertices: u32,
+    max_primitives: u32,
+}
+
 #[derive(Debug, Default)]
 struct EntryPoint {
     inputs: Vec<Varying>,
@@ -174,6 +182,8 @@ struct EntryPoint {
     sampling_pairs: FastHashSet<(naga::Handle<Resource>, naga::Handle<Resource>)>,
     workgroup_size: [u32; 3],
     dual_source_blending: bool,
+    task_payload_size: Option<u32>,
+    mesh_info: Option<EntryPointMeshInfo>,
 }
 
 #[derive(Debug)]
@@ -260,6 +270,8 @@ pub enum InputError {
     InterpolationMismatch(Option<naga::Interpolation>),
     #[error("Input sampling doesn't match provided {0:?}")]
     SamplingMismatch(Option<naga::Sampling>),
+    #[error("Pipeline input has perprimitive: {expected} but shader declares perprimitive: {}", !expected)]
+    WrongPerPrimitive { expected: bool },
 }
 
 impl WebGpuError for InputError {
@@ -321,6 +333,22 @@ pub enum StageError {
         var: InterfaceVar,
         limit: u32,
     },
+    #[error("Mesh shaders are limited to {limit} output vertices, but the shader has a maximum number of {value}")]
+    TooManyMeshVertices { limit: u32, value: u32 },
+    #[error("Mesh shaders are limited to {limit} output primitives, but the shader has a maximum number of {value}")]
+    TooManyMeshPrimitives { limit: u32, value: u32 },
+    #[error("Mesh or task shaders are limited to {limit} bytes of task payload, but the shader has a task payload of size {value}")]
+    TaskPayloadTooLarge { limit: u32, value: u32 },
+    #[error("Mesh shader's task payload has size {shader:?}, which doesn't match input from previous stage {input:?}")]
+    TaskPayloadMustMatch {
+        input: Option<u32>,
+        shader: Option<u32>,
+    },
+    #[error("Primitive index can only be used in a fragment shader if the preceding shader was a vertex shader or a mesh shader that writes to primitive index.
+    If a mesh shader writes to primitive index, it must be read by the fragment shader.")]
+    PrimitiveIndexError,
+    #[error("Draw id can only be used in a mesh shader if the pipeline has no task shader.")]
+    DrawIdError,
 }
 
 impl WebGpuError for StageError {
@@ -343,7 +371,13 @@ impl WebGpuError for StageError {
             | Self::MissingEntryPoint(..)
             | Self::NoEntryPointFound
             | Self::MultipleEntryPointsFound
-            | Self::ColorAttachmentLocationTooLarge { .. } => return ErrorType::Validation,
+            | Self::ColorAttachmentLocationTooLarge { .. }
+            | Self::TooManyMeshVertices { .. }
+            | Self::TooManyMeshPrimitives { .. }
+            | Self::TaskPayloadTooLarge { .. }
+            | Self::TaskPayloadMustMatch { .. }
+            | Self::PrimitiveIndexError
+            | Self::DrawIdError => return ErrorType::Validation,
         };
         e.webgpu_error_type()
     }
@@ -916,7 +950,18 @@ impl<'a> BindingLayoutSource<'a> {
     }
 }
 
-pub type StageIo = FastHashMap<wgt::ShaderLocation, InterfaceVar>;
+#[derive(Debug, Clone, Default)]
+pub struct StageIo {
+    pub varyings: FastHashMap<wgt::ShaderLocation, InterfaceVar>,
+    /// This must match between mesh & task shaders
+    pub task_payload_size: Option<u32>,
+    /// Fragment shaders cannot input primitive index on mesh shaders that don't output it on DX12.
+    /// Therefore, we track between shader stages if primitive index is written (or if vertex shader
+    /// is used).
+    ///
+    /// This is Some if it was a mesh shader.
+    pub primitive_index: Option<bool>,
+}
 
 impl Interface {
     fn populate(
@@ -963,13 +1008,15 @@ impl Interface {
                 location,
                 interpolation,
                 sampling,
-                .. // second_blend_source
+                per_primitive,
+                blend_src: _,
             }) => Varying::Local {
                 location,
                 iv: InterfaceVar {
                     ty: numeric_ty,
                     interpolation,
                     sampling,
+                    per_primitive,
                 },
             },
             Some(&naga::Binding::BuiltIn(built_in)) => Varying::BuiltIn(built_in),
@@ -1057,6 +1104,32 @@ impl Interface {
             ep.dual_source_blending = info.dual_source_blending;
             ep.workgroup_size = entry_point.workgroup_size;
 
+            if let Some(task_payload) = entry_point.task_payload {
+                ep.task_payload_size = Some(
+                    module.types[module.global_variables[task_payload].ty]
+                        .inner
+                        .size(module.to_ctx()),
+                );
+            }
+            if let Some(ref mesh_info) = entry_point.mesh_info {
+                ep.mesh_info = Some(EntryPointMeshInfo {
+                    max_vertices: mesh_info.max_vertices,
+                    max_primitives: mesh_info.max_primitives,
+                });
+                Self::populate(
+                    &mut ep.outputs,
+                    None,
+                    mesh_info.vertex_output_type,
+                    &module.types,
+                );
+                Self::populate(
+                    &mut ep.outputs,
+                    None,
+                    mesh_info.primitive_output_type,
+                    &module.types,
+                );
+            }
+
             entry_points.insert((entry_point.stage, entry_point.name.clone()), ep);
         }
 
@@ -1117,7 +1190,7 @@ impl Interface {
             Some(some) => some,
             None => return Err(StageError::MissingEntryPoint(pair.1)),
         };
-        let (_stage, entry_point_name) = pair;
+        let (_, entry_point_name) = pair;
 
         // check resources visibility
         for &handle in entry_point.resources.iter() {
@@ -1241,47 +1314,74 @@ impl Interface {
 
         // check workgroup size limits
         if shader_stage.compute_like() {
-            let max_workgroup_size_limits = [
-                self.limits.max_compute_workgroup_size_x,
-                self.limits.max_compute_workgroup_size_y,
-                self.limits.max_compute_workgroup_size_z,
-            ];
+            let (max_workgroup_size_limits, max_workgroup_size_total) = match shader_stage {
+                naga::ShaderStage::Compute => (
+                    [
+                        self.limits.max_compute_workgroup_size_x,
+                        self.limits.max_compute_workgroup_size_y,
+                        self.limits.max_compute_workgroup_size_z,
+                    ],
+                    self.limits.max_compute_invocations_per_workgroup,
+                ),
+                naga::ShaderStage::Task => (
+                    [
+                        self.limits.max_task_invocations_per_dimension,
+                        self.limits.max_task_invocations_per_dimension,
+                        self.limits.max_task_invocations_per_dimension,
+                    ],
+                    self.limits.max_task_invocations_per_workgroup,
+                ),
+                naga::ShaderStage::Mesh => (
+                    [
+                        self.limits.max_mesh_invocations_per_dimension,
+                        self.limits.max_mesh_invocations_per_dimension,
+                        self.limits.max_mesh_invocations_per_dimension,
+                    ],
+                    self.limits.max_mesh_invocations_per_workgroup,
+                ),
+                _ => unreachable!(),
+            };
             let total_invocations = entry_point.workgroup_size.iter().product::<u32>();
 
             if entry_point.workgroup_size.contains(&0)
-                || total_invocations > self.limits.max_compute_invocations_per_workgroup
-                || entry_point.workgroup_size[0] > max_workgroup_size_limits[0]
-                || entry_point.workgroup_size[1] > max_workgroup_size_limits[1]
-                || entry_point.workgroup_size[2] > max_workgroup_size_limits[2]
+                || total_invocations > max_workgroup_size_total
+                || {
+                    entry_point.workgroup_size[0] > max_workgroup_size_limits[0]
+                        || entry_point.workgroup_size[1] > max_workgroup_size_limits[1]
+                        || entry_point.workgroup_size[2] > max_workgroup_size_limits[2]
+                }
             {
                 return Err(StageError::InvalidWorkgroupSize {
                     current: entry_point.workgroup_size,
                     current_total: total_invocations,
                     limit: max_workgroup_size_limits,
-                    total: self.limits.max_compute_invocations_per_workgroup,
+                    total: max_workgroup_size_total,
                 });
             }
         }
 
         let mut inter_stage_components = 0;
+        let mut has_primitive_index = false;
+        let mut has_draw_id = false;
 
         // check inputs compatibility
         for input in entry_point.inputs.iter() {
             match *input {
                 Varying::Local { location, ref iv } => {
-                    let result =
-                        inputs
-                            .get(&location)
-                            .ok_or(InputError::Missing)
-                            .and_then(|provided| {
-                                let (compatible, num_components) = match shader_stage {
+                    let result = inputs
+                        .varyings
+                        .get(&location)
+                        .ok_or(InputError::Missing)
+                        .and_then(|provided| {
+                            let (compatible, num_components, per_primitive_correct) =
+                                match shader_stage {
                                     // For vertex attributes, there are defaults filled out
                                     // by the driver if data is not provided.
                                     naga::ShaderStage::Vertex => {
                                         let is_compatible =
                                             iv.ty.scalar.kind == provided.ty.scalar.kind;
                                         // vertex inputs don't count towards inter-stage
-                                        (is_compatible, 0)
+                                        (is_compatible, 0, !iv.per_primitive)
                                     }
                                     naga::ShaderStage::Fragment => {
                                         if iv.interpolation != provided.interpolation {
@@ -1297,20 +1397,24 @@ impl Interface {
                                         (
                                             iv.ty.is_subtype_of(&provided.ty),
                                             iv.ty.dim.num_components(),
+                                            iv.per_primitive == provided.per_primitive,
                                         )
                                     }
-                                    naga::ShaderStage::Compute => (false, 0),
-                                    // TODO: add validation for these, see https://github.com/gfx-rs/wgpu/issues/8003
-                                    naga::ShaderStage::Task | naga::ShaderStage::Mesh => {
-                                        unreachable!()
-                                    }
+                                    // These can't have varying inputs
+                                    naga::ShaderStage::Compute
+                                    | naga::ShaderStage::Task
+                                    | naga::ShaderStage::Mesh => (false, 0, false),
                                 };
-                                if compatible {
-                                    Ok(num_components)
-                                } else {
-                                    Err(InputError::WrongType(provided.ty))
-                                }
-                            });
+                            if !compatible {
+                                Err(InputError::WrongType(provided.ty))
+                            } else if !per_primitive_correct {
+                                Err(InputError::WrongPerPrimitive {
+                                    expected: provided.per_primitive,
+                                })
+                            } else {
+                                Ok(num_components)
+                            }
+                        });
                     match result {
                         Ok(num_components) => {
                             inter_stage_components += num_components;
@@ -1324,6 +1428,12 @@ impl Interface {
                         }
                     }
                 }
+                Varying::BuiltIn(naga::BuiltIn::PrimitiveIndex) => {
+                    has_primitive_index = true;
+                }
+                Varying::BuiltIn(naga::BuiltIn::DrawID) => {
+                    has_draw_id = true;
+                }
                 Varying::BuiltIn(_) => {}
             }
         }
@@ -1386,6 +1496,51 @@ impl Interface {
             });
         }
 
+        if let Some(ref mesh_info) = entry_point.mesh_info {
+            if mesh_info.max_vertices > self.limits.max_mesh_output_vertices {
+                return Err(StageError::TooManyMeshVertices {
+                    limit: self.limits.max_mesh_output_vertices,
+                    value: mesh_info.max_vertices,
+                });
+            }
+            if mesh_info.max_primitives > self.limits.max_mesh_output_primitives {
+                return Err(StageError::TooManyMeshPrimitives {
+                    limit: self.limits.max_mesh_output_primitives,
+                    value: mesh_info.max_primitives,
+                });
+            }
+        }
+        if let Some(task_payload_size) = entry_point.task_payload_size {
+            if task_payload_size > self.limits.max_task_payload_size {
+                return Err(StageError::TaskPayloadTooLarge {
+                    limit: self.limits.max_task_payload_size,
+                    value: task_payload_size,
+                });
+            }
+        }
+        if shader_stage == naga::ShaderStage::Mesh
+            && entry_point.task_payload_size != inputs.task_payload_size
+        {
+            return Err(StageError::TaskPayloadMustMatch {
+                input: inputs.task_payload_size,
+                shader: entry_point.task_payload_size,
+            });
+        }
+
+        // Fragment shader primitive index is treated like a varying
+        if let Some(primitive_index) = inputs.primitive_index {
+            if primitive_index != has_primitive_index && shader_stage == naga::ShaderStage::Fragment
+            {
+                return Err(StageError::PrimitiveIndexError);
+            }
+        }
+        if shader_stage == naga::ShaderStage::Mesh
+            && inputs.task_payload_size.is_some()
+            && has_draw_id
+        {
+            return Err(StageError::DrawIdError);
+        }
+
         let outputs = entry_point
             .outputs
             .iter()
@@ -1395,7 +1550,15 @@ impl Interface {
             })
             .collect();
 
-        Ok(outputs)
+        Ok(StageIo {
+            task_payload_size: entry_point.task_payload_size,
+            varyings: outputs,
+            primitive_index: if shader_stage == naga::ShaderStage::Mesh {
+                Some(has_primitive_index)
+            } else {
+                None
+            },
+        })
     }
 
     pub fn fragment_uses_dual_source_blending(
diff --git a/wgpu-hal/src/dx12/adapter.rs b/wgpu-hal/src/dx12/adapter.rs
index d19998ab76..ef6b6dd0aa 100644
--- a/wgpu-hal/src/dx12/adapter.rs
+++ b/wgpu-hal/src/dx12/adapter.rs
@@ -732,18 +732,28 @@ impl super::Adapter {
                     max_non_sampler_bindings: 1_000_000,
 
                     // Source: https://microsoft.github.io/DirectX-Specs/d3d/MeshShader.html#dispatchmesh-api
-                    max_task_workgroup_total_count: 2u32.pow(22),
+                    max_task_mesh_workgroup_total_count: 2u32.pow(22),
                     // Technically it says "64k" but I highly doubt they want 65536 for compute and exactly 64,000 for task workgroups
-                    max_task_workgroups_per_dimension:
+                    max_task_mesh_workgroups_per_dimension:
                         Direct3D12::D3D12_CS_DISPATCH_MAX_THREAD_GROUPS_PER_DIMENSION,
-                    // Multiview not supported by WGPU yet
+                    // Assume this inherits from compute shaders
+                    max_task_invocations_per_workgroup:
+                        Direct3D12::D3D12_CS_4_X_THREAD_GROUP_MAX_THREADS_PER_GROUP,
+                    max_task_invocations_per_dimension: Direct3D12::D3D12_CS_THREAD_GROUP_MAX_Z,
+                    // Source: https://microsoft.github.io/DirectX-Specs/d3d/MeshShader.html#amplification-shader-and-mesh-shader
+                    max_mesh_invocations_per_workgroup: 128,
+                    max_mesh_invocations_per_dimension: 128,
+
+                    max_task_payload_size: 16384,
+                    max_mesh_output_vertices: 256,
+                    max_mesh_output_primitives: 256,
+                    // Source: https://microsoft.github.io/DirectX-Specs/d3d/MeshShader.html#sv_rendertargetarrayindex-limitations-based-on-queryable-capability
+                    max_mesh_output_layers: if mesh_shader_supported { 8 } else { 0 },
                     max_mesh_multiview_view_count: if mesh_shader_supported {
                         max_multiview_view_count
                     } else {
                         0
                     },
-                    // This seems to be right, and I can't find anything to suggest it would be less than the 2048 provided here
-                    max_mesh_output_layers: Direct3D12::D3D12_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION,
 
                     max_blas_primitive_count: if supports_ray_tracing {
                         1 << 29 // 2^29
diff --git a/wgpu-hal/src/gles/adapter.rs b/wgpu-hal/src/gles/adapter.rs
index 6ee5f71373..0bc5fa91c2 100644
--- a/wgpu-hal/src/gles/adapter.rs
+++ b/wgpu-hal/src/gles/adapter.rs
@@ -803,10 +803,17 @@ impl super::Adapter {
             max_buffer_size: i32::MAX as u64,
             max_non_sampler_bindings: u32::MAX,
 
-            max_task_workgroup_total_count: 0,
-            max_task_workgroups_per_dimension: 0,
-            max_mesh_multiview_view_count: 0,
+            max_task_mesh_workgroup_total_count: 0,
+            max_task_mesh_workgroups_per_dimension: 0,
+            max_task_invocations_per_workgroup: 0,
+            max_task_invocations_per_dimension: 0,
+            max_mesh_invocations_per_workgroup: 0,
+            max_mesh_invocations_per_dimension: 0,
+            max_task_payload_size: 0,
+            max_mesh_output_vertices: 0,
+            max_mesh_output_primitives: 0,
             max_mesh_output_layers: 0,
+            max_mesh_multiview_view_count: 0,
 
             max_blas_primitive_count: 0,
             max_blas_geometry_count: 0,
diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs
index fa3d2fa8d4..091fb1c108 100644
--- a/wgpu-hal/src/metal/adapter.rs
+++ b/wgpu-hal/src/metal/adapter.rs
@@ -1102,10 +1102,17 @@ impl super::PrivateCapabilities {
                 max_buffer_size: self.max_buffer_size,
                 max_non_sampler_bindings: u32::MAX,
 
-                max_task_workgroup_total_count: 0,
-                max_task_workgroups_per_dimension: 0,
-                max_mesh_multiview_view_count: 0,
+                max_task_mesh_workgroup_total_count: 0,
+                max_task_mesh_workgroups_per_dimension: 0,
+                max_task_invocations_per_workgroup: 0,
+                max_task_invocations_per_dimension: 0,
+                max_mesh_invocations_per_workgroup: 0,
+                max_mesh_invocations_per_dimension: 0,
+                max_task_payload_size: 0,
+                max_mesh_output_vertices: 0,
+                max_mesh_output_primitives: 0,
                 max_mesh_output_layers: 0,
+                max_mesh_multiview_view_count: 0,
 
                 max_blas_primitive_count: 0, // When added: 2^28 from https://developer.apple.com/documentation/metal/mtlaccelerationstructureusage/extendedlimits
                 max_blas_geometry_count: 0,  // When added: 2^24
diff --git a/wgpu-hal/src/noop/mod.rs b/wgpu-hal/src/noop/mod.rs
index b6c110df8d..4ef2c3f298 100644
--- a/wgpu-hal/src/noop/mod.rs
+++ b/wgpu-hal/src/noop/mod.rs
@@ -194,10 +194,17 @@ pub const CAPABILITIES: crate::Capabilities = {
             max_push_constant_size: ALLOC_MAX_U32,
             max_non_sampler_bindings: ALLOC_MAX_U32,
 
-            max_task_workgroup_total_count: ALLOC_MAX_U32,
-            max_task_workgroups_per_dimension: ALLOC_MAX_U32,
-            max_mesh_multiview_view_count: ALLOC_MAX_U32,
+            max_task_mesh_workgroup_total_count: ALLOC_MAX_U32,
+            max_task_mesh_workgroups_per_dimension: ALLOC_MAX_U32,
+            max_task_invocations_per_workgroup: ALLOC_MAX_U32,
+            max_task_invocations_per_dimension: ALLOC_MAX_U32,
+            max_mesh_invocations_per_workgroup: ALLOC_MAX_U32,
+            max_mesh_invocations_per_dimension: ALLOC_MAX_U32,
+            max_task_payload_size: ALLOC_MAX_U32,
+            max_mesh_output_vertices: ALLOC_MAX_U32,
+            max_mesh_output_primitives: ALLOC_MAX_U32,
             max_mesh_output_layers: ALLOC_MAX_U32,
+            max_mesh_multiview_view_count: ALLOC_MAX_U32,
 
             max_blas_primitive_count: ALLOC_MAX_U32,
             max_blas_geometry_count: ALLOC_MAX_U32,
diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs
index f890336cff..61cb6ba498 100644
--- a/wgpu-hal/src/vulkan/adapter.rs
+++ b/wgpu-hal/src/vulkan/adapter.rs
@@ -1231,18 +1231,37 @@ impl PhysicalDeviceProperties {
             .min(limits.max_compute_work_group_count[1])
             .min(limits.max_compute_work_group_count[2]);
         let (
-            max_task_workgroup_total_count,
-            max_task_workgroups_per_dimension,
-            max_mesh_multiview_view_count,
+            max_task_mesh_workgroup_total_count,
+            max_task_mesh_workgroups_per_dimension,
+            max_task_invocations_per_workgroup,
+            max_task_invocations_per_dimension,
+            max_mesh_invocations_per_workgroup,
+            max_mesh_invocations_per_dimension,
+            max_task_payload_size,
+            max_mesh_output_vertices,
+            max_mesh_output_primitives,
             max_mesh_output_layers,
+            max_mesh_multiview_view_count,
         ) = match self.mesh_shader {
             Some(m) => (
-                m.max_task_work_group_total_count,
-                m.max_task_work_group_count.into_iter().min().unwrap(),
-                m.max_mesh_multiview_view_count,
+                m.max_task_work_group_total_count
+                    .min(m.max_mesh_work_group_total_count),
+                m.max_task_work_group_count
+                    .into_iter()
+                    .chain(m.max_mesh_work_group_count)
+                    .min()
+                    .unwrap(),
+                m.max_task_work_group_invocations,
+                m.max_task_work_group_size.into_iter().min().unwrap(),
+                m.max_mesh_work_group_invocations,
+                m.max_mesh_work_group_size.into_iter().min().unwrap(),
+                m.max_task_payload_size,
+                m.max_mesh_output_vertices,
+                m.max_mesh_output_primitives,
                 m.max_mesh_output_layers,
+                m.max_mesh_multiview_view_count,
             ),
-            None => (0, 0, 0, 0),
+            None => (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
         };
 
         // Prevent very large buffers on mesa and most android devices, and in all cases
@@ -1358,10 +1377,19 @@ impl PhysicalDeviceProperties {
             max_buffer_size,
             max_non_sampler_bindings: u32::MAX,
 
-            max_task_workgroup_total_count,
-            max_task_workgroups_per_dimension,
-            max_mesh_multiview_view_count,
+            max_task_mesh_workgroup_total_count,
+            max_task_mesh_workgroups_per_dimension,
+            max_task_invocations_per_workgroup,
+            max_task_invocations_per_dimension,
+
+            max_mesh_invocations_per_workgroup,
+            max_mesh_invocations_per_dimension,
+
+            max_task_payload_size,
+            max_mesh_output_vertices,
+            max_mesh_output_primitives,
             max_mesh_output_layers,
+            max_mesh_multiview_view_count,
 
             max_blas_primitive_count,
             max_blas_geometry_count,
diff --git a/wgpu-info/src/human.rs b/wgpu-info/src/human.rs
index a58779254a..81ddbbb85c 100644
--- a/wgpu-info/src/human.rs
+++ b/wgpu-info/src/human.rs
@@ -162,10 +162,18 @@ fn print_adapter(output: &mut impl io::Write, report: &AdapterReport, idx: usize
         max_push_constant_size,
         max_non_sampler_bindings,
 
-        max_task_workgroup_total_count,
-        max_task_workgroups_per_dimension,
-        max_mesh_multiview_view_count: max_mesh_multiview_count,
+        max_task_mesh_workgroup_total_count,
+        max_task_mesh_workgroups_per_dimension,
+        max_task_invocations_per_workgroup,
+        max_task_invocations_per_dimension,
+        max_mesh_invocations_per_workgroup,
+        max_mesh_invocations_per_dimension,
+        
+        max_task_payload_size,
+        max_mesh_output_vertices,
+        max_mesh_output_primitives,
         max_mesh_output_layers,
+        max_mesh_multiview_view_count,
 
         max_blas_primitive_count,
         max_blas_geometry_count,
@@ -210,10 +218,19 @@ fn print_adapter(output: &mut impl io::Write, report: &AdapterReport, idx: usize
     writeln!(output, "\t\t                       Max Compute Workgroup Size Z: {max_compute_workgroup_size_z}")?;
     writeln!(output, "\t\t               Max Compute Workgroups Per Dimension: {max_compute_workgroups_per_dimension}")?;
     
-    writeln!(output, "\t\t                     Max Task Workgroup Total Count: {max_task_workgroup_total_count}")?;
-    writeln!(output, "\t\t                  Max Task Workgroups Per Dimension: {max_task_workgroups_per_dimension}")?;
-    writeln!(output, "\t\t                      Max Mesh Multiview View Count: {max_mesh_multiview_count}")?;
+    writeln!(output, "\t\t                Max Task/Mesh Workgroup Total Count: {max_task_mesh_workgroup_total_count}")?;
+    writeln!(output, "\t\t             Max Task/Mesh Workgroups Per Dimension: {max_task_mesh_workgroups_per_dimension}")?;
+    writeln!(output, "\t\t                 Max Task Invocations Per Workgroup: {max_task_invocations_per_workgroup}")?;
+    writeln!(output, "\t\t                 Max Task Invocations Per Dimension: {max_task_invocations_per_dimension}")?;
+    writeln!(output, "\t\t                 Max Mesh Invocations Per Workgroup: {max_mesh_invocations_per_workgroup}")?;
+    writeln!(output, "\t\t                 Max Mesh Invocations Per Dimension: {max_mesh_invocations_per_dimension}")?;
+
+
+    writeln!(output, "\t\t                              Max Task Payload Size: {max_task_payload_size}")?;
+    writeln!(output, "\t\t                           Max Mesh Output Vertices: {max_mesh_output_vertices}")?;
+    writeln!(output, "\t\t                         Max Mesh Output Primitives: {max_mesh_output_primitives}")?;
     writeln!(output, "\t\t                             Max Mesh Output Layers: {max_mesh_output_layers}")?;
+    writeln!(output, "\t\t                      Max Mesh Multiview View Count: {max_mesh_multiview_view_count}")?;
 
     writeln!(output, "\t\t                           Max BLAS Primitive count: {max_blas_primitive_count}")?;
     writeln!(output, "\t\t                            Max BLAS Geometry count: {max_blas_geometry_count}")?;
diff --git a/wgpu-types/src/lib.rs b/wgpu-types/src/lib.rs
index 7021c33f17..0928fc19e8 100644
--- a/wgpu-types/src/lib.rs
+++ b/wgpu-types/src/lib.rs
@@ -517,10 +517,18 @@ macro_rules! with_limits {
         $macro_name!(max_push_constant_size, Ordering::Less);
         $macro_name!(max_non_sampler_bindings, Ordering::Less);
 
-        $macro_name!(max_task_workgroup_total_count, Ordering::Less);
-        $macro_name!(max_task_workgroups_per_dimension, Ordering::Less);
-        $macro_name!(max_mesh_multiview_view_count, Ordering::Less);
+        $macro_name!(max_task_mesh_workgroup_total_count, Ordering::Less);
+        $macro_name!(max_task_mesh_workgroups_per_dimension, Ordering::Less);
+        $macro_name!(max_task_invocations_per_workgroup, Ordering::Less);
+        $macro_name!(max_task_invocations_per_dimension, Ordering::Less);
+        $macro_name!(max_mesh_invocations_per_workgroup, Ordering::Less);
+        $macro_name!(max_mesh_invocations_per_dimension, Ordering::Less);
+
+        $macro_name!(max_task_payload_size, Ordering::Less);
+        $macro_name!(max_mesh_output_vertices, Ordering::Less);
+        $macro_name!(max_mesh_output_primitives, Ordering::Less);
         $macro_name!(max_mesh_output_layers, Ordering::Less);
+        $macro_name!(max_mesh_multiview_view_count, Ordering::Less);
 
         $macro_name!(max_blas_primitive_count, Ordering::Less);
         $macro_name!(max_blas_geometry_count, Ordering::Less);
@@ -694,14 +702,35 @@ pub struct Limits {
     /// to create many bind groups at the cost of a large up-front allocation at device creation.
     pub max_non_sampler_bindings: u32,
 
-    /// The maximum total value of x*y*z for a given `draw_mesh_tasks` command
-    pub max_task_workgroup_total_count: u32,
+    /// The maximum total value for a `RenderPass::draw_mesh_tasks(x, y, z)` operation.
+    /// Also for task shader outputs. Defaults to 65535. Higher is "better".
+    pub max_task_mesh_workgroup_total_count: u32,
     /// The maximum value for each dimension of a `RenderPass::draw_mesh_tasks(x, y, z)` operation.
-    /// Defaults to 65535. Higher is "better".
-    pub max_task_workgroups_per_dimension: u32,
-    /// The maximum number of layers that can be output from a mesh shader
+    /// Also for task shader outputs. Defaults to 256. Higher is "better".
+    pub max_task_mesh_workgroups_per_dimension: u32,
+    // These are fundamentally different. It is very common for limits on mesh shaders to be much lower,
+    // so as to properly use the hardware, where task shaders are usually just emulated with compute
+    // shaders. Therefore, we should have different limits for mesh vs task shaders.
+    /// Maximum total number of invocations, or threads, per task shader workgroup. Higher is "better".
+    pub max_task_invocations_per_workgroup: u32,
+    /// The maximum value for each dimension of a task shader's workgroup size. Higher is "better".
+    pub max_task_invocations_per_dimension: u32,
+    /// Maximum total number of invocations, or threads, per mesh shader workgroup. Higher is "better".
+    pub max_mesh_invocations_per_workgroup: u32,
+    /// The maximum value for each dimension of a mesh shader's workgroup size. Higher is "better".
+    pub max_mesh_invocations_per_dimension: u32,
+
+    /// The maximum size of the payload passed from task to mesh shader. Higher is "better".
+    pub max_task_payload_size: u32,
+    /// The maximum number of vertices that a mesh shader may output. Higher is "better".
+    pub max_mesh_output_vertices: u32,
+    /// The maximum number of primitives that a mesh shader may output. Higher is "better".
+    pub max_mesh_output_primitives: u32,
+    /// The maximum number of layers that can be output from a mesh shader. Higher is "better".
+    /// See [#8509](https://github.com/gfx-rs/wgpu/issues/8509).
     pub max_mesh_output_layers: u32,
-    /// The maximum number of views that can be used by a mesh shader in multiview rendering
+    /// The maximum number of views that can be used by a mesh shader in multiview rendering.
+    /// Higher is "better".
     pub max_mesh_multiview_view_count: u32,
 
     /// The maximum number of primitive (ex: triangles, aabbs) a BLAS is allowed to have. Requesting
@@ -775,10 +804,17 @@ impl Limits {
     ///     max_subgroup_size: 0,
     ///     max_push_constant_size: 0,
     ///     max_non_sampler_bindings: 1_000_000,
-    ///     max_task_workgroup_total_count: 0,
-    ///     max_task_workgroups_per_dimension: 0,
-    ///     max_mesh_multiview_view_count: 0,
+    ///     max_task_mesh_workgroup_total_count: 0,
+    ///     max_task_mesh_workgroups_per_dimension: 0,
+    ///     max_task_invocations_per_workgroup: 0,
+    ///     max_task_invocations_per_dimension: 0,
+    ///     max_mesh_invocations_per_workgroup: 0,
+    ///     max_mesh_invocations_per_dimension: 0,
+    ///     max_task_payload_size: 0,
+    ///     max_mesh_output_vertices: 0,
+    ///     max_mesh_output_primitives: 0,
     ///     max_mesh_output_layers: 0,
+    ///     max_mesh_multiview_view_count: 0,
     ///     max_blas_primitive_count: 0,
     ///     max_blas_geometry_count: 0,
     ///     max_tlas_instance_count: 0,
@@ -829,10 +865,17 @@ impl Limits {
             max_push_constant_size: 0,
             max_non_sampler_bindings: 1_000_000,
 
-            max_task_workgroup_total_count: 0,
-            max_task_workgroups_per_dimension: 0,
-            max_mesh_multiview_view_count: 0,
+            max_task_mesh_workgroup_total_count: 0,
+            max_task_mesh_workgroups_per_dimension: 0,
+            max_task_invocations_per_workgroup: 0,
+            max_task_invocations_per_dimension: 0,
+            max_mesh_invocations_per_workgroup: 0,
+            max_mesh_invocations_per_dimension: 0,
+            max_task_payload_size: 0,
+            max_mesh_output_vertices: 0,
+            max_mesh_output_primitives: 0,
             max_mesh_output_layers: 0,
+            max_mesh_multiview_view_count: 0,
 
             max_blas_primitive_count: 0,
             max_blas_geometry_count: 0,
@@ -886,10 +929,17 @@ impl Limits {
     ///     max_buffer_size: 256 << 20, // (256 MiB)
     ///     max_non_sampler_bindings: 1_000_000,
     ///
-    ///     max_task_workgroup_total_count: 0,
-    ///     max_task_workgroups_per_dimension: 0,
-    ///     max_mesh_multiview_view_count: 0,
+    ///     max_task_mesh_workgroup_total_count: 0,
+    ///     max_task_mesh_workgroups_per_dimension: 0,
+    ///     max_task_invocations_per_workgroup: 0,
+    ///     max_task_invocations_per_dimension: 0,
+    ///     max_mesh_invocations_per_workgroup: 0,
+    ///     max_mesh_invocations_per_dimension: 0,
+    ///     max_task_payload_size: 0,
+    ///     max_mesh_output_vertices: 0,
+    ///     max_mesh_output_primitives: 0,
     ///     max_mesh_output_layers: 0,
+    ///     max_mesh_multiview_view_count: 0,
     ///
     ///     max_blas_primitive_count: 0,
     ///     max_blas_geometry_count: 0,
@@ -910,11 +960,6 @@ impl Limits {
             max_color_attachments: 4,
             // see: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf#page=7
             max_compute_workgroup_storage_size: 16352,
-
-            max_task_workgroups_per_dimension: 0,
-            max_task_workgroup_total_count: 0,
-            max_mesh_multiview_view_count: 0,
-            max_mesh_output_layers: 0,
             ..Self::defaults()
         }
     }
@@ -963,10 +1008,17 @@ impl Limits {
     ///     max_buffer_size: 256 << 20, // (256 MiB),
     ///     max_non_sampler_bindings: 1_000_000,
     ///
-    ///     max_task_workgroup_total_count: 0,
-    ///     max_task_workgroups_per_dimension: 0,
-    ///     max_mesh_multiview_view_count: 0,
+    ///     max_task_mesh_workgroup_total_count: 0,
+    ///     max_task_mesh_workgroups_per_dimension: 0,
+    ///     max_task_invocations_per_workgroup: 0,
+    ///     max_task_invocations_per_dimension: 0,
+    ///     max_mesh_invocations_per_workgroup: 0,
+    ///     max_mesh_invocations_per_dimension: 0,
+    ///     max_task_payload_size: 0,
+    ///     max_mesh_output_vertices: 0,
+    ///     max_mesh_output_primitives: 0,
     ///     max_mesh_output_layers: 0,
+    ///     max_mesh_multiview_view_count: 0,
     ///
     ///     max_blas_primitive_count: 0,
     ///     max_blas_geometry_count: 0,
@@ -1065,12 +1117,26 @@ impl Limits {
             // Literally just made this up as 256^2 or 2^16.
             // My GPU supports 2^22, and compute shaders don't have this kind of limit.
             // This very likely is never a real limiter
-            max_task_workgroup_total_count: 65536,
-            max_task_workgroups_per_dimension: 256,
-            // llvmpipe reports 0 multiview count, which just means no multiview is allowed
-            max_mesh_multiview_view_count: 0,
+            max_task_mesh_workgroup_total_count: 65536,
+            max_task_mesh_workgroups_per_dimension: 256,
+            // Copied from compute limits, this is low enough that it should be sensible.
+            max_task_invocations_per_workgroup: 256,
+            max_task_invocations_per_dimension: 64,
+
+            // DX12 limitation, revisit for vulkan
+            max_mesh_invocations_per_workgroup: 128,
+            max_mesh_invocations_per_dimension: 128,
+
+            // DX12 specifies this as minimum
+            max_task_payload_size: 16_384,
+            // DX12 limitation, revisit for vulkan
+            max_mesh_output_vertices: 256,
+            max_mesh_output_primitives: 256,
             // llvmpipe once again requires this to be 8. An RTX 3060 supports well over 1024.
+            // Also DX12 vaguely suggests going over this is illegal in some cases.
             max_mesh_output_layers: 8,
+            // llvmpipe reports 0 multiview count, which just means no multiview is allowed
+            max_mesh_multiview_view_count: 0,
             ..self
         }
     }
diff --git a/wgpu/src/backend/webgpu.rs b/wgpu/src/backend/webgpu.rs
index 800820bbb3..fecdaccfe4 100644
--- a/wgpu/src/backend/webgpu.rs
+++ b/wgpu/src/backend/webgpu.rs
@@ -826,8 +826,21 @@ fn map_wgt_limits(limits: webgpu_sys::GpuSupportedLimits) -> wgt::Limits {
         max_non_sampler_bindings: wgt::Limits::default().max_non_sampler_bindings,
         max_inter_stage_shader_components: wgt::Limits::default().max_inter_stage_shader_components,
 
-        max_task_workgroup_total_count: wgt::Limits::default().max_task_workgroup_total_count,
-        max_task_workgroups_per_dimension: wgt::Limits::default().max_task_workgroups_per_dimension,
+        max_task_mesh_workgroup_total_count: wgt::Limits::default()
+            .max_task_mesh_workgroup_total_count,
+        max_task_mesh_workgroups_per_dimension: wgt::Limits::default()
+            .max_task_mesh_workgroups_per_dimension,
+        max_task_invocations_per_workgroup: wgt::Limits::default()
+            .max_task_invocations_per_workgroup,
+        max_task_invocations_per_dimension: wgt::Limits::default()
+            .max_task_invocations_per_dimension,
+        max_mesh_invocations_per_workgroup: wgt::Limits::default()
+            .max_mesh_invocations_per_workgroup,
+        max_mesh_invocations_per_dimension: wgt::Limits::default()
+            .max_mesh_invocations_per_dimension,
+        max_task_payload_size: wgt::Limits::default().max_task_payload_size,
+        max_mesh_output_vertices: wgt::Limits::default().max_mesh_output_vertices,
+        max_mesh_output_primitives: wgt::Limits::default().max_mesh_output_primitives,
         max_mesh_output_layers: wgt::Limits::default().max_mesh_output_layers,
         max_mesh_multiview_view_count: wgt::Limits::default().max_mesh_multiview_view_count,