diff --git a/wgpu-core/src/command/bundle.rs b/wgpu-core/src/command/bundle.rs index ffa90d505e..6cec9e2b21 100644 --- a/wgpu-core/src/command/bundle.rs +++ b/wgpu-core/src/command/bundle.rs @@ -816,8 +816,8 @@ fn draw_mesh_tasks( let pipeline = state.pipeline()?; let used_bind_groups = pipeline.used_bind_groups; - let groups_size_limit = state.device.limits.max_task_workgroups_per_dimension; - let max_groups = state.device.limits.max_task_workgroup_total_count; + let groups_size_limit = state.device.limits.max_task_mesh_workgroups_per_dimension; + let max_groups = state.device.limits.max_task_mesh_workgroup_total_count; if group_count_x > groups_size_limit || group_count_y > groups_size_limit || group_count_z > groups_size_limit diff --git a/wgpu-core/src/command/render.rs b/wgpu-core/src/command/render.rs index a60230a461..d5553e9abb 100644 --- a/wgpu-core/src/command/render.rs +++ b/wgpu-core/src/command/render.rs @@ -2709,8 +2709,13 @@ fn draw_mesh_tasks( .base .device .limits - .max_task_workgroups_per_dimension; - let max_groups = state.pass.base.device.limits.max_task_workgroup_total_count; + .max_task_mesh_workgroups_per_dimension; + let max_groups = state + .pass + .base + .device + .limits + .max_task_mesh_workgroup_total_count; if group_count_x > groups_size_limit || group_count_y > groups_size_limit || group_count_z > groups_size_limit diff --git a/wgpu-core/src/device/resource.rs b/wgpu-core/src/device/resource.rs index bf29520674..b0a1258c49 100644 --- a/wgpu-core/src/device/resource.rs +++ b/wgpu-core/src/device/resource.rs @@ -3972,7 +3972,7 @@ impl Device { self.require_features(wgt::Features::VERTEX_ATTRIBUTE_64BIT)?; } - let previous = io.insert( + let previous = io.varyings.insert( attribute.shader_location, validation::InterfaceVar::vertex_attribute(attribute.format), ); @@ -4363,20 +4363,18 @@ impl Device { ) .map_err(stage_err)?; - if validated_stages == wgt::ShaderStages::VERTEX { - if let Some(ref interface) = shader_module.interface { - io = interface - .check_stage( - &mut binding_layout_source, - &mut shader_binding_sizes, - &fragment_entry_point_name, - stage, - io, - desc.depth_stencil.as_ref().map(|d| d.depth_compare), - ) - .map_err(stage_err)?; - validated_stages |= stage; - } + if let Some(ref interface) = shader_module.interface { + io = interface + .check_stage( + &mut binding_layout_source, + &mut shader_binding_sizes, + &fragment_entry_point_name, + stage, + io, + desc.depth_stencil.as_ref().map(|d| d.depth_compare), + ) + .map_err(stage_err)?; + validated_stages |= stage; } if let Some(ref interface) = shader_module.interface { @@ -4412,7 +4410,7 @@ impl Device { } if validated_stages.contains(wgt::ShaderStages::FRAGMENT) { - for (i, output) in io.iter() { + for (i, output) in io.varyings.iter() { match color_targets.get(*i as usize) { Some(Some(state)) => { validation::check_texture_format(state.format, &output.ty).map_err( diff --git a/wgpu-core/src/validation.rs b/wgpu-core/src/validation.rs index ffe2c7e757..df8c555699 100644 --- a/wgpu-core/src/validation.rs +++ b/wgpu-core/src/validation.rs @@ -129,6 +129,7 @@ pub struct InterfaceVar { pub ty: NumericType, interpolation: Option, sampling: Option, + per_primitive: bool, } impl InterfaceVar { @@ -137,6 +138,7 @@ impl InterfaceVar { ty: NumericType::from_vertex_format(format), interpolation: None, sampling: None, + per_primitive: false, } } } @@ -164,6 +166,12 @@ struct SpecializationConstant { ty: NumericType, } +#[derive(Debug)] +struct EntryPointMeshInfo { + max_vertices: u32, + max_primitives: u32, +} + #[derive(Debug, Default)] struct EntryPoint { inputs: Vec, @@ -174,6 +182,8 @@ struct EntryPoint { sampling_pairs: FastHashSet<(naga::Handle, naga::Handle)>, workgroup_size: [u32; 3], dual_source_blending: bool, + task_payload_size: Option, + mesh_info: Option, } #[derive(Debug)] @@ -260,6 +270,8 @@ pub enum InputError { InterpolationMismatch(Option), #[error("Input sampling doesn't match provided {0:?}")] SamplingMismatch(Option), + #[error("Pipeline input has perprimitive: {expected} but shader declares perprimitive: {}", !expected)] + WrongPerPrimitive { expected: bool }, } impl WebGpuError for InputError { @@ -321,6 +333,22 @@ pub enum StageError { var: InterfaceVar, limit: u32, }, + #[error("Mesh shaders are limited to {limit} output vertices, but the shader has a maximum number of {value}")] + TooManyMeshVertices { limit: u32, value: u32 }, + #[error("Mesh shaders are limited to {limit} output primitives, but the shader has a maximum number of {value}")] + TooManyMeshPrimitives { limit: u32, value: u32 }, + #[error("Mesh or task shaders are limited to {limit} bytes of task payload, but the shader has a task payload of size {value}")] + TaskPayloadTooLarge { limit: u32, value: u32 }, + #[error("Mesh shader's task payload has size {shader:?}, which doesn't match input from previous stage {input:?}")] + TaskPayloadMustMatch { + input: Option, + shader: Option, + }, + #[error("Primitive index can only be used in a fragment shader if the preceding shader was a vertex shader or a mesh shader that writes to primitive index. + If a mesh shader writes to primitive index, it must be read by the fragment shader.")] + PrimitiveIndexError, + #[error("Draw id can only be used in a mesh shader if the pipeline has no task shader.")] + DrawIdError, } impl WebGpuError for StageError { @@ -343,7 +371,13 @@ impl WebGpuError for StageError { | Self::MissingEntryPoint(..) | Self::NoEntryPointFound | Self::MultipleEntryPointsFound - | Self::ColorAttachmentLocationTooLarge { .. } => return ErrorType::Validation, + | Self::ColorAttachmentLocationTooLarge { .. } + | Self::TooManyMeshVertices { .. } + | Self::TooManyMeshPrimitives { .. } + | Self::TaskPayloadTooLarge { .. } + | Self::TaskPayloadMustMatch { .. } + | Self::PrimitiveIndexError + | Self::DrawIdError => return ErrorType::Validation, }; e.webgpu_error_type() } @@ -916,7 +950,18 @@ impl<'a> BindingLayoutSource<'a> { } } -pub type StageIo = FastHashMap; +#[derive(Debug, Clone, Default)] +pub struct StageIo { + pub varyings: FastHashMap, + /// This must match between mesh & task shaders + pub task_payload_size: Option, + /// Fragment shaders cannot input primitive index on mesh shaders that don't output it on DX12. + /// Therefore, we track between shader stages if primitive index is written (or if vertex shader + /// is used). + /// + /// This is Some if it was a mesh shader. + pub primitive_index: Option, +} impl Interface { fn populate( @@ -963,13 +1008,15 @@ impl Interface { location, interpolation, sampling, - .. // second_blend_source + per_primitive, + blend_src: _, }) => Varying::Local { location, iv: InterfaceVar { ty: numeric_ty, interpolation, sampling, + per_primitive, }, }, Some(&naga::Binding::BuiltIn(built_in)) => Varying::BuiltIn(built_in), @@ -1057,6 +1104,32 @@ impl Interface { ep.dual_source_blending = info.dual_source_blending; ep.workgroup_size = entry_point.workgroup_size; + if let Some(task_payload) = entry_point.task_payload { + ep.task_payload_size = Some( + module.types[module.global_variables[task_payload].ty] + .inner + .size(module.to_ctx()), + ); + } + if let Some(ref mesh_info) = entry_point.mesh_info { + ep.mesh_info = Some(EntryPointMeshInfo { + max_vertices: mesh_info.max_vertices, + max_primitives: mesh_info.max_primitives, + }); + Self::populate( + &mut ep.outputs, + None, + mesh_info.vertex_output_type, + &module.types, + ); + Self::populate( + &mut ep.outputs, + None, + mesh_info.primitive_output_type, + &module.types, + ); + } + entry_points.insert((entry_point.stage, entry_point.name.clone()), ep); } @@ -1117,7 +1190,7 @@ impl Interface { Some(some) => some, None => return Err(StageError::MissingEntryPoint(pair.1)), }; - let (_stage, entry_point_name) = pair; + let (_, entry_point_name) = pair; // check resources visibility for &handle in entry_point.resources.iter() { @@ -1241,47 +1314,74 @@ impl Interface { // check workgroup size limits if shader_stage.compute_like() { - let max_workgroup_size_limits = [ - self.limits.max_compute_workgroup_size_x, - self.limits.max_compute_workgroup_size_y, - self.limits.max_compute_workgroup_size_z, - ]; + let (max_workgroup_size_limits, max_workgroup_size_total) = match shader_stage { + naga::ShaderStage::Compute => ( + [ + self.limits.max_compute_workgroup_size_x, + self.limits.max_compute_workgroup_size_y, + self.limits.max_compute_workgroup_size_z, + ], + self.limits.max_compute_invocations_per_workgroup, + ), + naga::ShaderStage::Task => ( + [ + self.limits.max_task_invocations_per_dimension, + self.limits.max_task_invocations_per_dimension, + self.limits.max_task_invocations_per_dimension, + ], + self.limits.max_task_invocations_per_workgroup, + ), + naga::ShaderStage::Mesh => ( + [ + self.limits.max_mesh_invocations_per_dimension, + self.limits.max_mesh_invocations_per_dimension, + self.limits.max_mesh_invocations_per_dimension, + ], + self.limits.max_mesh_invocations_per_workgroup, + ), + _ => unreachable!(), + }; let total_invocations = entry_point.workgroup_size.iter().product::(); if entry_point.workgroup_size.contains(&0) - || total_invocations > self.limits.max_compute_invocations_per_workgroup - || entry_point.workgroup_size[0] > max_workgroup_size_limits[0] - || entry_point.workgroup_size[1] > max_workgroup_size_limits[1] - || entry_point.workgroup_size[2] > max_workgroup_size_limits[2] + || total_invocations > max_workgroup_size_total + || { + entry_point.workgroup_size[0] > max_workgroup_size_limits[0] + || entry_point.workgroup_size[1] > max_workgroup_size_limits[1] + || entry_point.workgroup_size[2] > max_workgroup_size_limits[2] + } { return Err(StageError::InvalidWorkgroupSize { current: entry_point.workgroup_size, current_total: total_invocations, limit: max_workgroup_size_limits, - total: self.limits.max_compute_invocations_per_workgroup, + total: max_workgroup_size_total, }); } } let mut inter_stage_components = 0; + let mut has_primitive_index = false; + let mut has_draw_id = false; // check inputs compatibility for input in entry_point.inputs.iter() { match *input { Varying::Local { location, ref iv } => { - let result = - inputs - .get(&location) - .ok_or(InputError::Missing) - .and_then(|provided| { - let (compatible, num_components) = match shader_stage { + let result = inputs + .varyings + .get(&location) + .ok_or(InputError::Missing) + .and_then(|provided| { + let (compatible, num_components, per_primitive_correct) = + match shader_stage { // For vertex attributes, there are defaults filled out // by the driver if data is not provided. naga::ShaderStage::Vertex => { let is_compatible = iv.ty.scalar.kind == provided.ty.scalar.kind; // vertex inputs don't count towards inter-stage - (is_compatible, 0) + (is_compatible, 0, !iv.per_primitive) } naga::ShaderStage::Fragment => { if iv.interpolation != provided.interpolation { @@ -1297,20 +1397,24 @@ impl Interface { ( iv.ty.is_subtype_of(&provided.ty), iv.ty.dim.num_components(), + iv.per_primitive == provided.per_primitive, ) } - naga::ShaderStage::Compute => (false, 0), - // TODO: add validation for these, see https://github.com/gfx-rs/wgpu/issues/8003 - naga::ShaderStage::Task | naga::ShaderStage::Mesh => { - unreachable!() - } + // These can't have varying inputs + naga::ShaderStage::Compute + | naga::ShaderStage::Task + | naga::ShaderStage::Mesh => (false, 0, false), }; - if compatible { - Ok(num_components) - } else { - Err(InputError::WrongType(provided.ty)) - } - }); + if !compatible { + Err(InputError::WrongType(provided.ty)) + } else if !per_primitive_correct { + Err(InputError::WrongPerPrimitive { + expected: provided.per_primitive, + }) + } else { + Ok(num_components) + } + }); match result { Ok(num_components) => { inter_stage_components += num_components; @@ -1324,6 +1428,12 @@ impl Interface { } } } + Varying::BuiltIn(naga::BuiltIn::PrimitiveIndex) => { + has_primitive_index = true; + } + Varying::BuiltIn(naga::BuiltIn::DrawID) => { + has_draw_id = true; + } Varying::BuiltIn(_) => {} } } @@ -1386,6 +1496,51 @@ impl Interface { }); } + if let Some(ref mesh_info) = entry_point.mesh_info { + if mesh_info.max_vertices > self.limits.max_mesh_output_vertices { + return Err(StageError::TooManyMeshVertices { + limit: self.limits.max_mesh_output_vertices, + value: mesh_info.max_vertices, + }); + } + if mesh_info.max_primitives > self.limits.max_mesh_output_primitives { + return Err(StageError::TooManyMeshPrimitives { + limit: self.limits.max_mesh_output_primitives, + value: mesh_info.max_primitives, + }); + } + } + if let Some(task_payload_size) = entry_point.task_payload_size { + if task_payload_size > self.limits.max_task_payload_size { + return Err(StageError::TaskPayloadTooLarge { + limit: self.limits.max_task_payload_size, + value: task_payload_size, + }); + } + } + if shader_stage == naga::ShaderStage::Mesh + && entry_point.task_payload_size != inputs.task_payload_size + { + return Err(StageError::TaskPayloadMustMatch { + input: inputs.task_payload_size, + shader: entry_point.task_payload_size, + }); + } + + // Fragment shader primitive index is treated like a varying + if let Some(primitive_index) = inputs.primitive_index { + if primitive_index != has_primitive_index && shader_stage == naga::ShaderStage::Fragment + { + return Err(StageError::PrimitiveIndexError); + } + } + if shader_stage == naga::ShaderStage::Mesh + && inputs.task_payload_size.is_some() + && has_draw_id + { + return Err(StageError::DrawIdError); + } + let outputs = entry_point .outputs .iter() @@ -1395,7 +1550,15 @@ impl Interface { }) .collect(); - Ok(outputs) + Ok(StageIo { + task_payload_size: entry_point.task_payload_size, + varyings: outputs, + primitive_index: if shader_stage == naga::ShaderStage::Mesh { + Some(has_primitive_index) + } else { + None + }, + }) } pub fn fragment_uses_dual_source_blending( diff --git a/wgpu-hal/src/dx12/adapter.rs b/wgpu-hal/src/dx12/adapter.rs index d19998ab76..ef6b6dd0aa 100644 --- a/wgpu-hal/src/dx12/adapter.rs +++ b/wgpu-hal/src/dx12/adapter.rs @@ -732,18 +732,28 @@ impl super::Adapter { max_non_sampler_bindings: 1_000_000, // Source: https://microsoft.github.io/DirectX-Specs/d3d/MeshShader.html#dispatchmesh-api - max_task_workgroup_total_count: 2u32.pow(22), + max_task_mesh_workgroup_total_count: 2u32.pow(22), // Technically it says "64k" but I highly doubt they want 65536 for compute and exactly 64,000 for task workgroups - max_task_workgroups_per_dimension: + max_task_mesh_workgroups_per_dimension: Direct3D12::D3D12_CS_DISPATCH_MAX_THREAD_GROUPS_PER_DIMENSION, - // Multiview not supported by WGPU yet + // Assume this inherits from compute shaders + max_task_invocations_per_workgroup: + Direct3D12::D3D12_CS_4_X_THREAD_GROUP_MAX_THREADS_PER_GROUP, + max_task_invocations_per_dimension: Direct3D12::D3D12_CS_THREAD_GROUP_MAX_Z, + // Source: https://microsoft.github.io/DirectX-Specs/d3d/MeshShader.html#amplification-shader-and-mesh-shader + max_mesh_invocations_per_workgroup: 128, + max_mesh_invocations_per_dimension: 128, + + max_task_payload_size: 16384, + max_mesh_output_vertices: 256, + max_mesh_output_primitives: 256, + // Source: https://microsoft.github.io/DirectX-Specs/d3d/MeshShader.html#sv_rendertargetarrayindex-limitations-based-on-queryable-capability + max_mesh_output_layers: if mesh_shader_supported { 8 } else { 0 }, max_mesh_multiview_view_count: if mesh_shader_supported { max_multiview_view_count } else { 0 }, - // This seems to be right, and I can't find anything to suggest it would be less than the 2048 provided here - max_mesh_output_layers: Direct3D12::D3D12_REQ_TEXTURE2D_ARRAY_AXIS_DIMENSION, max_blas_primitive_count: if supports_ray_tracing { 1 << 29 // 2^29 diff --git a/wgpu-hal/src/gles/adapter.rs b/wgpu-hal/src/gles/adapter.rs index 6ee5f71373..0bc5fa91c2 100644 --- a/wgpu-hal/src/gles/adapter.rs +++ b/wgpu-hal/src/gles/adapter.rs @@ -803,10 +803,17 @@ impl super::Adapter { max_buffer_size: i32::MAX as u64, max_non_sampler_bindings: u32::MAX, - max_task_workgroup_total_count: 0, - max_task_workgroups_per_dimension: 0, - max_mesh_multiview_view_count: 0, + max_task_mesh_workgroup_total_count: 0, + max_task_mesh_workgroups_per_dimension: 0, + max_task_invocations_per_workgroup: 0, + max_task_invocations_per_dimension: 0, + max_mesh_invocations_per_workgroup: 0, + max_mesh_invocations_per_dimension: 0, + max_task_payload_size: 0, + max_mesh_output_vertices: 0, + max_mesh_output_primitives: 0, max_mesh_output_layers: 0, + max_mesh_multiview_view_count: 0, max_blas_primitive_count: 0, max_blas_geometry_count: 0, diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs index fa3d2fa8d4..091fb1c108 100644 --- a/wgpu-hal/src/metal/adapter.rs +++ b/wgpu-hal/src/metal/adapter.rs @@ -1102,10 +1102,17 @@ impl super::PrivateCapabilities { max_buffer_size: self.max_buffer_size, max_non_sampler_bindings: u32::MAX, - max_task_workgroup_total_count: 0, - max_task_workgroups_per_dimension: 0, - max_mesh_multiview_view_count: 0, + max_task_mesh_workgroup_total_count: 0, + max_task_mesh_workgroups_per_dimension: 0, + max_task_invocations_per_workgroup: 0, + max_task_invocations_per_dimension: 0, + max_mesh_invocations_per_workgroup: 0, + max_mesh_invocations_per_dimension: 0, + max_task_payload_size: 0, + max_mesh_output_vertices: 0, + max_mesh_output_primitives: 0, max_mesh_output_layers: 0, + max_mesh_multiview_view_count: 0, max_blas_primitive_count: 0, // When added: 2^28 from https://developer.apple.com/documentation/metal/mtlaccelerationstructureusage/extendedlimits max_blas_geometry_count: 0, // When added: 2^24 diff --git a/wgpu-hal/src/noop/mod.rs b/wgpu-hal/src/noop/mod.rs index b6c110df8d..4ef2c3f298 100644 --- a/wgpu-hal/src/noop/mod.rs +++ b/wgpu-hal/src/noop/mod.rs @@ -194,10 +194,17 @@ pub const CAPABILITIES: crate::Capabilities = { max_push_constant_size: ALLOC_MAX_U32, max_non_sampler_bindings: ALLOC_MAX_U32, - max_task_workgroup_total_count: ALLOC_MAX_U32, - max_task_workgroups_per_dimension: ALLOC_MAX_U32, - max_mesh_multiview_view_count: ALLOC_MAX_U32, + max_task_mesh_workgroup_total_count: ALLOC_MAX_U32, + max_task_mesh_workgroups_per_dimension: ALLOC_MAX_U32, + max_task_invocations_per_workgroup: ALLOC_MAX_U32, + max_task_invocations_per_dimension: ALLOC_MAX_U32, + max_mesh_invocations_per_workgroup: ALLOC_MAX_U32, + max_mesh_invocations_per_dimension: ALLOC_MAX_U32, + max_task_payload_size: ALLOC_MAX_U32, + max_mesh_output_vertices: ALLOC_MAX_U32, + max_mesh_output_primitives: ALLOC_MAX_U32, max_mesh_output_layers: ALLOC_MAX_U32, + max_mesh_multiview_view_count: ALLOC_MAX_U32, max_blas_primitive_count: ALLOC_MAX_U32, max_blas_geometry_count: ALLOC_MAX_U32, diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs index f890336cff..61cb6ba498 100644 --- a/wgpu-hal/src/vulkan/adapter.rs +++ b/wgpu-hal/src/vulkan/adapter.rs @@ -1231,18 +1231,37 @@ impl PhysicalDeviceProperties { .min(limits.max_compute_work_group_count[1]) .min(limits.max_compute_work_group_count[2]); let ( - max_task_workgroup_total_count, - max_task_workgroups_per_dimension, - max_mesh_multiview_view_count, + max_task_mesh_workgroup_total_count, + max_task_mesh_workgroups_per_dimension, + max_task_invocations_per_workgroup, + max_task_invocations_per_dimension, + max_mesh_invocations_per_workgroup, + max_mesh_invocations_per_dimension, + max_task_payload_size, + max_mesh_output_vertices, + max_mesh_output_primitives, max_mesh_output_layers, + max_mesh_multiview_view_count, ) = match self.mesh_shader { Some(m) => ( - m.max_task_work_group_total_count, - m.max_task_work_group_count.into_iter().min().unwrap(), - m.max_mesh_multiview_view_count, + m.max_task_work_group_total_count + .min(m.max_mesh_work_group_total_count), + m.max_task_work_group_count + .into_iter() + .chain(m.max_mesh_work_group_count) + .min() + .unwrap(), + m.max_task_work_group_invocations, + m.max_task_work_group_size.into_iter().min().unwrap(), + m.max_mesh_work_group_invocations, + m.max_mesh_work_group_size.into_iter().min().unwrap(), + m.max_task_payload_size, + m.max_mesh_output_vertices, + m.max_mesh_output_primitives, m.max_mesh_output_layers, + m.max_mesh_multiview_view_count, ), - None => (0, 0, 0, 0), + None => (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), }; // Prevent very large buffers on mesa and most android devices, and in all cases @@ -1358,10 +1377,19 @@ impl PhysicalDeviceProperties { max_buffer_size, max_non_sampler_bindings: u32::MAX, - max_task_workgroup_total_count, - max_task_workgroups_per_dimension, - max_mesh_multiview_view_count, + max_task_mesh_workgroup_total_count, + max_task_mesh_workgroups_per_dimension, + max_task_invocations_per_workgroup, + max_task_invocations_per_dimension, + + max_mesh_invocations_per_workgroup, + max_mesh_invocations_per_dimension, + + max_task_payload_size, + max_mesh_output_vertices, + max_mesh_output_primitives, max_mesh_output_layers, + max_mesh_multiview_view_count, max_blas_primitive_count, max_blas_geometry_count, diff --git a/wgpu-info/src/human.rs b/wgpu-info/src/human.rs index a58779254a..81ddbbb85c 100644 --- a/wgpu-info/src/human.rs +++ b/wgpu-info/src/human.rs @@ -162,10 +162,18 @@ fn print_adapter(output: &mut impl io::Write, report: &AdapterReport, idx: usize max_push_constant_size, max_non_sampler_bindings, - max_task_workgroup_total_count, - max_task_workgroups_per_dimension, - max_mesh_multiview_view_count: max_mesh_multiview_count, + max_task_mesh_workgroup_total_count, + max_task_mesh_workgroups_per_dimension, + max_task_invocations_per_workgroup, + max_task_invocations_per_dimension, + max_mesh_invocations_per_workgroup, + max_mesh_invocations_per_dimension, + + max_task_payload_size, + max_mesh_output_vertices, + max_mesh_output_primitives, max_mesh_output_layers, + max_mesh_multiview_view_count, max_blas_primitive_count, max_blas_geometry_count, @@ -210,10 +218,19 @@ fn print_adapter(output: &mut impl io::Write, report: &AdapterReport, idx: usize writeln!(output, "\t\t Max Compute Workgroup Size Z: {max_compute_workgroup_size_z}")?; writeln!(output, "\t\t Max Compute Workgroups Per Dimension: {max_compute_workgroups_per_dimension}")?; - writeln!(output, "\t\t Max Task Workgroup Total Count: {max_task_workgroup_total_count}")?; - writeln!(output, "\t\t Max Task Workgroups Per Dimension: {max_task_workgroups_per_dimension}")?; - writeln!(output, "\t\t Max Mesh Multiview View Count: {max_mesh_multiview_count}")?; + writeln!(output, "\t\t Max Task/Mesh Workgroup Total Count: {max_task_mesh_workgroup_total_count}")?; + writeln!(output, "\t\t Max Task/Mesh Workgroups Per Dimension: {max_task_mesh_workgroups_per_dimension}")?; + writeln!(output, "\t\t Max Task Invocations Per Workgroup: {max_task_invocations_per_workgroup}")?; + writeln!(output, "\t\t Max Task Invocations Per Dimension: {max_task_invocations_per_dimension}")?; + writeln!(output, "\t\t Max Mesh Invocations Per Workgroup: {max_mesh_invocations_per_workgroup}")?; + writeln!(output, "\t\t Max Mesh Invocations Per Dimension: {max_mesh_invocations_per_dimension}")?; + + + writeln!(output, "\t\t Max Task Payload Size: {max_task_payload_size}")?; + writeln!(output, "\t\t Max Mesh Output Vertices: {max_mesh_output_vertices}")?; + writeln!(output, "\t\t Max Mesh Output Primitives: {max_mesh_output_primitives}")?; writeln!(output, "\t\t Max Mesh Output Layers: {max_mesh_output_layers}")?; + writeln!(output, "\t\t Max Mesh Multiview View Count: {max_mesh_multiview_view_count}")?; writeln!(output, "\t\t Max BLAS Primitive count: {max_blas_primitive_count}")?; writeln!(output, "\t\t Max BLAS Geometry count: {max_blas_geometry_count}")?; diff --git a/wgpu-types/src/lib.rs b/wgpu-types/src/lib.rs index 7021c33f17..0928fc19e8 100644 --- a/wgpu-types/src/lib.rs +++ b/wgpu-types/src/lib.rs @@ -517,10 +517,18 @@ macro_rules! with_limits { $macro_name!(max_push_constant_size, Ordering::Less); $macro_name!(max_non_sampler_bindings, Ordering::Less); - $macro_name!(max_task_workgroup_total_count, Ordering::Less); - $macro_name!(max_task_workgroups_per_dimension, Ordering::Less); - $macro_name!(max_mesh_multiview_view_count, Ordering::Less); + $macro_name!(max_task_mesh_workgroup_total_count, Ordering::Less); + $macro_name!(max_task_mesh_workgroups_per_dimension, Ordering::Less); + $macro_name!(max_task_invocations_per_workgroup, Ordering::Less); + $macro_name!(max_task_invocations_per_dimension, Ordering::Less); + $macro_name!(max_mesh_invocations_per_workgroup, Ordering::Less); + $macro_name!(max_mesh_invocations_per_dimension, Ordering::Less); + + $macro_name!(max_task_payload_size, Ordering::Less); + $macro_name!(max_mesh_output_vertices, Ordering::Less); + $macro_name!(max_mesh_output_primitives, Ordering::Less); $macro_name!(max_mesh_output_layers, Ordering::Less); + $macro_name!(max_mesh_multiview_view_count, Ordering::Less); $macro_name!(max_blas_primitive_count, Ordering::Less); $macro_name!(max_blas_geometry_count, Ordering::Less); @@ -694,14 +702,35 @@ pub struct Limits { /// to create many bind groups at the cost of a large up-front allocation at device creation. pub max_non_sampler_bindings: u32, - /// The maximum total value of x*y*z for a given `draw_mesh_tasks` command - pub max_task_workgroup_total_count: u32, + /// The maximum total value for a `RenderPass::draw_mesh_tasks(x, y, z)` operation. + /// Also for task shader outputs. Defaults to 65535. Higher is "better". + pub max_task_mesh_workgroup_total_count: u32, /// The maximum value for each dimension of a `RenderPass::draw_mesh_tasks(x, y, z)` operation. - /// Defaults to 65535. Higher is "better". - pub max_task_workgroups_per_dimension: u32, - /// The maximum number of layers that can be output from a mesh shader + /// Also for task shader outputs. Defaults to 256. Higher is "better". + pub max_task_mesh_workgroups_per_dimension: u32, + // These are fundamentally different. It is very common for limits on mesh shaders to be much lower, + // so as to properly use the hardware, where task shaders are usually just emulated with compute + // shaders. Therefore, we should have different limits for mesh vs task shaders. + /// Maximum total number of invocations, or threads, per task shader workgroup. Higher is "better". + pub max_task_invocations_per_workgroup: u32, + /// The maximum value for each dimension of a task shader's workgroup size. Higher is "better". + pub max_task_invocations_per_dimension: u32, + /// Maximum total number of invocations, or threads, per mesh shader workgroup. Higher is "better". + pub max_mesh_invocations_per_workgroup: u32, + /// The maximum value for each dimension of a mesh shader's workgroup size. Higher is "better". + pub max_mesh_invocations_per_dimension: u32, + + /// The maximum size of the payload passed from task to mesh shader. Higher is "better". + pub max_task_payload_size: u32, + /// The maximum number of vertices that a mesh shader may output. Higher is "better". + pub max_mesh_output_vertices: u32, + /// The maximum number of primitives that a mesh shader may output. Higher is "better". + pub max_mesh_output_primitives: u32, + /// The maximum number of layers that can be output from a mesh shader. Higher is "better". + /// See [#8509](https://github.com/gfx-rs/wgpu/issues/8509). pub max_mesh_output_layers: u32, - /// The maximum number of views that can be used by a mesh shader in multiview rendering + /// The maximum number of views that can be used by a mesh shader in multiview rendering. + /// Higher is "better". pub max_mesh_multiview_view_count: u32, /// The maximum number of primitive (ex: triangles, aabbs) a BLAS is allowed to have. Requesting @@ -775,10 +804,17 @@ impl Limits { /// max_subgroup_size: 0, /// max_push_constant_size: 0, /// max_non_sampler_bindings: 1_000_000, - /// max_task_workgroup_total_count: 0, - /// max_task_workgroups_per_dimension: 0, - /// max_mesh_multiview_view_count: 0, + /// max_task_mesh_workgroup_total_count: 0, + /// max_task_mesh_workgroups_per_dimension: 0, + /// max_task_invocations_per_workgroup: 0, + /// max_task_invocations_per_dimension: 0, + /// max_mesh_invocations_per_workgroup: 0, + /// max_mesh_invocations_per_dimension: 0, + /// max_task_payload_size: 0, + /// max_mesh_output_vertices: 0, + /// max_mesh_output_primitives: 0, /// max_mesh_output_layers: 0, + /// max_mesh_multiview_view_count: 0, /// max_blas_primitive_count: 0, /// max_blas_geometry_count: 0, /// max_tlas_instance_count: 0, @@ -829,10 +865,17 @@ impl Limits { max_push_constant_size: 0, max_non_sampler_bindings: 1_000_000, - max_task_workgroup_total_count: 0, - max_task_workgroups_per_dimension: 0, - max_mesh_multiview_view_count: 0, + max_task_mesh_workgroup_total_count: 0, + max_task_mesh_workgroups_per_dimension: 0, + max_task_invocations_per_workgroup: 0, + max_task_invocations_per_dimension: 0, + max_mesh_invocations_per_workgroup: 0, + max_mesh_invocations_per_dimension: 0, + max_task_payload_size: 0, + max_mesh_output_vertices: 0, + max_mesh_output_primitives: 0, max_mesh_output_layers: 0, + max_mesh_multiview_view_count: 0, max_blas_primitive_count: 0, max_blas_geometry_count: 0, @@ -886,10 +929,17 @@ impl Limits { /// max_buffer_size: 256 << 20, // (256 MiB) /// max_non_sampler_bindings: 1_000_000, /// - /// max_task_workgroup_total_count: 0, - /// max_task_workgroups_per_dimension: 0, - /// max_mesh_multiview_view_count: 0, + /// max_task_mesh_workgroup_total_count: 0, + /// max_task_mesh_workgroups_per_dimension: 0, + /// max_task_invocations_per_workgroup: 0, + /// max_task_invocations_per_dimension: 0, + /// max_mesh_invocations_per_workgroup: 0, + /// max_mesh_invocations_per_dimension: 0, + /// max_task_payload_size: 0, + /// max_mesh_output_vertices: 0, + /// max_mesh_output_primitives: 0, /// max_mesh_output_layers: 0, + /// max_mesh_multiview_view_count: 0, /// /// max_blas_primitive_count: 0, /// max_blas_geometry_count: 0, @@ -910,11 +960,6 @@ impl Limits { max_color_attachments: 4, // see: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf#page=7 max_compute_workgroup_storage_size: 16352, - - max_task_workgroups_per_dimension: 0, - max_task_workgroup_total_count: 0, - max_mesh_multiview_view_count: 0, - max_mesh_output_layers: 0, ..Self::defaults() } } @@ -963,10 +1008,17 @@ impl Limits { /// max_buffer_size: 256 << 20, // (256 MiB), /// max_non_sampler_bindings: 1_000_000, /// - /// max_task_workgroup_total_count: 0, - /// max_task_workgroups_per_dimension: 0, - /// max_mesh_multiview_view_count: 0, + /// max_task_mesh_workgroup_total_count: 0, + /// max_task_mesh_workgroups_per_dimension: 0, + /// max_task_invocations_per_workgroup: 0, + /// max_task_invocations_per_dimension: 0, + /// max_mesh_invocations_per_workgroup: 0, + /// max_mesh_invocations_per_dimension: 0, + /// max_task_payload_size: 0, + /// max_mesh_output_vertices: 0, + /// max_mesh_output_primitives: 0, /// max_mesh_output_layers: 0, + /// max_mesh_multiview_view_count: 0, /// /// max_blas_primitive_count: 0, /// max_blas_geometry_count: 0, @@ -1065,12 +1117,26 @@ impl Limits { // Literally just made this up as 256^2 or 2^16. // My GPU supports 2^22, and compute shaders don't have this kind of limit. // This very likely is never a real limiter - max_task_workgroup_total_count: 65536, - max_task_workgroups_per_dimension: 256, - // llvmpipe reports 0 multiview count, which just means no multiview is allowed - max_mesh_multiview_view_count: 0, + max_task_mesh_workgroup_total_count: 65536, + max_task_mesh_workgroups_per_dimension: 256, + // Copied from compute limits, this is low enough that it should be sensible. + max_task_invocations_per_workgroup: 256, + max_task_invocations_per_dimension: 64, + + // DX12 limitation, revisit for vulkan + max_mesh_invocations_per_workgroup: 128, + max_mesh_invocations_per_dimension: 128, + + // DX12 specifies this as minimum + max_task_payload_size: 16_384, + // DX12 limitation, revisit for vulkan + max_mesh_output_vertices: 256, + max_mesh_output_primitives: 256, // llvmpipe once again requires this to be 8. An RTX 3060 supports well over 1024. + // Also DX12 vaguely suggests going over this is illegal in some cases. max_mesh_output_layers: 8, + // llvmpipe reports 0 multiview count, which just means no multiview is allowed + max_mesh_multiview_view_count: 0, ..self } } diff --git a/wgpu/src/backend/webgpu.rs b/wgpu/src/backend/webgpu.rs index 800820bbb3..fecdaccfe4 100644 --- a/wgpu/src/backend/webgpu.rs +++ b/wgpu/src/backend/webgpu.rs @@ -826,8 +826,21 @@ fn map_wgt_limits(limits: webgpu_sys::GpuSupportedLimits) -> wgt::Limits { max_non_sampler_bindings: wgt::Limits::default().max_non_sampler_bindings, max_inter_stage_shader_components: wgt::Limits::default().max_inter_stage_shader_components, - max_task_workgroup_total_count: wgt::Limits::default().max_task_workgroup_total_count, - max_task_workgroups_per_dimension: wgt::Limits::default().max_task_workgroups_per_dimension, + max_task_mesh_workgroup_total_count: wgt::Limits::default() + .max_task_mesh_workgroup_total_count, + max_task_mesh_workgroups_per_dimension: wgt::Limits::default() + .max_task_mesh_workgroups_per_dimension, + max_task_invocations_per_workgroup: wgt::Limits::default() + .max_task_invocations_per_workgroup, + max_task_invocations_per_dimension: wgt::Limits::default() + .max_task_invocations_per_dimension, + max_mesh_invocations_per_workgroup: wgt::Limits::default() + .max_mesh_invocations_per_workgroup, + max_mesh_invocations_per_dimension: wgt::Limits::default() + .max_mesh_invocations_per_dimension, + max_task_payload_size: wgt::Limits::default().max_task_payload_size, + max_mesh_output_vertices: wgt::Limits::default().max_mesh_output_vertices, + max_mesh_output_primitives: wgt::Limits::default().max_mesh_output_primitives, max_mesh_output_layers: wgt::Limits::default().max_mesh_output_layers, max_mesh_multiview_view_count: wgt::Limits::default().max_mesh_multiview_view_count,