From 604ab67c7f28dc03a82ee7aa1bcc487d39ea6f8c Mon Sep 17 00:00:00 2001 From: Stenzek Date: Mon, 18 Mar 2019 00:09:26 +1000 Subject: [PATCH] Vulkan: Simplify perf queries using vkGetQueryPoolResults --- .../Core/VideoBackends/Vulkan/PerfQuery.cpp | 261 ++++-------------- Source/Core/VideoBackends/Vulkan/PerfQuery.h | 40 ++- Source/Core/VideoBackends/Vulkan/Renderer.cpp | 2 - 3 files changed, 73 insertions(+), 230 deletions(-) diff --git a/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp b/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp index 1934c78410ac..f19b67d12bf6 100644 --- a/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp +++ b/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp @@ -14,7 +14,6 @@ #include "VideoBackends/Vulkan/CommandBufferManager.h" #include "VideoBackends/Vulkan/Renderer.h" -#include "VideoBackends/Vulkan/StagingBuffer.h" #include "VideoBackends/Vulkan/StateTracker.h" #include "VideoBackends/Vulkan/VulkanContext.h" @@ -28,11 +27,6 @@ PerfQuery::~PerfQuery() vkDestroyQueryPool(g_vulkan_context->GetDevice(), m_query_pool, nullptr); } -Vulkan::PerfQuery* PerfQuery::GetInstance() -{ - return static_cast(g_perf_query.get()); -} - bool PerfQuery::Initialize() { if (!CreateQueryPool()) @@ -41,47 +35,30 @@ bool PerfQuery::Initialize() return false; } - if (!CreateReadbackBuffer()) - { - PanicAlert("Failed to create readback buffer"); - return false; - } - return true; } void PerfQuery::EnableQuery(PerfQueryGroup type) { - // Have we used half of the query buffer already? - if (m_query_count > m_query_buffer.size() / 2) - NonBlockingPartialFlush(); - // Block if there are no free slots. - if (m_query_count == PERF_QUERY_BUFFER_SIZE) - { - // ERROR_LOG(VIDEO, "Flushed query buffer early!"); - BlockingPartialFlush(); - } + // Otherwise, try to keep half of them available. + if (m_query_count > m_query_buffer.size() / 2) + PartialFlush(m_query_count == PERF_QUERY_BUFFER_SIZE); if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP) { - u32 index = (m_query_read_pos + m_query_count) % PERF_QUERY_BUFFER_SIZE; - ActiveQuery& entry = m_query_buffer[index]; - ASSERT(!entry.active && !entry.available); - entry.active = true; - m_query_count++; - - DEBUG_LOG(VIDEO, "start query %u", index); + ActiveQuery& entry = m_query_buffer[m_query_next_pos]; + DEBUG_ASSERT(!entry.has_value); + entry.has_value = true; // Use precise queries if supported, otherwise boolean (which will be incorrect). - VkQueryControlFlags flags = 0; - if (g_vulkan_context->SupportsPreciseOcclusionQueries()) - flags = VK_QUERY_CONTROL_PRECISE_BIT; + VkQueryControlFlags flags = + g_vulkan_context->SupportsPreciseOcclusionQueries() ? VK_QUERY_CONTROL_PRECISE_BIT : 0; // Ensure the query starts within a render pass. - // TODO: Is this needed? StateTracker::GetInstance()->BeginRenderPass(); - vkCmdBeginQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, index, flags); + vkCmdBeginQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, m_query_next_pos, + flags); } } @@ -89,16 +66,17 @@ void PerfQuery::DisableQuery(PerfQueryGroup type) { if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP) { - // DisableQuery should be called for each EnableQuery, so subtract one to get the previous one. - u32 index = (m_query_read_pos + m_query_count - 1) % PERF_QUERY_BUFFER_SIZE; - vkCmdEndQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, index); + vkCmdEndQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, m_query_next_pos); + m_query_next_pos = (m_query_next_pos + 1) % PERF_QUERY_BUFFER_SIZE; + m_query_count++; } } void PerfQuery::ResetQuery() { m_query_count = 0; - m_query_read_pos = 0; + m_query_readback_pos = 0; + m_query_next_pos = 0; std::fill_n(m_results, ArraySize(m_results), 0); // Reset entire query pool, ensuring all queries are ready to write to. @@ -106,34 +84,20 @@ void PerfQuery::ResetQuery() vkCmdResetQueryPool(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, 0, PERF_QUERY_BUFFER_SIZE); - for (auto& entry : m_query_buffer) - { - entry.fence_counter = 0; - entry.available = false; - entry.active = false; - } + std::memset(m_query_buffer.data(), 0, sizeof(ActiveQuery) * m_query_buffer.size()); } u32 PerfQuery::GetQueryResult(PerfQueryType type) { u32 result = 0; - if (type == PQ_ZCOMP_INPUT_ZCOMPLOC || type == PQ_ZCOMP_OUTPUT_ZCOMPLOC) - { result = m_results[PQG_ZCOMP_ZCOMPLOC]; - } else if (type == PQ_ZCOMP_INPUT || type == PQ_ZCOMP_OUTPUT) - { result = m_results[PQG_ZCOMP]; - } else if (type == PQ_BLEND_INPUT) - { result = m_results[PQG_ZCOMP] + m_results[PQG_ZCOMP_ZCOMPLOC]; - } else if (type == PQ_EFB_COPY_CLOCKS) - { result = m_results[PQG_EFB_COPY_CLOCKS]; - } return result / 4; } @@ -141,7 +105,7 @@ u32 PerfQuery::GetQueryResult(PerfQueryType type) void PerfQuery::FlushResults() { while (!IsFlushed()) - BlockingPartialFlush(); + PartialFlush(true); } bool PerfQuery::IsFlushed() const @@ -170,190 +134,79 @@ bool PerfQuery::CreateQueryPool() return true; } -bool PerfQuery::CreateReadbackBuffer() -{ - m_readback_buffer = StagingBuffer::Create(STAGING_BUFFER_TYPE_READBACK, - PERF_QUERY_BUFFER_SIZE * sizeof(PerfQueryDataType), - VK_BUFFER_USAGE_TRANSFER_DST_BIT); - - // Leave the buffer persistently mapped, we invalidate it when we need to read. - if (!m_readback_buffer || !m_readback_buffer->Map()) - return false; - - return true; -} - -void PerfQuery::QueueCopyQueryResults(u32 start_index, u32 query_count) +void PerfQuery::ReadbackQueries() { - DEBUG_LOG(VIDEO, "queue copy of queries %u-%u", start_index, start_index + query_count - 1); - - // Transition buffer for GPU write - // TODO: Is this needed? - m_readback_buffer->PrepareForGPUWrite(g_command_buffer_mgr->GetCurrentCommandBuffer(), - VK_ACCESS_TRANSFER_WRITE_BIT, - VK_PIPELINE_STAGE_TRANSFER_BIT); - - // Copy from queries -> buffer - vkCmdCopyQueryPoolResults(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, - start_index, query_count, m_readback_buffer->GetBuffer(), - start_index * sizeof(PerfQueryDataType), sizeof(PerfQueryDataType), - VK_QUERY_RESULT_WAIT_BIT); - - // Prepare for host readback - m_readback_buffer->FlushGPUCache(g_command_buffer_mgr->GetCurrentCommandBuffer(), - VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); - - // Reset queries so they're ready to use again - vkCmdResetQueryPool(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, start_index, - query_count); - - // Flag all queries as available, but with a fence that has to be completed first - for (u32 i = 0; i < query_count; i++) - { - u32 index = start_index + i; - ActiveQuery& entry = m_query_buffer[index]; - entry.fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter(); - entry.available = true; - entry.active = false; - } -} - -void PerfQuery::FlushQueries() -{ - // Flag all pending queries that aren't available as available after execution. - u32 copy_start_index = 0; - u32 copy_count = 0; - for (u32 i = 0; i < m_query_count; i++) - { - u32 index = (m_query_read_pos + i) % PERF_QUERY_BUFFER_SIZE; - ActiveQuery& entry = m_query_buffer[index]; - - // Skip already-copied queries (will happen if a flush hasn't occurred and - // a command buffer hasn't finished executing). - if (entry.available) - { - // These should be grouped together, and at the start. - ASSERT(copy_count == 0); - continue; - } - - // If this wrapped around, we need to flush the entries before the end of the buffer. - ASSERT(entry.active); - if (index < copy_start_index) - { - QueueCopyQueryResults(copy_start_index, copy_count); - copy_start_index = index; - copy_count = 0; - } - else if (copy_count == 0) - { - copy_start_index = index; - } - copy_count++; - } - - if (copy_count > 0) - QueueCopyQueryResults(copy_start_index, copy_count); -} - -void PerfQuery::ProcessPendingResults() -{ - const u64 completed_fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter(); + const u64 completed_fence_counter = g_command_buffer_mgr->GetCompletedFenceCounter(); // Need to save these since ProcessResults will modify them. - u32 query_read_pos = m_query_read_pos; - u32 query_count = m_query_count; - - // Flush as many queries as are bound to this fence. - u32 flush_start_index = 0; - u32 flush_count = 0; - for (u32 i = 0; i < query_count; i++) + const u32 outstanding_queries = m_query_count; + u32 readback_count = 0; + for (u32 i = 0; i < outstanding_queries; i++) { - u32 index = (query_read_pos + i) % PERF_QUERY_BUFFER_SIZE; - if (m_query_buffer[index].fence_counter > completed_fence_counter) - { - // These should be grouped together, at the end. + u32 index = (m_query_readback_pos + readback_count) % PERF_QUERY_BUFFER_SIZE; + const ActiveQuery& entry = m_query_buffer[index]; + if (entry.fence_counter > completed_fence_counter) break; - } // If this wrapped around, we need to flush the entries before the end of the buffer. - if (index < flush_start_index) - { - ProcessResults(flush_start_index, flush_count); - flush_start_index = index; - flush_count = 0; - } - else if (flush_count == 0) + if (index < m_query_readback_pos) { - flush_start_index = index; + ReadbackQueries(readback_count); + DEBUG_ASSERT(m_query_readback_pos == 0); + readback_count = 0; } - flush_count++; + + readback_count++; } - if (flush_count > 0) - ProcessResults(flush_start_index, flush_count); + if (readback_count > 0) + ReadbackQueries(readback_count); } -void PerfQuery::ProcessResults(u32 start_index, u32 query_count) +void PerfQuery::ReadbackQueries(u32 query_count) { - // Invalidate CPU caches before reading back. - m_readback_buffer->InvalidateCPUCache(start_index * sizeof(PerfQueryDataType), - query_count * sizeof(PerfQueryDataType)); - // Should be at maximum query_count queries pending. - ASSERT(query_count <= m_query_count); - DEBUG_LOG(VIDEO, "process queries %u-%u", start_index, start_index + query_count - 1); + ASSERT(query_count <= m_query_count && + (m_query_readback_pos + query_count) <= PERF_QUERY_BUFFER_SIZE); + + // Read back from the GPU. + VkResult res = + vkGetQueryPoolResults(g_vulkan_context->GetDevice(), m_query_pool, m_query_readback_pos, + query_count, query_count * sizeof(PerfQueryDataType), + m_query_result_buffer.data(), sizeof(PerfQueryDataType), 0); + if (res != VK_SUCCESS) + LOG_VULKAN_ERROR(res, "vkGetQueryPoolResults failed: "); // Remove pending queries. for (u32 i = 0; i < query_count; i++) { - u32 index = (m_query_read_pos + i) % PERF_QUERY_BUFFER_SIZE; + u32 index = (m_query_readback_pos + i) % PERF_QUERY_BUFFER_SIZE; ActiveQuery& entry = m_query_buffer[index]; // Should have a fence associated with it (waiting for a result). - ASSERT(entry.fence_counter != 0); + DEBUG_ASSERT(entry.fence_counter != 0); entry.fence_counter = 0; - entry.available = false; - entry.active = false; - - // Grab result from readback buffer, it will already have been invalidated. - u32 result; - m_readback_buffer->Read(index * sizeof(PerfQueryDataType), &result, sizeof(result), false); - DEBUG_LOG(VIDEO, " query result %u", result); + entry.has_value = false; // NOTE: Reported pixel metrics should be referenced to native resolution m_results[entry.query_type] += - static_cast(static_cast(result) * EFB_WIDTH / g_renderer->GetTargetWidth() * - EFB_HEIGHT / g_renderer->GetTargetHeight()); + static_cast(static_cast(m_query_result_buffer[i]) * EFB_WIDTH / + g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight()); } - m_query_read_pos = (m_query_read_pos + query_count) % PERF_QUERY_BUFFER_SIZE; + m_query_readback_pos = (m_query_readback_pos + query_count) % PERF_QUERY_BUFFER_SIZE; m_query_count -= query_count; } -void PerfQuery::NonBlockingPartialFlush() +void PerfQuery::PartialFlush(bool blocking) { - if (IsFlushed()) - return; - // Submit a command buffer in the background if the front query is not bound to one. - ActiveQuery& entry = m_query_buffer[m_query_read_pos]; - if (entry.fence_counter == g_command_buffer_mgr->GetCurrentFenceCounter()) - Renderer::GetInstance()->ExecuteCommandBuffer(true, false); - - ProcessPendingResults(); -} - -void PerfQuery::BlockingPartialFlush() -{ - if (IsFlushed()) - return; - - // If the first pending query is needing command buffer execution, do that. - ActiveQuery& entry = m_query_buffer[m_query_read_pos]; - if (entry.fence_counter == g_command_buffer_mgr->GetCurrentFenceCounter()) - Renderer::GetInstance()->ExecuteCommandBuffer(false, true); + if (blocking || m_query_buffer[m_query_readback_pos].fence_counter == + g_command_buffer_mgr->GetCurrentFenceCounter()) + { + Renderer::GetInstance()->ExecuteCommandBuffer(true, blocking); + } - ProcessPendingResults(); + ReadbackQueries(); } } // namespace Vulkan diff --git a/Source/Core/VideoBackends/Vulkan/PerfQuery.h b/Source/Core/VideoBackends/Vulkan/PerfQuery.h index facbe0dc0f8b..8ca91ac65875 100644 --- a/Source/Core/VideoBackends/Vulkan/PerfQuery.h +++ b/Source/Core/VideoBackends/Vulkan/PerfQuery.h @@ -13,18 +13,15 @@ namespace Vulkan { -class StagingBuffer; - class PerfQuery : public PerfQueryBase { public: PerfQuery(); ~PerfQuery(); - static PerfQuery* GetInstance(); + static PerfQuery* GetInstance() { return static_cast(g_perf_query.get()); } bool Initialize(); - void FlushQueries(); void EnableQuery(PerfQueryGroup type) override; void DisableQuery(PerfQueryGroup type) override; @@ -34,35 +31,30 @@ class PerfQuery : public PerfQueryBase bool IsFlushed() const override; private: + // u32 is used for the sample counts. + using PerfQueryDataType = u32; + + // when testing in SMS: 64 was too small, 128 was ok + // TODO: This should be size_t, but the base class uses u32s + static const u32 PERF_QUERY_BUFFER_SIZE = 512; + struct ActiveQuery { u64 fence_counter; PerfQueryType query_type; - bool available; - bool active; + bool has_value; }; bool CreateQueryPool(); - bool CreateReadbackBuffer(); - void QueueCopyQueryResults(u32 start_index, u32 query_count); - void ProcessPendingResults(); - void ProcessResults(u32 start_index, u32 query_count); - - void NonBlockingPartialFlush(); - void BlockingPartialFlush(); + void ReadbackQueries(); + void ReadbackQueries(u32 query_count); + void PartialFlush(bool blocking); - // when testing in SMS: 64 was too small, 128 was ok - // TODO: This should be size_t, but the base class uses u32s - using PerfQueryDataType = u32; - static const u32 PERF_QUERY_BUFFER_SIZE = 512; - std::array m_query_buffer = {}; - u32 m_query_read_pos = 0; - - // TODO: Investigate using pipeline statistics to implement other query types VkQueryPool m_query_pool = VK_NULL_HANDLE; - - // Buffer containing query results. Each query is a u32. - std::unique_ptr m_readback_buffer; + u32 m_query_readback_pos = 0; + u32 m_query_next_pos = 0; + std::array m_query_buffer = {}; + std::array m_query_result_buffer = {}; }; } // namespace Vulkan diff --git a/Source/Core/VideoBackends/Vulkan/Renderer.cpp b/Source/Core/VideoBackends/Vulkan/Renderer.cpp index a4db679ecbbc..fc07c4ce41c1 100644 --- a/Source/Core/VideoBackends/Vulkan/Renderer.cpp +++ b/Source/Core/VideoBackends/Vulkan/Renderer.cpp @@ -304,7 +304,6 @@ void Renderer::PresentBackbuffer() { // End drawing to backbuffer StateTracker::GetInstance()->EndRenderPass(); - PerfQuery::GetInstance()->FlushQueries(); // Transition the backbuffer to PRESENT_SRC to ensure all commands drawing // to it have finished before present. @@ -325,7 +324,6 @@ void Renderer::PresentBackbuffer() void Renderer::ExecuteCommandBuffer(bool submit_off_thread, bool wait_for_completion) { StateTracker::GetInstance()->EndRenderPass(); - PerfQuery::GetInstance()->FlushQueries(); g_command_buffer_mgr->SubmitCommandBuffer(submit_off_thread, wait_for_completion);