From 604ab67c7f28dc03a82ee7aa1bcc487d39ea6f8c Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Mon, 18 Mar 2019 00:09:26 +1000
Subject: [PATCH] Vulkan: Simplify perf queries using vkGetQueryPoolResults

---
 .../Core/VideoBackends/Vulkan/PerfQuery.cpp   | 261 ++++--------------
 Source/Core/VideoBackends/Vulkan/PerfQuery.h  |  40 ++-
 Source/Core/VideoBackends/Vulkan/Renderer.cpp |   2 -
 3 files changed, 73 insertions(+), 230 deletions(-)

diff --git a/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp b/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp
index 1934c78410ac..f19b67d12bf6 100644
--- a/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp
+++ b/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp
@@ -14,7 +14,6 @@
 
 #include "VideoBackends/Vulkan/CommandBufferManager.h"
 #include "VideoBackends/Vulkan/Renderer.h"
-#include "VideoBackends/Vulkan/StagingBuffer.h"
 #include "VideoBackends/Vulkan/StateTracker.h"
 #include "VideoBackends/Vulkan/VulkanContext.h"
 
@@ -28,11 +27,6 @@ PerfQuery::~PerfQuery()
     vkDestroyQueryPool(g_vulkan_context->GetDevice(), m_query_pool, nullptr);
 }
 
-Vulkan::PerfQuery* PerfQuery::GetInstance()
-{
-  return static_cast<PerfQuery*>(g_perf_query.get());
-}
-
 bool PerfQuery::Initialize()
 {
   if (!CreateQueryPool())
@@ -41,47 +35,30 @@ bool PerfQuery::Initialize()
     return false;
   }
 
-  if (!CreateReadbackBuffer())
-  {
-    PanicAlert("Failed to create readback buffer");
-    return false;
-  }
-
   return true;
 }
 
 void PerfQuery::EnableQuery(PerfQueryGroup type)
 {
-  // Have we used half of the query buffer already?
-  if (m_query_count > m_query_buffer.size() / 2)
-    NonBlockingPartialFlush();
-
   // Block if there are no free slots.
-  if (m_query_count == PERF_QUERY_BUFFER_SIZE)
-  {
-    // ERROR_LOG(VIDEO, "Flushed query buffer early!");
-    BlockingPartialFlush();
-  }
+  // Otherwise, try to keep half of them available.
+  if (m_query_count > m_query_buffer.size() / 2)
+    PartialFlush(m_query_count == PERF_QUERY_BUFFER_SIZE);
 
   if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP)
   {
-    u32 index = (m_query_read_pos + m_query_count) % PERF_QUERY_BUFFER_SIZE;
-    ActiveQuery& entry = m_query_buffer[index];
-    ASSERT(!entry.active && !entry.available);
-    entry.active = true;
-    m_query_count++;
-
-    DEBUG_LOG(VIDEO, "start query %u", index);
+    ActiveQuery& entry = m_query_buffer[m_query_next_pos];
+    DEBUG_ASSERT(!entry.has_value);
+    entry.has_value = true;
 
     // Use precise queries if supported, otherwise boolean (which will be incorrect).
-    VkQueryControlFlags flags = 0;
-    if (g_vulkan_context->SupportsPreciseOcclusionQueries())
-      flags = VK_QUERY_CONTROL_PRECISE_BIT;
+    VkQueryControlFlags flags =
+        g_vulkan_context->SupportsPreciseOcclusionQueries() ? VK_QUERY_CONTROL_PRECISE_BIT : 0;
 
     // Ensure the query starts within a render pass.
-    // TODO: Is this needed?
     StateTracker::GetInstance()->BeginRenderPass();
-    vkCmdBeginQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, index, flags);
+    vkCmdBeginQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, m_query_next_pos,
+                    flags);
   }
 }
 
@@ -89,16 +66,25 @@ void PerfQuery::DisableQuery(PerfQueryGroup type)
 {
   if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP)
   {
-    // DisableQuery should be called for each EnableQuery, so subtract one to get the previous one.
-    u32 index = (m_query_read_pos + m_query_count - 1) % PERF_QUERY_BUFFER_SIZE;
-    vkCmdEndQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, index);
+    // End the query that the matching EnableQuery() began in the slot at m_query_next_pos.
+    vkCmdEndQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, m_query_next_pos);
+
+    // Record which command buffer fence this query belongs to. ReadbackQueries() compares it
+    // against the completed fence counter and asserts that it is non-zero.
+    ActiveQuery& entry = m_query_buffer[m_query_next_pos];
+    entry.fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter();
+
+    // Advance the write cursor around the ring and count the query as outstanding.
+    m_query_next_pos = (m_query_next_pos + 1) % PERF_QUERY_BUFFER_SIZE;
+    m_query_count++;
   }
 }
 
 void PerfQuery::ResetQuery()
 {
   m_query_count = 0;
-  m_query_read_pos = 0;
+  m_query_readback_pos = 0;
+  m_query_next_pos = 0;
   std::fill_n(m_results, ArraySize(m_results), 0);
 
   // Reset entire query pool, ensuring all queries are ready to write to.
@@ -106,34 +84,20 @@ void PerfQuery::ResetQuery()
   vkCmdResetQueryPool(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, 0,
                       PERF_QUERY_BUFFER_SIZE);
 
-  for (auto& entry : m_query_buffer)
-  {
-    entry.fence_counter = 0;
-    entry.available = false;
-    entry.active = false;
-  }
+  std::memset(m_query_buffer.data(), 0, sizeof(ActiveQuery) * m_query_buffer.size());
 }
 
 u32 PerfQuery::GetQueryResult(PerfQueryType type)
 {
   u32 result = 0;
-
   if (type == PQ_ZCOMP_INPUT_ZCOMPLOC || type == PQ_ZCOMP_OUTPUT_ZCOMPLOC)
-  {
     result = m_results[PQG_ZCOMP_ZCOMPLOC];
-  }
   else if (type == PQ_ZCOMP_INPUT || type == PQ_ZCOMP_OUTPUT)
-  {
     result = m_results[PQG_ZCOMP];
-  }
   else if (type == PQ_BLEND_INPUT)
-  {
     result = m_results[PQG_ZCOMP] + m_results[PQG_ZCOMP_ZCOMPLOC];
-  }
   else if (type == PQ_EFB_COPY_CLOCKS)
-  {
     result = m_results[PQG_EFB_COPY_CLOCKS];
-  }
 
   return result / 4;
 }
@@ -141,7 +105,7 @@ u32 PerfQuery::GetQueryResult(PerfQueryType type)
 void PerfQuery::FlushResults()
 {
   while (!IsFlushed())
-    BlockingPartialFlush();
+    PartialFlush(true);
 }
 
 bool PerfQuery::IsFlushed() const
@@ -170,190 +134,79 @@ bool PerfQuery::CreateQueryPool()
   return true;
 }
 
-bool PerfQuery::CreateReadbackBuffer()
-{
-  m_readback_buffer = StagingBuffer::Create(STAGING_BUFFER_TYPE_READBACK,
-                                            PERF_QUERY_BUFFER_SIZE * sizeof(PerfQueryDataType),
-                                            VK_BUFFER_USAGE_TRANSFER_DST_BIT);
-
-  // Leave the buffer persistently mapped, we invalidate it when we need to read.
-  if (!m_readback_buffer || !m_readback_buffer->Map())
-    return false;
-
-  return true;
-}
-
-void PerfQuery::QueueCopyQueryResults(u32 start_index, u32 query_count)
+void PerfQuery::ReadbackQueries()
 {
-  DEBUG_LOG(VIDEO, "queue copy of queries %u-%u", start_index, start_index + query_count - 1);
-
-  // Transition buffer for GPU write
-  // TODO: Is this needed?
-  m_readback_buffer->PrepareForGPUWrite(g_command_buffer_mgr->GetCurrentCommandBuffer(),
-                                        VK_ACCESS_TRANSFER_WRITE_BIT,
-                                        VK_PIPELINE_STAGE_TRANSFER_BIT);
-
-  // Copy from queries -> buffer
-  vkCmdCopyQueryPoolResults(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool,
-                            start_index, query_count, m_readback_buffer->GetBuffer(),
-                            start_index * sizeof(PerfQueryDataType), sizeof(PerfQueryDataType),
-                            VK_QUERY_RESULT_WAIT_BIT);
-
-  // Prepare for host readback
-  m_readback_buffer->FlushGPUCache(g_command_buffer_mgr->GetCurrentCommandBuffer(),
-                                   VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT);
-
-  // Reset queries so they're ready to use again
-  vkCmdResetQueryPool(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, start_index,
-                      query_count);
-
-  // Flag all queries as available, but with a fence that has to be completed first
-  for (u32 i = 0; i < query_count; i++)
-  {
-    u32 index = start_index + i;
-    ActiveQuery& entry = m_query_buffer[index];
-    entry.fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter();
-    entry.available = true;
-    entry.active = false;
-  }
-}
-
-void PerfQuery::FlushQueries()
-{
-  // Flag all pending queries that aren't available as available after execution.
-  u32 copy_start_index = 0;
-  u32 copy_count = 0;
-  for (u32 i = 0; i < m_query_count; i++)
-  {
-    u32 index = (m_query_read_pos + i) % PERF_QUERY_BUFFER_SIZE;
-    ActiveQuery& entry = m_query_buffer[index];
-
-    // Skip already-copied queries (will happen if a flush hasn't occurred and
-    // a command buffer hasn't finished executing).
-    if (entry.available)
-    {
-      // These should be grouped together, and at the start.
-      ASSERT(copy_count == 0);
-      continue;
-    }
-
-    // If this wrapped around, we need to flush the entries before the end of the buffer.
-    ASSERT(entry.active);
-    if (index < copy_start_index)
-    {
-      QueueCopyQueryResults(copy_start_index, copy_count);
-      copy_start_index = index;
-      copy_count = 0;
-    }
-    else if (copy_count == 0)
-    {
-      copy_start_index = index;
-    }
-    copy_count++;
-  }
-
-  if (copy_count > 0)
-    QueueCopyQueryResults(copy_start_index, copy_count);
-}
-
-void PerfQuery::ProcessPendingResults()
-{
-  const u64 completed_fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter();
+  const u64 completed_fence_counter = g_command_buffer_mgr->GetCompletedFenceCounter();
 
   // Need to save these since ProcessResults will modify them.
-  u32 query_read_pos = m_query_read_pos;
-  u32 query_count = m_query_count;
-
-  // Flush as many queries as are bound to this fence.
-  u32 flush_start_index = 0;
-  u32 flush_count = 0;
-  for (u32 i = 0; i < query_count; i++)
+  const u32 outstanding_queries = m_query_count;
+  u32 readback_count = 0;
+  for (u32 i = 0; i < outstanding_queries; i++)
   {
-    u32 index = (query_read_pos + i) % PERF_QUERY_BUFFER_SIZE;
-    if (m_query_buffer[index].fence_counter > completed_fence_counter)
-    {
-      // These should be grouped together, at the end.
+    u32 index = (m_query_readback_pos + readback_count) % PERF_QUERY_BUFFER_SIZE;
+    const ActiveQuery& entry = m_query_buffer[index];
+    if (entry.fence_counter > completed_fence_counter)
       break;
-    }
 
     // If this wrapped around, we need to flush the entries before the end of the buffer.
-    if (index < flush_start_index)
-    {
-      ProcessResults(flush_start_index, flush_count);
-      flush_start_index = index;
-      flush_count = 0;
-    }
-    else if (flush_count == 0)
+    if (index < m_query_readback_pos)
     {
-      flush_start_index = index;
+      ReadbackQueries(readback_count);
+      DEBUG_ASSERT(m_query_readback_pos == 0);
+      readback_count = 0;
     }
-    flush_count++;
+
+    readback_count++;
   }
 
-  if (flush_count > 0)
-    ProcessResults(flush_start_index, flush_count);
+  if (readback_count > 0)
+    ReadbackQueries(readback_count);
 }
 
-void PerfQuery::ProcessResults(u32 start_index, u32 query_count)
+void PerfQuery::ReadbackQueries(u32 query_count)
 {
-  // Invalidate CPU caches before reading back.
-  m_readback_buffer->InvalidateCPUCache(start_index * sizeof(PerfQueryDataType),
-                                        query_count * sizeof(PerfQueryDataType));
-
   // Should be at maximum query_count queries pending.
-  ASSERT(query_count <= m_query_count);
-  DEBUG_LOG(VIDEO, "process queries %u-%u", start_index, start_index + query_count - 1);
+  ASSERT(query_count <= m_query_count &&
+         (m_query_readback_pos + query_count) <= PERF_QUERY_BUFFER_SIZE);
+
+  // Read back from the GPU.
+  VkResult res =
+      vkGetQueryPoolResults(g_vulkan_context->GetDevice(), m_query_pool, m_query_readback_pos,
+                            query_count, query_count * sizeof(PerfQueryDataType),
+                            m_query_result_buffer.data(), sizeof(PerfQueryDataType), 0);
+  if (res != VK_SUCCESS)
+    LOG_VULKAN_ERROR(res, "vkGetQueryPoolResults failed: ");
 
   // Remove pending queries.
   for (u32 i = 0; i < query_count; i++)
   {
-    u32 index = (m_query_read_pos + i) % PERF_QUERY_BUFFER_SIZE;
+    u32 index = (m_query_readback_pos + i) % PERF_QUERY_BUFFER_SIZE;
     ActiveQuery& entry = m_query_buffer[index];
 
     // Should have a fence associated with it (waiting for a result).
-    ASSERT(entry.fence_counter != 0);
+    DEBUG_ASSERT(entry.fence_counter != 0);
     entry.fence_counter = 0;
-    entry.available = false;
-    entry.active = false;
-
-    // Grab result from readback buffer, it will already have been invalidated.
-    u32 result;
-    m_readback_buffer->Read(index * sizeof(PerfQueryDataType), &result, sizeof(result), false);
-    DEBUG_LOG(VIDEO, "  query result %u", result);
+    entry.has_value = false;
 
     // NOTE: Reported pixel metrics should be referenced to native resolution
     m_results[entry.query_type] +=
-        static_cast<u32>(static_cast<u64>(result) * EFB_WIDTH / g_renderer->GetTargetWidth() *
-                         EFB_HEIGHT / g_renderer->GetTargetHeight());
+        static_cast<u32>(static_cast<u64>(m_query_result_buffer[i]) * EFB_WIDTH /
+                         g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight());
   }
 
-  m_query_read_pos = (m_query_read_pos + query_count) % PERF_QUERY_BUFFER_SIZE;
+  m_query_readback_pos = (m_query_readback_pos + query_count) % PERF_QUERY_BUFFER_SIZE;
   m_query_count -= query_count;
 }
 
-void PerfQuery::NonBlockingPartialFlush()
+void PerfQuery::PartialFlush(bool blocking)
 {
-  if (IsFlushed())
-    return;
-
   // Submit a command buffer in the background if the front query is not bound to one.
-  ActiveQuery& entry = m_query_buffer[m_query_read_pos];
-  if (entry.fence_counter == g_command_buffer_mgr->GetCurrentFenceCounter())
-    Renderer::GetInstance()->ExecuteCommandBuffer(true, false);
-
-  ProcessPendingResults();
-}
-
-void PerfQuery::BlockingPartialFlush()
-{
-  if (IsFlushed())
-    return;
-
-  // If the first pending query is needing command buffer execution, do that.
-  ActiveQuery& entry = m_query_buffer[m_query_read_pos];
-  if (entry.fence_counter == g_command_buffer_mgr->GetCurrentFenceCounter())
-    Renderer::GetInstance()->ExecuteCommandBuffer(false, true);
+  if (blocking || m_query_buffer[m_query_readback_pos].fence_counter ==
+                      g_command_buffer_mgr->GetCurrentFenceCounter())
+  {
+    Renderer::GetInstance()->ExecuteCommandBuffer(true, blocking);
+  }
 
-  ProcessPendingResults();
+  ReadbackQueries();
 }
 }  // namespace Vulkan
diff --git a/Source/Core/VideoBackends/Vulkan/PerfQuery.h b/Source/Core/VideoBackends/Vulkan/PerfQuery.h
index facbe0dc0f8b..8ca91ac65875 100644
--- a/Source/Core/VideoBackends/Vulkan/PerfQuery.h
+++ b/Source/Core/VideoBackends/Vulkan/PerfQuery.h
@@ -13,18 +13,15 @@
 
 namespace Vulkan
 {
-class StagingBuffer;
-
 class PerfQuery : public PerfQueryBase
 {
 public:
   PerfQuery();
   ~PerfQuery();
 
-  static PerfQuery* GetInstance();
+  static PerfQuery* GetInstance() { return static_cast<PerfQuery*>(g_perf_query.get()); }
 
   bool Initialize();
-  void FlushQueries();
 
   void EnableQuery(PerfQueryGroup type) override;
   void DisableQuery(PerfQueryGroup type) override;
@@ -34,35 +31,30 @@ class PerfQuery : public PerfQueryBase
   bool IsFlushed() const override;
 
 private:
+  // u32 is used for the sample counts.
+  using PerfQueryDataType = u32;
+
+  // when testing in SMS: 64 was too small, 128 was ok
+  // TODO: This should be size_t, but the base class uses u32s
+  static const u32 PERF_QUERY_BUFFER_SIZE = 512;
+
   struct ActiveQuery
   {
     u64 fence_counter;
     PerfQueryType query_type;
-    bool available;
-    bool active;
+    bool has_value;
   };
 
   bool CreateQueryPool();
-  bool CreateReadbackBuffer();
-  void QueueCopyQueryResults(u32 start_index, u32 query_count);
-  void ProcessPendingResults();
-  void ProcessResults(u32 start_index, u32 query_count);
-
-  void NonBlockingPartialFlush();
-  void BlockingPartialFlush();
+  void ReadbackQueries();
+  void ReadbackQueries(u32 query_count);
+  void PartialFlush(bool blocking);
 
-  // when testing in SMS: 64 was too small, 128 was ok
-  // TODO: This should be size_t, but the base class uses u32s
-  using PerfQueryDataType = u32;
-  static const u32 PERF_QUERY_BUFFER_SIZE = 512;
-  std::array<ActiveQuery, PERF_QUERY_BUFFER_SIZE> m_query_buffer = {};
-  u32 m_query_read_pos = 0;
-
-  // TODO: Investigate using pipeline statistics to implement other query types
   VkQueryPool m_query_pool = VK_NULL_HANDLE;
-
-  // Buffer containing query results. Each query is a u32.
-  std::unique_ptr<StagingBuffer> m_readback_buffer;
+  u32 m_query_readback_pos = 0;
+  u32 m_query_next_pos = 0;
+  std::array<ActiveQuery, PERF_QUERY_BUFFER_SIZE> m_query_buffer = {};
+  std::array<PerfQueryDataType, PERF_QUERY_BUFFER_SIZE> m_query_result_buffer = {};
 };
 
 }  // namespace Vulkan
diff --git a/Source/Core/VideoBackends/Vulkan/Renderer.cpp b/Source/Core/VideoBackends/Vulkan/Renderer.cpp
index a4db679ecbbc..fc07c4ce41c1 100644
--- a/Source/Core/VideoBackends/Vulkan/Renderer.cpp
+++ b/Source/Core/VideoBackends/Vulkan/Renderer.cpp
@@ -304,7 +304,6 @@ void Renderer::PresentBackbuffer()
 {
   // End drawing to backbuffer
   StateTracker::GetInstance()->EndRenderPass();
-  PerfQuery::GetInstance()->FlushQueries();
 
   // Transition the backbuffer to PRESENT_SRC to ensure all commands drawing
   // to it have finished before present.
@@ -325,7 +324,6 @@ void Renderer::PresentBackbuffer()
 void Renderer::ExecuteCommandBuffer(bool submit_off_thread, bool wait_for_completion)
 {
   StateTracker::GetInstance()->EndRenderPass();
-  PerfQuery::GetInstance()->FlushQueries();
 
   g_command_buffer_mgr->SubmitCommandBuffer(submit_off_thread, wait_for_completion);