@@ -14,7 +14,6 @@

#include "VideoBackends/Vulkan/CommandBufferManager.h"
#include "VideoBackends/Vulkan/Renderer.h"
#include "VideoBackends/Vulkan/StagingBuffer.h"
#include "VideoBackends/Vulkan/StateTracker.h"
#include "VideoBackends/Vulkan/VulkanContext.h"

@@ -24,17 +23,10 @@ PerfQuery::PerfQuery() = default;

PerfQuery::~PerfQuery()
{
g_command_buffer_mgr->RemoveFenceSignaledCallback(this);

if (m_query_pool != VK_NULL_HANDLE)
vkDestroyQueryPool(g_vulkan_context->GetDevice(), m_query_pool, nullptr);
}

Vulkan::PerfQuery* PerfQuery::GetInstance()
{
return static_cast<PerfQuery*>(g_perf_query.get());
}

bool PerfQuery::Initialize()
{
if (!CreateQueryPool())
@@ -43,110 +35,77 @@ bool PerfQuery::Initialize()
return false;
}

if (!CreateReadbackBuffer())
{
PanicAlert("Failed to create readback buffer");
return false;
}

g_command_buffer_mgr->AddFenceSignaledCallback(
this, std::bind(&PerfQuery::OnFenceSignaled, this, std::placeholders::_1));

return true;
}

void PerfQuery::EnableQuery(PerfQueryGroup type)
{
// Have we used half of the query buffer already?
if (m_query_count > m_query_buffer.size() / 2)
NonBlockingPartialFlush();

// Block if there are no free slots.
if (m_query_count == PERF_QUERY_BUFFER_SIZE)
{
// ERROR_LOG(VIDEO, "Flushed query buffer early!");
BlockingPartialFlush();
}
// Otherwise, try to keep half of them available.
if (m_query_count > m_query_buffer.size() / 2)
PartialFlush(m_query_count == PERF_QUERY_BUFFER_SIZE);

if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP)
{
u32 index = (m_query_read_pos + m_query_count) % PERF_QUERY_BUFFER_SIZE;
ActiveQuery& entry = m_query_buffer[index];
ASSERT(!entry.active && !entry.available);
entry.active = true;
m_query_count++;

DEBUG_LOG(VIDEO, "start query %u", index);
ActiveQuery& entry = m_query_buffer[m_query_next_pos];
DEBUG_ASSERT(!entry.has_value);
entry.has_value = true;

// Use precise queries if supported, otherwise boolean (which will be incorrect).
VkQueryControlFlags flags = 0;
if (g_vulkan_context->SupportsPreciseOcclusionQueries())
flags = VK_QUERY_CONTROL_PRECISE_BIT;
VkQueryControlFlags flags =
g_vulkan_context->SupportsPreciseOcclusionQueries() ? VK_QUERY_CONTROL_PRECISE_BIT : 0;

// Ensure the query starts within a render pass.
// TODO: Is this needed?
StateTracker::GetInstance()->BeginRenderPass();
vkCmdBeginQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, index, flags);
vkCmdBeginQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, m_query_next_pos,
flags);
}
}

void PerfQuery::DisableQuery(PerfQueryGroup type)
{
if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP)
{
// DisableQuery should be called for each EnableQuery, so subtract one to get the previous one.
u32 index = (m_query_read_pos + m_query_count - 1) % PERF_QUERY_BUFFER_SIZE;
vkCmdEndQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, index);
vkCmdEndQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, m_query_next_pos);
m_query_next_pos = (m_query_next_pos + 1) % PERF_QUERY_BUFFER_SIZE;
m_query_count++;
}
}
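// Illustrative sketch (not part of the diff; u32 is the Dolphin typedef): the bookkeeping
// above treats the query pool as a ring buffer. m_query_readback_pos is the oldest query
// still awaiting results, m_query_next_pos is the slot the next EnableQuery/DisableQuery
// pair will use, and m_query_count is the number of outstanding queries, so the invariant
// (m_query_readback_pos + m_query_count) % PERF_QUERY_BUFFER_SIZE == m_query_next_pos holds.
struct QueryRingSketch
{
  static constexpr u32 kSize = 512;  // stands in for PERF_QUERY_BUFFER_SIZE

  u32 readback_pos = 0;  // oldest outstanding query
  u32 next_pos = 0;      // slot used by the next query
  u32 count = 0;         // number of outstanding queries

  u32 Begin()  // what an EnableQuery/DisableQuery pair does to the counters
  {
    const u32 index = next_pos;
    next_pos = (next_pos + 1) % kSize;
    count++;
    return index;
  }

  void Retire(u32 n)  // what ReadbackQueries(n) does to the counters
  {
    readback_pos = (readback_pos + n) % kSize;
    count -= n;
  }
};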

void PerfQuery::ResetQuery()
{
m_query_count = 0;
m_query_read_pos = 0;
m_query_readback_pos = 0;
m_query_next_pos = 0;
std::fill_n(m_results, ArraySize(m_results), 0);

// Reset entire query pool, ensuring all queries are ready to write to.
StateTracker::GetInstance()->EndRenderPass();
vkCmdResetQueryPool(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, 0,
PERF_QUERY_BUFFER_SIZE);

for (auto& entry : m_query_buffer)
{
entry.pending_fence = VK_NULL_HANDLE;
entry.available = false;
entry.active = false;
}
std::memset(m_query_buffer.data(), 0, sizeof(ActiveQuery) * m_query_buffer.size());
}

u32 PerfQuery::GetQueryResult(PerfQueryType type)
{
u32 result = 0;

if (type == PQ_ZCOMP_INPUT_ZCOMPLOC || type == PQ_ZCOMP_OUTPUT_ZCOMPLOC)
{
result = m_results[PQG_ZCOMP_ZCOMPLOC];
}
else if (type == PQ_ZCOMP_INPUT || type == PQ_ZCOMP_OUTPUT)
{
result = m_results[PQG_ZCOMP];
}
else if (type == PQ_BLEND_INPUT)
{
result = m_results[PQG_ZCOMP] + m_results[PQG_ZCOMP_ZCOMPLOC];
}
else if (type == PQ_EFB_COPY_CLOCKS)
{
result = m_results[PQG_EFB_COPY_CLOCKS];
}

return result / 4;
}

void PerfQuery::FlushResults()
{
while (!IsFlushed())
BlockingPartialFlush();
PartialFlush(true);
}

bool PerfQuery::IsFlushed() const
@@ -175,194 +134,79 @@ bool PerfQuery::CreateQueryPool()
return true;
}

bool PerfQuery::CreateReadbackBuffer()
{
m_readback_buffer = StagingBuffer::Create(STAGING_BUFFER_TYPE_READBACK,
PERF_QUERY_BUFFER_SIZE * sizeof(PerfQueryDataType),
VK_BUFFER_USAGE_TRANSFER_DST_BIT);

// Leave the buffer persistently mapped; we invalidate it when we need to read.
if (!m_readback_buffer || !m_readback_buffer->Map())
return false;

return true;
}

void PerfQuery::QueueCopyQueryResults(u32 start_index, u32 query_count)
void PerfQuery::ReadbackQueries()
{
DEBUG_LOG(VIDEO, "queue copy of queries %u-%u", start_index, start_index + query_count - 1);
const u64 completed_fence_counter = g_command_buffer_mgr->GetCompletedFenceCounter();

// Transition buffer for GPU write
// TODO: Is this needed?
m_readback_buffer->PrepareForGPUWrite(g_command_buffer_mgr->GetCurrentCommandBuffer(),
VK_ACCESS_TRANSFER_WRITE_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT);

// Copy from queries -> buffer
vkCmdCopyQueryPoolResults(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool,
start_index, query_count, m_readback_buffer->GetBuffer(),
start_index * sizeof(PerfQueryDataType), sizeof(PerfQueryDataType),
VK_QUERY_RESULT_WAIT_BIT);

// Prepare for host readback
m_readback_buffer->FlushGPUCache(g_command_buffer_mgr->GetCurrentCommandBuffer(),
VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT);

// Reset queries so they're ready to use again
vkCmdResetQueryPool(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, start_index,
query_count);

// Flag all queries as available, but with a fence that has to be completed first
for (u32 i = 0; i < query_count; i++)
{
u32 index = start_index + i;
ActiveQuery& entry = m_query_buffer[index];
entry.pending_fence = g_command_buffer_mgr->GetCurrentCommandBufferFence();
entry.available = true;
entry.active = false;
}
}

void PerfQuery::FlushQueries()
{
// Flag all pending queries that aren't available as available after execution.
u32 copy_start_index = 0;
u32 copy_count = 0;
for (u32 i = 0; i < m_query_count; i++)
{
u32 index = (m_query_read_pos + i) % PERF_QUERY_BUFFER_SIZE;
ActiveQuery& entry = m_query_buffer[index];

// Skip already-copied queries (will happen if a flush hasn't occurred and
// a command buffer hasn't finished executing).
if (entry.available)
{
// These should be grouped together, and at the start.
ASSERT(copy_count == 0);
continue;
}

// If this wrapped around, we need to flush the entries before the end of the buffer.
ASSERT(entry.active);
if (index < copy_start_index)
{
QueueCopyQueryResults(copy_start_index, copy_count);
copy_start_index = index;
copy_count = 0;
}
else if (copy_count == 0)
{
copy_start_index = index;
}
copy_count++;
}

if (copy_count > 0)
QueueCopyQueryResults(copy_start_index, copy_count);
}

void PerfQuery::OnFenceSignaled(VkFence fence)
{
// Need to save these since ProcessResults will modify them.
u32 query_read_pos = m_query_read_pos;
u32 query_count = m_query_count;

// Flush as many queries as are bound to this fence.
u32 flush_start_index = 0;
u32 flush_count = 0;
for (u32 i = 0; i < query_count; i++)
const u32 outstanding_queries = m_query_count;
u32 readback_count = 0;
for (u32 i = 0; i < outstanding_queries; i++)
{
u32 index = (query_read_pos + i) % PERF_QUERY_BUFFER_SIZE;
if (m_query_buffer[index].pending_fence != fence)
{
// These should be grouped together, at the end.
u32 index = (m_query_readback_pos + readback_count) % PERF_QUERY_BUFFER_SIZE;
const ActiveQuery& entry = m_query_buffer[index];
if (entry.fence_counter > completed_fence_counter)
break;
}

// If this wrapped around, we need to flush the entries before the end of the buffer.
if (index < flush_start_index)
{
ProcessResults(flush_start_index, flush_count);
flush_start_index = index;
flush_count = 0;
}
else if (flush_count == 0)
if (index < m_query_readback_pos)
{
flush_start_index = index;
ReadbackQueries(readback_count);
DEBUG_ASSERT(m_query_readback_pos == 0);
readback_count = 0;
}
flush_count++;

readback_count++;
}

if (flush_count > 0)
ProcessResults(flush_start_index, flush_count);
if (readback_count > 0)
ReadbackQueries(readback_count);
}
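// Worked example of the wrap handling above (illustrative numbers): with
// PERF_QUERY_BUFFER_SIZE = 512, m_query_readback_pos = 510 and four completed queries,
// the outstanding indices are 510, 511, 0 and 1. When the loop reaches index 0 it calls
// ReadbackQueries(2) to drain 510 and 511, which wraps m_query_readback_pos to 0, and the
// final ReadbackQueries(2) after the loop drains indices 0 and 1.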

void PerfQuery::ProcessResults(u32 start_index, u32 query_count)
void PerfQuery::ReadbackQueries(u32 query_count)
{
// Invalidate CPU caches before reading back.
m_readback_buffer->InvalidateCPUCache(start_index * sizeof(PerfQueryDataType),
query_count * sizeof(PerfQueryDataType));

// We cannot read back more queries than are currently pending.
ASSERT(query_count <= m_query_count);
DEBUG_LOG(VIDEO, "process queries %u-%u", start_index, start_index + query_count - 1);
ASSERT(query_count <= m_query_count &&
(m_query_readback_pos + query_count) <= PERF_QUERY_BUFFER_SIZE);

// Read back from the GPU.
VkResult res =
vkGetQueryPoolResults(g_vulkan_context->GetDevice(), m_query_pool, m_query_readback_pos,
query_count, query_count * sizeof(PerfQueryDataType),
m_query_result_buffer.data(), sizeof(PerfQueryDataType), 0);
if (res != VK_SUCCESS)
LOG_VULKAN_ERROR(res, "vkGetQueryPoolResults failed: ");

// Remove pending queries.
for (u32 i = 0; i < query_count; i++)
{
u32 index = (m_query_read_pos + i) % PERF_QUERY_BUFFER_SIZE;
u32 index = (m_query_readback_pos + i) % PERF_QUERY_BUFFER_SIZE;
ActiveQuery& entry = m_query_buffer[index];

// Should have a fence associated with it (waiting for a result).
ASSERT(entry.pending_fence != VK_NULL_HANDLE);
entry.pending_fence = VK_NULL_HANDLE;
entry.available = false;
entry.active = false;

// Grab the result from the readback buffer; it will already have been invalidated.
u32 result;
m_readback_buffer->Read(index * sizeof(PerfQueryDataType), &result, sizeof(result), false);
DEBUG_LOG(VIDEO, " query result %u", result);
DEBUG_ASSERT(entry.fence_counter != 0);
entry.fence_counter = 0;
entry.has_value = false;

// NOTE: Reported pixel metrics should be referenced to native resolution
m_results[entry.query_type] +=
static_cast<u32>(static_cast<u64>(result) * EFB_WIDTH / g_renderer->GetTargetWidth() *
EFB_HEIGHT / g_renderer->GetTargetHeight());
static_cast<u32>(static_cast<u64>(m_query_result_buffer[i]) * EFB_WIDTH /
g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight());
}

m_query_read_pos = (m_query_read_pos + query_count) % PERF_QUERY_BUFFER_SIZE;
m_query_readback_pos = (m_query_readback_pos + query_count) % PERF_QUERY_BUFFER_SIZE;
m_query_count -= query_count;
}
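// Worked example of the scaling above (assuming the usual EFB dimensions EFB_WIDTH = 640
// and EFB_HEIGHT = 528): at 2x internal resolution the render target is 1280x1056, so a
// raw count is multiplied by 640/1280 and then by 528/1056, i.e. divided by 4. A query
// that returned 40000 covered samples is therefore accumulated as 10000 native pixels.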

void PerfQuery::NonBlockingPartialFlush()
void PerfQuery::PartialFlush(bool blocking)
{
if (IsFlushed())
return;

// Submit a command buffer in the background if the front query is not bound to one.
// Ideally this will complete before the buffer fills.
if (m_query_buffer[m_query_read_pos].pending_fence == VK_NULL_HANDLE)
Renderer::GetInstance()->ExecuteCommandBuffer(true, false);
}

void PerfQuery::BlockingPartialFlush()
{
if (IsFlushed())
return;

// If the first pending query needs command buffer execution, do that.
ActiveQuery& entry = m_query_buffer[m_query_read_pos];
if (entry.pending_fence == VK_NULL_HANDLE)
if (blocking || m_query_buffer[m_query_readback_pos].fence_counter ==
g_command_buffer_mgr->GetCurrentFenceCounter())
{
// This will call back OnCommandBufferQueued, which will set the fence on the entry.
// We wait for completion, which will also call OnCommandBufferExecuted, and clear the fence.
Renderer::GetInstance()->ExecuteCommandBuffer(false, true);
}
else
{
// The command buffer has been submitted, but is awaiting completion.
// Wait for the fence to complete, which will call OnCommandBufferExecuted.
g_command_buffer_mgr->WaitForFence(entry.pending_fence);
Renderer::GetInstance()->ExecuteCommandBuffer(true, blocking);
}

ReadbackQueries();
}
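// The decision above, unpacked (illustrative): when blocking, or when the oldest
// outstanding query was recorded in the command buffer still being built (its fence
// counter equals GetCurrentFenceCounter()), the command buffer is submitted on this
// thread and waited on, so the ReadbackQueries() call below has results to drain.
// Otherwise the current buffer is only kicked off in the background and whatever has
// already completed is read back.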
} // namespace Vulkan
@@ -13,18 +13,15 @@

namespace Vulkan
{
class StagingBuffer;

class PerfQuery : public PerfQueryBase
{
public:
PerfQuery();
~PerfQuery();

static PerfQuery* GetInstance();
static PerfQuery* GetInstance() { return static_cast<PerfQuery*>(g_perf_query.get()); }

bool Initialize();
void FlushQueries();

void EnableQuery(PerfQueryGroup type) override;
void DisableQuery(PerfQueryGroup type) override;
@@ -34,37 +31,30 @@ class PerfQuery : public PerfQueryBase
bool IsFlushed() const override;

private:
// u32 is used for the sample counts.
using PerfQueryDataType = u32;

// when testing in SMS: 64 was too small, 128 was ok
// TODO: This should be size_t, but the base class uses u32s
static const u32 PERF_QUERY_BUFFER_SIZE = 512;

struct ActiveQuery
{
u64 fence_counter;
PerfQueryType query_type;
VkFence pending_fence;
bool available;
bool active;
bool has_value;
};

bool CreateQueryPool();
bool CreateReadbackBuffer();
void QueueCopyQueryResults(u32 start_index, u32 query_count);
void ProcessResults(u32 start_index, u32 query_count);

void OnCommandBufferQueued(VkCommandBuffer command_buffer, VkFence fence);
void OnFenceSignaled(VkFence fence);

void NonBlockingPartialFlush();
void BlockingPartialFlush();
void ReadbackQueries();
void ReadbackQueries(u32 query_count);
void PartialFlush(bool blocking);

// when testing in SMS: 64 was too small, 128 was ok
// TODO: This should be size_t, but the base class uses u32s
using PerfQueryDataType = u32;
static const u32 PERF_QUERY_BUFFER_SIZE = 512;
std::array<ActiveQuery, PERF_QUERY_BUFFER_SIZE> m_query_buffer = {};
u32 m_query_read_pos = 0;

// TODO: Investigate using pipeline statistics to implement other query types
VkQueryPool m_query_pool = VK_NULL_HANDLE;

// Buffer containing query results. Each query is a u32.
std::unique_ptr<StagingBuffer> m_readback_buffer;
u32 m_query_readback_pos = 0;
u32 m_query_next_pos = 0;
std::array<ActiveQuery, PERF_QUERY_BUFFER_SIZE> m_query_buffer = {};
std::array<PerfQueryDataType, PERF_QUERY_BUFFER_SIZE> m_query_result_buffer = {};
};
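// Typical lifetime of one occlusion query with this class (illustrative; uses only the
// PerfQueryBase interface referenced above):
//   PerfQuery::GetInstance()->EnableQuery(PQG_ZCOMP);   // begins a query in a free ring slot
//   ... draw calls ...
//   PerfQuery::GetInstance()->DisableQuery(PQG_ZCOMP);  // ends it and advances m_query_next_pos
//   PerfQuery::GetInstance()->FlushResults();           // flushes until every result has landed
//   const u32 samples = PerfQuery::GetInstance()->GetQueryResult(PQ_ZCOMP_OUTPUT);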

} // namespace Vulkan
@@ -304,7 +304,6 @@ void Renderer::PresentBackbuffer()
{
// End drawing to backbuffer
StateTracker::GetInstance()->EndRenderPass();
PerfQuery::GetInstance()->FlushQueries();

// Transition the backbuffer to PRESENT_SRC to ensure all commands drawing
// to it have finished before present.
@@ -315,7 +314,7 @@ void Renderer::PresentBackbuffer()
// Because this final command buffer is rendering to the swap chain, we need to wait for
// the available semaphore to be signaled before executing the buffer. This final submission
// can happen off-thread in the background while we're preparing the next frame.
g_command_buffer_mgr->SubmitCommandBuffer(true, m_swap_chain->GetSwapChain(),
g_command_buffer_mgr->SubmitCommandBuffer(true, false, m_swap_chain->GetSwapChain(),
m_swap_chain->GetCurrentImageIndex());

// New cmdbuffer, so invalidate state.
@@ -325,13 +324,8 @@ void Renderer::PresentBackbuffer()
void Renderer::ExecuteCommandBuffer(bool submit_off_thread, bool wait_for_completion)
{
StateTracker::GetInstance()->EndRenderPass();
PerfQuery::GetInstance()->FlushQueries();

// If we're waiting for completion, don't bother waking the worker thread.
const VkFence pending_fence = g_command_buffer_mgr->GetCurrentCommandBufferFence();
g_command_buffer_mgr->SubmitCommandBuffer(submit_off_thread && wait_for_completion);
if (wait_for_completion)
g_command_buffer_mgr->WaitForFence(pending_fence);
g_command_buffer_mgr->SubmitCommandBuffer(submit_off_thread, wait_for_completion);

StateTracker::GetInstance()->InvalidateCachedState();
}
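// Illustrative call modes, as used by PerfQuery::PartialFlush and VKStagingTexture::Flush
// in this change: ExecuteCommandBuffer(true, false) hands the submit off to the worker
// thread and returns without waiting, while ExecuteCommandBuffer(false, true) submits on
// the calling thread and waits for the fence, which is what readback paths need.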
@@ -550,10 +544,6 @@ void Renderer::UnbindTexture(const AbstractTexture* texture)

void Renderer::ResetSamplerStates()
{
// Ensure none of the sampler objects are in use.
// This assumes that none of the samplers are in use on the command list currently being recorded.
g_command_buffer_mgr->WaitForGPUIdle();

// Invalidate all sampler states, next draw will re-initialize them.
for (u32 i = 0; i < m_sampler_states.size(); i++)
{
@@ -62,11 +62,13 @@ bool StateTracker::Initialize()
VKTexture::Create(TextureConfig(1, 1, 1, 1, 1, AbstractTextureFormat::RGBA8, 0));
if (!m_dummy_texture)
return false;
m_dummy_texture->TransitionToLayout(g_command_buffer_mgr->GetCurrentInitCommandBuffer(),
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);

// Initialize all samplers to point by default
for (size_t i = 0; i < NUM_PIXEL_SHADER_SAMPLERS; i++)
{
m_bindings.samplers[i].imageLayout = VK_IMAGE_LAYOUT_UNDEFINED;
m_bindings.samplers[i].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
m_bindings.samplers[i].imageView = m_dummy_texture->GetView();
m_bindings.samplers[i].sampler = g_object_cache->GetPointSampler();
}
@@ -223,14 +225,14 @@ void StateTracker::UnbindTexture(VkImageView view)
if (it.imageView == view)
{
it.imageView = m_dummy_texture->GetView();
it.imageLayout = VK_IMAGE_LAYOUT_UNDEFINED;
it.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
}
}

if (m_bindings.image_texture.imageView == view)
{
m_bindings.image_texture.imageView = m_dummy_texture->GetView();
m_bindings.image_texture.imageLayout = VK_IMAGE_LAYOUT_UNDEFINED;
m_bindings.image_texture.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
}
}

@@ -19,14 +19,10 @@ namespace Vulkan
{
StreamBuffer::StreamBuffer(VkBufferUsageFlags usage, u32 size) : m_usage(usage), m_size(size)
{
g_command_buffer_mgr->AddFenceSignaledCallback(
this, std::bind(&StreamBuffer::OnFenceSignaled, this, std::placeholders::_1));
}

StreamBuffer::~StreamBuffer()
{
g_command_buffer_mgr->RemoveFenceSignaledCallback(this);

if (m_host_pointer)
vkUnmapMemory(g_vulkan_context->GetDevice(), m_memory);

@@ -189,8 +185,6 @@ bool StreamBuffer::ReserveMemory(u32 num_bytes, u32 alignment)
// Can we find a fence to wait on that will give us enough memory?
if (WaitForClearSpace(required_bytes))
{
ASSERT(m_current_offset == m_current_gpu_position ||
(m_current_offset + required_bytes) < m_current_gpu_position);
m_current_offset = Common::AlignUp(m_current_offset, alignment);
m_last_allocation_size = num_bytes;
return true;
@@ -225,36 +219,40 @@ void StreamBuffer::UpdateCurrentFencePosition()
return;

// Has the offset changed since the last fence?
const VkFence fence = g_command_buffer_mgr->GetCurrentCommandBufferFence();
if (!m_tracked_fences.empty() && m_tracked_fences.back().first == fence)
const u64 counter = g_command_buffer_mgr->GetCurrentFenceCounter();
if (!m_tracked_fences.empty() && m_tracked_fences.back().first == counter)
{
// Still haven't executed a command buffer, so just update the offset.
m_tracked_fences.back().second = m_current_offset;
return;
}

m_tracked_fences.emplace_back(fence, m_current_offset);
// New command buffer, so update the GPU position while we're at it.
UpdateGPUPosition();
m_tracked_fences.emplace_back(counter, m_current_offset);
}

void StreamBuffer::OnFenceSignaled(VkFence fence)
void StreamBuffer::UpdateGPUPosition()
{
// Locate the entry for this fence (if any, we may have been forced to wait already)
auto iter = std::find_if(m_tracked_fences.begin(), m_tracked_fences.end(),
[fence](const auto& it) { return it.first == fence; });
auto start = m_tracked_fences.begin();
auto end = start;

if (iter != m_tracked_fences.end())
const u64 completed_counter = g_command_buffer_mgr->GetCompletedFenceCounter();
while (end != m_tracked_fences.end() && completed_counter >= end->first)
{
// Update the GPU position, and remove any fences before this fence (since
// it is implied that they have been signaled as well, though the callback
// should have removed them already).
m_current_gpu_position = iter->second;
m_tracked_fences.erase(m_tracked_fences.begin(), ++iter);
m_current_gpu_position = end->second;
++end;
}

if (start != end)
m_tracked_fences.erase(start, end);
}
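// Minimal sketch of the fence-counter scheme UpdateGPUPosition relies on (assumed names;
// u32/u64 are the Dolphin typedefs and std::deque/std::pair come from <deque>/<utility>):
// submissions are tagged with a monotonically increasing counter and complete in order,
// so every tracked (counter, offset) pair at or below the last completed counter can be
// retired from the front of the deque in one pass, replacing the old per-VkFence callback.
struct FencePositionTrackerSketch
{
  std::deque<std::pair<u64, u32>> tracked;  // (fence counter, buffer offset)
  u32 gpu_position = 0;

  void Retire(u64 completed_counter)
  {
    while (!tracked.empty() && completed_counter >= tracked.front().first)
    {
      gpu_position = tracked.front().second;
      tracked.pop_front();
    }
  }
};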

bool StreamBuffer::WaitForClearSpace(u32 num_bytes)
{
u32 new_offset = 0;
u32 new_gpu_position = 0;

auto iter = m_tracked_fences.begin();
for (; iter != m_tracked_fences.end(); iter++)
{
@@ -265,20 +263,32 @@ bool StreamBuffer::WaitForClearSpace(u32 num_bytes)
u32 gpu_position = iter->second;
if (m_current_offset == gpu_position)
{
// Start at the start of the buffer again.
new_offset = 0;
new_gpu_position = 0;
break;
}

// Assuming that we wait for this fence, are we allocating in front of the GPU?
if (m_current_offset > gpu_position)
{
// This would suggest the GPU has now followed us and wrapped around, so we have
// m_current_offset..m_size free, as well as 0..gpu_position.
const u32 remaining_space_after_offset = m_size - m_current_offset;
if (remaining_space_after_offset >= num_bytes)
{
// Switch to allocating in front of the GPU, using the remainder of the buffer.
new_offset = m_current_offset;
new_gpu_position = gpu_position;
break;
}

// We can wrap around to the start, behind the GPU, if there is enough space.
// We use > here because otherwise we'd end up lining up with the GPU, and then the
// allocator would assume that the GPU has consumed what we just wrote.
if (gpu_position > num_bytes)
{
new_offset = 0;
new_gpu_position = gpu_position;
break;
}
}
@@ -292,6 +302,7 @@ bool StreamBuffer::WaitForClearSpace(u32 num_bytes)
{
// Leave the offset as-is, but update the GPU position.
new_offset = m_current_offset;
new_gpu_position = gpu_position;
break;
}
}
@@ -300,14 +311,17 @@ bool StreamBuffer::WaitForClearSpace(u32 num_bytes)
// Did any fences satisfy this condition?
// Has the command buffer been executed yet? If not, the caller should execute it.
if (iter == m_tracked_fences.end() ||
iter->first == g_command_buffer_mgr->GetCurrentCommandBufferFence())
iter->first == g_command_buffer_mgr->GetCurrentFenceCounter())
{
return false;
}

// Wait until this fence is signaled. This will fire the callback, updating the GPU position.
g_command_buffer_mgr->WaitForFence(iter->first);
g_command_buffer_mgr->WaitForFenceCounter(iter->first);
m_tracked_fences.erase(m_tracked_fences.begin(),
m_current_offset == iter->second ? m_tracked_fences.end() : ++iter);
m_current_offset = new_offset;
m_current_gpu_position = new_gpu_position;
return true;
}

@@ -34,7 +34,7 @@ class StreamBuffer
private:
bool AllocateBuffer();
void UpdateCurrentFencePosition();
void OnFenceSignaled(VkFence fence);
void UpdateGPUPosition();

// Waits for as many fences as needed to allocate num_bytes bytes from the buffer.
bool WaitForClearSpace(u32 num_bytes);
@@ -50,7 +50,7 @@ class StreamBuffer
u8* m_host_pointer = nullptr;

// List of fences and the corresponding positions in the buffer
std::deque<std::pair<VkFence, u32>> m_tracked_fences;
std::deque<std::pair<u64, u32>> m_tracked_fences;

bool m_coherent_mapping = false;
};
@@ -674,11 +674,7 @@ VKStagingTexture::VKStagingTexture(StagingTextureType type, const TextureConfig&
{
}

VKStagingTexture::~VKStagingTexture()
{
if (m_needs_flush)
VKStagingTexture::Flush();
}
VKStagingTexture::~VKStagingTexture() = default;

std::unique_ptr<VKStagingTexture> VKStagingTexture::Create(StagingTextureType type,
const TextureConfig& config)
@@ -739,14 +735,6 @@ void VKStagingTexture::CopyFromTexture(const AbstractTexture* src,
ASSERT(dst_rect.left >= 0 && static_cast<u32>(dst_rect.right) <= m_config.width &&
dst_rect.top >= 0 && static_cast<u32>(dst_rect.bottom) <= m_config.height);

if (m_needs_flush)
{
// Drop copy before reusing it.
g_command_buffer_mgr->RemoveFenceSignaledCallback(this);
m_flush_fence = VK_NULL_HANDLE;
m_needs_flush = false;
}

StateTracker::GetInstance()->EndRenderPass();

VkImageLayout old_layout = src_tex->GetLayout();
@@ -773,16 +761,7 @@ void VKStagingTexture::CopyFromTexture(const AbstractTexture* src,
src_tex->TransitionToLayout(g_command_buffer_mgr->GetCurrentCommandBuffer(), old_layout);

m_needs_flush = true;
m_flush_fence = g_command_buffer_mgr->GetCurrentCommandBufferFence();
g_command_buffer_mgr->AddFenceSignaledCallback(this, [this](VkFence fence) {
if (m_flush_fence != fence)
return;

m_flush_fence = VK_NULL_HANDLE;
m_needs_flush = false;
g_command_buffer_mgr->RemoveFenceSignaledCallback(this);
m_staging_buffer->InvalidateCPUCache();
});
m_flush_fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter();
}

void VKStagingTexture::CopyToTexture(const MathUtil::Rectangle<int>& src_rect, AbstractTexture* dst,
@@ -798,14 +777,6 @@ void VKStagingTexture::CopyToTexture(const MathUtil::Rectangle<int>& src_rect, A
ASSERT(dst_rect.left >= 0 && static_cast<u32>(dst_rect.right) <= dst_tex->GetWidth() &&
dst_rect.top >= 0 && static_cast<u32>(dst_rect.bottom) <= dst_tex->GetHeight());

if (m_needs_flush)
{
// Drop copy before reusing it.
g_command_buffer_mgr->RemoveFenceSignaledCallback(this);
m_flush_fence = VK_NULL_HANDLE;
m_needs_flush = false;
}

// Flush caches before copying.
m_staging_buffer->FlushCPUCache();
StateTracker::GetInstance()->EndRenderPass();
@@ -833,15 +804,7 @@ void VKStagingTexture::CopyToTexture(const MathUtil::Rectangle<int>& src_rect, A
dst_tex->TransitionToLayout(g_command_buffer_mgr->GetCurrentCommandBuffer(), old_layout);

m_needs_flush = true;
m_flush_fence = g_command_buffer_mgr->GetCurrentCommandBufferFence();
g_command_buffer_mgr->AddFenceSignaledCallback(this, [this](VkFence fence) {
if (m_flush_fence != fence)
return;

m_flush_fence = VK_NULL_HANDLE;
m_needs_flush = false;
g_command_buffer_mgr->RemoveFenceSignaledCallback(this);
});
m_flush_fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter();
}

bool VKStagingTexture::Map()
@@ -860,25 +823,23 @@ void VKStagingTexture::Flush()
if (!m_needs_flush)
return;

// Either of the below two calls will cause the callback to fire.
g_command_buffer_mgr->RemoveFenceSignaledCallback(this);
if (m_flush_fence == g_command_buffer_mgr->GetCurrentCommandBufferFence())
// Is this copy in the current command buffer?
if (g_command_buffer_mgr->GetCurrentFenceCounter() == m_flush_fence_counter)
{
// The readback is in the current command buffer, and we must execute it.
// Execute the command buffer and wait for it to finish.
Renderer::GetInstance()->ExecuteCommandBuffer(false, true);
}
else
{
// WaitForFence should fire the callback.
g_command_buffer_mgr->WaitForFence(m_flush_fence);
// Wait for the GPU to finish with it.
g_command_buffer_mgr->WaitForFenceCounter(m_flush_fence_counter);
}

DEBUG_ASSERT(m_flush_fence == VK_NULL_HANDLE);
m_needs_flush = false;

// For readback textures, invalidate the CPU cache as there is new data there.
if (m_type == StagingTextureType::Readback || m_type == StagingTextureType::Mutable)
m_staging_buffer->InvalidateCPUCache();

m_needs_flush = false;
}
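// The branch above is the general rule this change uses wherever the CPU must see the
// results of GPU work. A minimal sketch (the helper name is hypothetical; the calls are
// the ones used in this diff): work recorded in the command buffer currently being built
// has no fence yet, so it must be submitted; anything already submitted can simply be
// waited on by its fence counter.
static void WaitForGPUWorkSketch(u64 fence_counter)
{
  if (fence_counter == g_command_buffer_mgr->GetCurrentFenceCounter())
    Renderer::GetInstance()->ExecuteCommandBuffer(false, true);  // submit this buffer and wait
  else
    g_command_buffer_mgr->WaitForFenceCounter(fence_counter);  // already submitted, just wait
}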

VKFramebuffer::VKFramebuffer(VKTexture* color_attachment, VKTexture* depth_attachment, u32 width,
@@ -104,7 +104,7 @@ class VKStagingTexture final : public AbstractStagingTexture
std::unique_ptr<StagingBuffer> buffer);

std::unique_ptr<StagingBuffer> m_staging_buffer;
VkFence m_flush_fence = VK_NULL_HANDLE;
u64 m_flush_fence_counter = 0;
};

class VKFramebuffer final : public AbstractFramebuffer
@@ -60,11 +60,11 @@ VertexManager::~VertexManager()
bool VertexManager::Initialize()
{
m_vertex_stream_buffer =
StreamBuffer::Create(VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VERTEX_STREAM_BUFFER_SIZE * 4);
StreamBuffer::Create(VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VERTEX_STREAM_BUFFER_SIZE);
m_index_stream_buffer =
StreamBuffer::Create(VK_BUFFER_USAGE_INDEX_BUFFER_BIT, INDEX_STREAM_BUFFER_SIZE * 4);
StreamBuffer::Create(VK_BUFFER_USAGE_INDEX_BUFFER_BIT, INDEX_STREAM_BUFFER_SIZE);
m_uniform_stream_buffer =
StreamBuffer::Create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, UNIFORM_STREAM_BUFFER_SIZE * 4);
StreamBuffer::Create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, UNIFORM_STREAM_BUFFER_SIZE);
if (!m_vertex_stream_buffer || !m_index_stream_buffer || !m_uniform_stream_buffer)
{
PanicAlert("Failed to allocate streaming buffers");
@@ -251,8 +251,8 @@ bool VideoBackend::Initialize(const WindowSystemInfo& wsi)

void VideoBackend::Shutdown()
{
if (g_command_buffer_mgr)
g_command_buffer_mgr->WaitForGPUIdle();
if (g_vulkan_context)
vkDeviceWaitIdle(g_vulkan_context->GetDevice());

if (g_shader_cache)
g_shader_cache->Shutdown();
@@ -96,6 +96,9 @@ TextureCacheBase::TextureCacheBase()

TextureCacheBase::~TextureCacheBase()
{
// Clear pending EFB copies first, so we don't try to flush them.
m_pending_efb_copies.clear();

HiresTexture::Shutdown();
Invalidate();
Common::FreeAlignedMemory(temp);