Skip to content
Permalink
Browse files

Upload using the copy queue in Vulkan, which should utilize PCIe DMA transfers. Unfortunately it doesn't seem to be faster.
  • Loading branch information
dpjudas committed Nov 23, 2019
1 parent bff22bb commit 70842720d2ebb401c947a71575c72f73bd8326ad
@@ -309,7 +309,7 @@ class PipelineBarrier
void addImage(VulkanImage *image, VkImageLayout oldLayout, VkImageLayout newLayout, VkAccessFlags srcAccessMask, VkAccessFlags dstAccessMask, VkImageAspectFlags aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, int baseMipLevel = 0, int levelCount = 1);
void addImage(VkImage image, VkImageLayout oldLayout, VkImageLayout newLayout, VkAccessFlags srcAccessMask, VkAccessFlags dstAccessMask, VkImageAspectFlags aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, int baseMipLevel = 0, int levelCount = 1);
void addQueueTransfer(int srcFamily, int dstFamily, VulkanBuffer *buffer, VkAccessFlags srcAccessMask, VkAccessFlags dstAccessMask);
void addQueueTransfer(int srcFamily, int dstFamily, VulkanImage *image, VkImageLayout layout, VkImageAspectFlags aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, int baseMipLevel = 0, int levelCount = 1);
void addQueueTransfer(int srcFamily, int dstFamily, VulkanImage *image, VkImageLayout oldlayout, VkImageLayout newlayout, VkImageAspectFlags aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, int baseMipLevel = 0, int levelCount = 1);

void execute(VulkanCommandBuffer *commandBuffer, VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask, VkDependencyFlags dependencyFlags = 0);

@@ -1234,12 +1234,12 @@ inline void PipelineBarrier::addQueueTransfer(int srcFamily, int dstFamily, Vulk
bufferMemoryBarriers.push_back(barrier);
}

inline void PipelineBarrier::addQueueTransfer(int srcFamily, int dstFamily, VulkanImage *image, VkImageLayout layout, VkImageAspectFlags aspectMask, int baseMipLevel, int levelCount)
inline void PipelineBarrier::addQueueTransfer(int srcFamily, int dstFamily, VulkanImage *image, VkImageLayout oldlayout, VkImageLayout newlayout, VkImageAspectFlags aspectMask, int baseMipLevel, int levelCount)
{
VkImageMemoryBarrier barrier = { };
barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
barrier.oldLayout = layout;
barrier.newLayout = layout;
barrier.oldLayout = oldlayout;
barrier.newLayout = newlayout;
barrier.srcQueueFamilyIndex = srcFamily;
barrier.dstQueueFamilyIndex = dstFamily;
barrier.image = image->image;
@@ -163,6 +163,18 @@ void VulkanDevice::SelectPhysicalDevice()
}
}

// Search for a transfer family made specifically for uploading. On NVIDIA hardware this allows us to upload using DMA transfers via PCIe.
// To identify it, we look for a queue family that supports transfer but has neither graphics nor compute capabilities.
for (int i = 0; i < (int)info.QueueFamilies.size(); i++)
{
const auto& queueFamily = info.QueueFamilies[i];
if (queueFamily.queueCount > 0 && (queueFamily.queueFlags & (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT)) == 0 && (queueFamily.queueFlags & VK_QUEUE_TRANSFER_BIT))
{
dev.copyQueueTransferFamily = i;
break;
}
}

if (dev.graphicsFamily != -1 && dev.presentFamily != -1)
{
SupportedDevices.push_back(dev);
@@ -206,6 +218,7 @@ void VulkanDevice::SelectPhysicalDevice()
PhysicalDevice = *SupportedDevices[selected].device;
graphicsFamily = SupportedDevices[selected].graphicsFamily;
presentFamily = SupportedDevices[selected].presentFamily;
copyQueueTransferFamily = SupportedDevices[selected].copyQueueTransferFamily;
graphicsTimeQueries = SupportedDevices[selected].graphicsTimeQueries;
}

@@ -234,6 +247,8 @@ void VulkanDevice::CreateDevice()
std::set<int> neededFamilies;
neededFamilies.insert(graphicsFamily);
neededFamilies.insert(presentFamily);
if (copyQueueTransferFamily != -1)
neededFamilies.insert(copyQueueTransferFamily);

for (int index : neededFamilies)
{
@@ -261,6 +276,9 @@ void VulkanDevice::CreateDevice()

vkGetDeviceQueue(device, graphicsFamily, 0, &graphicsQueue);
vkGetDeviceQueue(device, presentFamily, 0, &presentQueue);

if (copyQueueTransferFamily != -1)
vkGetDeviceQueue(device, copyQueueTransferFamily, 0, &copyQueue);
}

void VulkanDevice::CreateSurface()
@@ -30,6 +30,7 @@ class VulkanCompatibleDevice
VulkanPhysicalDevice *device = nullptr;
int graphicsFamily = -1;
int presentFamily = -1;
int copyQueueTransferFamily = -1;
bool graphicsTimeQueries = false;
};

@@ -72,11 +73,13 @@ class VulkanDevice
VkDevice device = VK_NULL_HANDLE;
VmaAllocator allocator = VK_NULL_HANDLE;

VkQueue copyQueue = VK_NULL_HANDLE;
VkQueue graphicsQueue = VK_NULL_HANDLE;
VkQueue presentQueue = VK_NULL_HANDLE;

int graphicsFamily = -1;
int presentFamily = -1;
int copyQueueTransferFamily = -1;
bool graphicsTimeQueries = false;

private:
@@ -146,6 +146,8 @@ void VulkanFrameBuffer::InitializeState()
maxuniformblock = device->PhysicalDevice.Properties.limits.maxUniformBufferRange;

mCommandPool.reset(new VulkanCommandPool(device, device->graphicsFamily));
if (device->copyQueueTransferFamily != -1)
mCopyQueueCommandPool.reset(new VulkanCommandPool(device, device->copyQueueTransferFamily));

mScreenBuffers.reset(new VkRenderBuffers());
mSaveBuffers.reset(new VkRenderBuffers());
@@ -218,7 +220,7 @@ void VulkanFrameBuffer::DeleteFrameObjects()
FrameDeleteList.CommandBuffers.clear();
}

void VulkanFrameBuffer::FlushCommands(VulkanCommandBuffer **commands, size_t count, bool finish, bool lastsubmit)
void VulkanFrameBuffer::FlushCommands(VkQueue queue, VulkanCommandBuffer **commands, size_t count, bool finish, bool lastsubmit)
{
int currentIndex = mNextSubmit % maxConcurrentSubmitCount;

@@ -245,14 +247,23 @@ void VulkanFrameBuffer::FlushCommands(VulkanCommandBuffer **commands, size_t cou
if (!lastsubmit)
submit.addSignal(mSubmitSemaphore[currentIndex].get());

submit.execute(device, device->graphicsQueue, mSubmitFence[currentIndex].get());
submit.execute(device, queue, mSubmitFence[currentIndex].get());
mNextSubmit++;
}

void VulkanFrameBuffer::FlushCommands(bool finish, bool lastsubmit)
{
mRenderState->EndRenderPass();

if (mCopyQueueCommands)
{
mCopyQueueCommands->end();
VulkanCommandBuffer* command = mCopyQueueCommands.get();
FrameDeleteList.CommandBuffers.push_back(std::move(mCopyQueueCommands));

FlushCommands(device->copyQueue, &command, 1, false, false);
}

if (mDrawCommands || mTransferCommands)
{
VulkanCommandBuffer *commands[2];
@@ -272,7 +283,7 @@ void VulkanFrameBuffer::FlushCommands(bool finish, bool lastsubmit)
FrameDeleteList.CommandBuffers.push_back(std::move(mDrawCommands));
}

FlushCommands(commands, count, finish, lastsubmit);
FlushCommands(device->graphicsQueue, commands, count, finish, lastsubmit);

current_rendered_commandbuffers += (int)count;
}
@@ -906,6 +917,17 @@ void VulkanFrameBuffer::Draw2D()
::Draw2D(&m2DDrawer, *mRenderState);
}

// Returns the command buffer that is currently recording copy-queue work.
// If none is open, a new one is allocated from mCopyQueueCommandPool, given a
// debug name and put into the recording state before being returned.
//
// The returned pointer is non-owning; the buffer stays owned by
// mCopyQueueCommands.
//
// NOTE: mCopyQueueCommandPool is dereferenced without a null check. It is only
// created in InitializeState() when device->copyQueueTransferFamily != -1.
VulkanCommandBuffer *VulkanFrameBuffer::GetCopyQueueCommands()
{
	// Fast path: a buffer is already open for recording.
	if (mCopyQueueCommands)
		return mCopyQueueCommands.get();

	mCopyQueueCommands = mCopyQueueCommandPool->createBuffer();
	mCopyQueueCommands->SetDebugName("VulkanFrameBuffer.mCopyQueueCommands");
	mCopyQueueCommands->begin();
	return mCopyQueueCommands.get();
}

VulkanCommandBuffer *VulkanFrameBuffer::GetTransferCommands()
{
if (!mTransferCommands)
@@ -27,6 +27,7 @@ class VulkanFrameBuffer : public SystemBaseFrameBuffer
uint32_t presentImageIndex = 0xffffffff;
bool cur_vsync;

VulkanCommandBuffer *GetCopyQueueCommands();
VulkanCommandBuffer *GetTransferCommands();
VulkanCommandBuffer *GetDrawCommands();
VkShaderManager *GetShaderManager() { return mShaderManager.get(); }
@@ -118,7 +119,7 @@ class VulkanFrameBuffer : public SystemBaseFrameBuffer
void CopyScreenToBuffer(int w, int h, void *data);
void UpdateShadowMap();
void DeleteFrameObjects();
void FlushCommands(VulkanCommandBuffer **commands, size_t count, bool finish, bool lastsubmit);
void FlushCommands(VkQueue queue, VulkanCommandBuffer **commands, size_t count, bool finish, bool lastsubmit);

std::unique_ptr<VkShaderManager> mShaderManager;
std::unique_ptr<VkSamplerManager> mSamplerManager;
@@ -127,6 +128,8 @@ class VulkanFrameBuffer : public SystemBaseFrameBuffer
std::unique_ptr<VkPostprocess> mPostprocess;
std::unique_ptr<VkRenderPassManager> mRenderPassManager;
std::unique_ptr<VulkanCommandPool> mCommandPool;
std::unique_ptr<VulkanCommandPool> mCopyQueueCommandPool;
std::unique_ptr<VulkanCommandBuffer> mCopyQueueCommands;
std::unique_ptr<VulkanCommandBuffer> mTransferCommands;
std::unique_ptr<VkRenderState> mRenderState;

@@ -71,13 +71,18 @@ void VkHardwareTexture::Reset()

if (mappedSWFB)
{
mImage.Image->Unmap();
if (mTransferBuffer)
mTransferBuffer->Unmap();
else
mImage.Image->Unmap();
mappedSWFB = nullptr;
}

auto &deleteList = fb->FrameDeleteList;
if (mImage.Image) deleteList.Images.push_back(std::move(mImage.Image));
if (mImage.View) deleteList.ImageViews.push_back(std::move(mImage.View));
if (mTransferImage) deleteList.Images.push_back(std::move(mTransferImage));
if (mTransferBuffer) deleteList.Buffers.push_back(std::move(mTransferBuffer));
for (auto &it : mImage.RSFramebuffers) deleteList.Framebuffers.push_back(std::move(it.second));
if (mDepthStencil.Image) deleteList.Images.push_back(std::move(mDepthStencil.Image));
if (mDepthStencil.View) deleteList.ImageViews.push_back(std::move(mDepthStencil.View));
@@ -324,17 +329,52 @@ void VkHardwareTexture::AllocateBuffer(int w, int h, int texelsize)

VkFormat format = texelsize == 4 ? VK_FORMAT_B8G8R8A8_UNORM : VK_FORMAT_R8_UNORM;

ImageBuilder imgbuilder;
VkDeviceSize allocatedBytes = 0;
imgbuilder.setFormat(format);
imgbuilder.setSize(w, h);
imgbuilder.setLinearTiling();
imgbuilder.setUsage(VK_IMAGE_USAGE_SAMPLED_BIT, VMA_MEMORY_USAGE_UNKNOWN, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT);
imgbuilder.setMemoryType(
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
mImage.Image = imgbuilder.create(fb->device, &allocatedBytes);
mImage.Image->SetDebugName("VkHardwareTexture.mImage");
if (fb->device->copyQueueTransferFamily != -1)
{
// Use DMA transfer to get the image to the GPU

BufferBuilder bufbuilder;
bufbuilder.setUsage(VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VMA_MEMORY_USAGE_CPU_ONLY, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT);
bufbuilder.setSize(w * h * texelsize);
mTransferBuffer = bufbuilder.create(fb->device);
mTransferBuffer->SetDebugName("VkHardwareTexture.mTransferBuffer");

ImageBuilder imgbuilder0;
imgbuilder0.setFormat(format);
imgbuilder0.setSize(w, h);
imgbuilder0.setUsage(VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);
mTransferImage = imgbuilder0.create(fb->device);
mTransferImage->SetDebugName("VkHardwareTexture.mTransferImage");

ImageBuilder imgbuilder1;
imgbuilder1.setFormat(format);
imgbuilder1.setSize(w, h);
imgbuilder1.setUsage(VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);
mImage.Image = imgbuilder1.create(fb->device);
mImage.Image->SetDebugName("VkHardwareTexture.mImage");

bufferpitch = w;
}
else
{
// Memory map the image directly for GPUs where we have no transfer queue (i.e. Intel embedded GPUs)

ImageBuilder imgbuilder;
imgbuilder.setFormat(format);
imgbuilder.setSize(w, h);
imgbuilder.setLinearTiling();
imgbuilder.setUsage(VK_IMAGE_USAGE_SAMPLED_BIT, VMA_MEMORY_USAGE_UNKNOWN, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT);
imgbuilder.setMemoryType(
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

VkDeviceSize allocatedBytes = 0;
mImage.Image = imgbuilder.create(fb->device, &allocatedBytes);
mImage.Image->SetDebugName("VkHardwareTexture.mImage");

bufferpitch = int(allocatedBytes / h / texelsize);
}

mTexelsize = texelsize;

ImageViewBuilder viewbuilder;
@@ -347,20 +387,68 @@ void VkHardwareTexture::AllocateBuffer(int w, int h, int texelsize)
VkImageTransition imageTransition;
imageTransition.addImage(&mImage, VK_IMAGE_LAYOUT_GENERAL, true);
imageTransition.execute(cmdbuffer);

bufferpitch = int(allocatedBytes / h / texelsize);
}
}

// Returns the cached CPU mapping (mappedSWFB), creating it on the first call.
//
// When a transfer buffer exists, that buffer is mapped; otherwise the image
// itself is mapped directly. This mirrors Reset(), which unmaps the transfer
// buffer if present and the image otherwise. Both paths request
// width * height * mTexelsize bytes, using mImage.Image's dimensions.
//
// Subsequent calls return the stored pointer without mapping again.
uint8_t *VkHardwareTexture::MapBuffer()
{
	if (!mappedSWFB)
	{
		// The whole buffer/image selection must sit under the guard so that
		// exactly one resource is mapped, exactly once.
		const auto mapSize = mImage.Image->width * mImage.Image->height * mTexelsize;

		if (mTransferBuffer)
			mappedSWFB = (uint8_t*)mTransferBuffer->Map(0, mapSize);
		else
			mappedSWFB = (uint8_t*)mImage.Image->Map(0, mapSize);
	}
	return mappedSWFB;
}

unsigned int VkHardwareTexture::CreateTexture(unsigned char * buffer, int w, int h, int texunit, bool mipmap, int translation, const char *name)
{
if (mTransferBuffer)
{
auto fb = GetVulkanFrameBuffer();
auto copyqueue = fb->GetCopyQueueCommands();

// Acquire image, transfer buffer via copy queue (PCIe DMA), release image

PipelineBarrier barrier0;
barrier0.addQueueTransfer(fb->device->graphicsFamily, fb->device->copyQueueTransferFamily, mTransferImage.get(), VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
barrier0.execute(copyqueue, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT);

VkBufferImageCopy region = {};
region.imageExtent.width = mTransferImage->width;
region.imageExtent.height = mTransferImage->height;
region.imageExtent.depth = 1;
region.imageSubresource.mipLevel = 0;
region.imageSubresource.layerCount = 1;
region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
copyqueue->copyBufferToImage(mTransferBuffer->buffer, mTransferImage->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region);

PipelineBarrier barrier1;
barrier1.addQueueTransfer(fb->device->copyQueueTransferFamily, fb->device->graphicsFamily, mTransferImage.get(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
barrier1.execute(copyqueue, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT);

// Acquire image on graphics queue, make a copy of it (on the GPU), then release the image again back to the copy queue

auto gfxqueue = fb->GetTransferCommands();

PipelineBarrier barrier2;
barrier2.addImage(mImage.Image.get(), VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_TRANSFER_WRITE_BIT);
barrier2.addQueueTransfer(fb->device->copyQueueTransferFamily, fb->device->graphicsFamily, mTransferImage.get(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
barrier2.execute(gfxqueue, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT);

VkImageCopy imgregion = {};
imgregion.extent = region.imageExtent;
imgregion.srcSubresource = region.imageSubresource;
imgregion.dstSubresource = region.imageSubresource;
gfxqueue->copyImage(mTransferImage->image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, mImage.Image->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &imgregion);

PipelineBarrier barrier3;
barrier3.addImage(mImage.Image.get(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, mImage.Layout, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
barrier3.addQueueTransfer(fb->device->graphicsFamily, fb->device->copyQueueTransferFamily, mTransferImage.get(), VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
barrier3.execute(gfxqueue, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
}

return 0;
}

@@ -78,5 +78,8 @@ class VkHardwareTexture : public IHardwareTexture

VkTextureImage mDepthStencil;

std::unique_ptr<VulkanBuffer> mTransferBuffer;
std::unique_ptr<VulkanImage> mTransferImage;

uint8_t* mappedSWFB = nullptr;
};

0 comments on commit 7084272

Please sign in to comment.
You can’t perform that action at this time.