25 changes: 23 additions & 2 deletions Source/Core/VideoBackends/Metal/MTLTexture.mm
Expand Up @@ -6,6 +6,7 @@
#include "Common/Align.h"
#include "Common/Assert.h"

#include "VideoBackends/Metal/MTLRenderer.h"
#include "VideoBackends/Metal/MTLStateTracker.h"

Metal::Texture::Texture(MRCOwned<id<MTLTexture>> tex, const TextureConfig& config)
Expand Down Expand Up @@ -50,6 +51,10 @@
g_state_tracker->ResolveTexture(src_tex, m_tex, layer, level);
}

// Texture uploads larger than this many bytes (4 MiB) are staged through a one-off temporary
// buffer instead of the main upload buffer, because the main upload buffer doesn't shrink
// again after it has grown.
static constexpr u32 STAGING_TEXTURE_UPLOAD_THRESHOLD = 4 * 1024 * 1024;

void Metal::Texture::Load(u32 level, u32 width, u32 height, u32 row_length, //
const u8* buffer, size_t buffer_size)
{
Expand All @@ -59,8 +64,23 @@
const u32 num_rows = Common::AlignUp(height, block_size) / block_size;
const u32 source_pitch = CalculateStrideForFormat(m_config.format, row_length);
const u32 upload_size = source_pitch * num_rows;
StateTracker::Map map = g_state_tracker->Allocate(StateTracker::UploadBuffer::TextureData,
upload_size, StateTracker::AlignMask::Other);
MRCOwned<id<MTLBuffer>> tmp_buffer;
StateTracker::Map map;
if (upload_size > STAGING_TEXTURE_UPLOAD_THRESHOLD)
{
tmp_buffer = MRCTransfer([g_device
newBufferWithLength:upload_size
options:MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined]);
[tmp_buffer setLabel:@"Temp Texture Upload"];
map.gpu_buffer = tmp_buffer;
map.gpu_offset = 0;
map.cpu_buffer = [tmp_buffer contents];
}
else
{
map = g_state_tracker->AllocateForTextureUpload(upload_size);
}

memcpy(map.cpu_buffer, buffer, upload_size);
id<MTLBlitCommandEncoder> encoder = g_state_tracker->GetTextureUploadEncoder();
[encoder copyFromBuffer:map.gpu_buffer
Expand Down Expand Up @@ -163,6 +183,7 @@
{
// Flush while we wait, since who knows how long we'll be sitting here
g_state_tracker->FlushEncoders();
g_state_tracker->NotifyOfCPUGPUSync();
[m_wait_buffer waitUntilCompleted];
}
m_wait_buffer = nullptr;
Expand Down
4 changes: 4 additions & 0 deletions Source/Core/VideoBackends/Metal/MTLUtil.h
Expand Up @@ -16,6 +16,10 @@ namespace Metal
{
/// Feature toggles describing how the Metal backend should drive the current device.
struct DeviceFeatures
{
/// Manually copy buffer data to the GPU (instead of letting the GPU read from system memory)
/// On discrete GPUs, this tends to be faster if the copy is able to operate in parallel with a
/// previous render. This is the case unless a game uses features like bbox or texture downloads.
bool manual_buffer_upload;
/// NOTE(review): presumably gates the use of shader subgroup operations; the Metal setup code
/// clears it when DriverDetails reports BUG_BROKEN_SUBGROUP_OPS -- confirm against the users of
/// this flag.
bool subgroup_ops;
};

Expand Down
55 changes: 51 additions & 4 deletions Source/Core/VideoBackends/Metal/MTLUtil.mm
Expand Up @@ -217,6 +217,27 @@ fragment float4 fbfetch_test(float4 in [[color(0), raster_order_group(0)]]) {
config->backend_info.AAModes.push_back(i);
}

// Turn the tri-state "manually upload buffers" setting into a concrete on/off feature flag.
switch (config->iManuallyUploadBuffers)
{
case TriState::On:
  g_features.manual_buffer_upload = true;
  break;
case TriState::Off:
  g_features.manual_buffer_upload = false;
  break;
case TriState::Auto:
{
  // Only opt in automatically when the device reports that it does not have unified memory.
  bool lacks_unified_memory = false;
#if TARGET_OS_OSX
  // hasUnifiedMemory is only queried on macOS 10.15 and newer; older systems keep the default.
  if (@available(macOS 10.15, *))
    lacks_unified_memory = ![device hasUnifiedMemory];
#endif
  // All iOS devices have unified memory, so non-macOS builds always end up with `false` here.
  g_features.manual_buffer_upload = lacks_unified_memory;
  break;
}
}

g_features.subgroup_ops = false;
if (@available(macOS 10.15, iOS 13, *))
{
Expand All @@ -225,7 +246,7 @@ fragment float4 fbfetch_test(float4 in [[color(0), raster_order_group(0)]]) {
[device supportsFamily:MTLGPUFamilyMac2] || [device supportsFamily:MTLGPUFamilyApple6];
config->backend_info.bSupportsFramebufferFetch = [device supportsFamily:MTLGPUFamilyApple1];
}
if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_INVOCATION_ID))
if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_OPS))
g_features.subgroup_ops = false;
#if TARGET_OS_OSX
if (@available(macOS 11, *))
Expand Down Expand Up @@ -378,6 +399,12 @@ fragment float4 fbfetch_test(float4 in [[color(0), raster_order_group(0)]]) {
// These are usually when the compiler doesn't think a switch is exhaustive
"#pragma clang diagnostic ignored \"-Wreturn-type\"\n";

// Textual search-and-replace rules applied to the compiled MSL source before it is returned.
// Each entry is {text to find, text to substitute for it}.
// The text to find must not be empty: an empty string matches at offset 0 without consuming any
// input, so the loop that applies these rules would never advance.
static constexpr std::pair<std::string_view, std::string_view> MSL_FIXUPS[] = {
// Force-unroll the lighting loop in ubershaders, which greatly reduces register pressure on AMD
{"for (uint chan = 0u; chan < 2u; chan++)",
"_Pragma(\"unroll\") for (uint chan = 0u; chan < 2u; chan++)"},
};

static constexpr spirv_cross::MSLResourceBinding
MakeResourceBinding(spv::ExecutionModel stage, u32 set, u32 binding, //
u32 msl_buffer, u32 msl_texture, u32 msl_sampler)
Expand Down Expand Up @@ -474,7 +501,27 @@ fragment float4 fbfetch_test(float4 in [[color(0), raster_order_group(0)]]) {
for (auto& binding : resource_bindings)
compiler.add_msl_resource_binding(binding);

std::string msl(MSL_HEADER);
msl += compiler.compile();
return msl;
// Build the final shader text: the fixed MSL_HEADER followed by the compiled source with every
// MSL_FIXUPS rule applied.
std::string output(MSL_HEADER);
std::string compiled = compiler.compile();
// Unprocessed tail of `compiled`. Each iteration consumes either up to and including one match,
// or (when nothing matches) everything that is left.
std::string_view remaining = compiled;
while (!remaining.empty())
{
// Apply fixups
// `piece` is the text copied verbatim this round; it starts out as all remaining text.
std::string_view piece = remaining;
// Replacement text emitted right after `piece`; stays empty when no rule matched.
std::string_view fixup_piece = {};
// Offset into `remaining` at which the next round resumes; defaults to the end (no match).
size_t next = piece.size();
for (const auto& fixup : MSL_FIXUPS)
{
size_t found = piece.find(fixup.first);
if (found == std::string_view::npos)
continue;
// Keep only the text in front of the match. Later table entries are then searched in that
// shorter prefix, so they can only replace this choice by matching entirely before it.
piece = piece.substr(0, found);
fixup_piece = fixup.second;
// Resume after the matched search text so it is dropped from the output.
next = found + fixup.first.size();
}
output += piece;
output += fixup_piece;
remaining = remaining.substr(next);
}
return output;
}
2 changes: 1 addition & 1 deletion Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
Expand Up @@ -918,7 +918,7 @@ void VulkanContext::PopulateShaderSubgroupSupport()
m_supports_shader_subgroup_operations =
(subgroup_properties.supportedOperations & required_operations) == required_operations &&
subgroup_properties.supportedStages & VK_SHADER_STAGE_FRAGMENT_BIT &&
!DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_INVOCATION_ID);
!DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_OPS);
}

bool VulkanContext::SupportsExclusiveFullscreen(const WindowSystemInfo& wsi, VkSurfaceKHR surface)
Expand Down
12 changes: 8 additions & 4 deletions Source/Core/VideoCommon/DriverDetails.cpp
Expand Up @@ -132,10 +132,14 @@ constexpr BugInfo m_known_bugs[] = {
-1.0, -1.0, true},
{API_VULKAN, OS_ALL, VENDOR_ARM, DRIVER_ARM, Family::UNKNOWN, BUG_BROKEN_VECTOR_BITWISE_AND,
-1.0, -1.0, true},
{API_VULKAN, OS_OSX, VENDOR_ATI, DRIVER_PORTABILITY, Family::UNKNOWN,
BUG_BROKEN_SUBGROUP_INVOCATION_ID, -1.0, -1.0, true},
{API_METAL, OS_OSX, VENDOR_ATI, DRIVER_APPLE, Family::UNKNOWN,
BUG_BROKEN_SUBGROUP_INVOCATION_ID, -1.0, -1.0, true},
{API_VULKAN, OS_OSX, VENDOR_ATI, DRIVER_PORTABILITY, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS,
-1.0, -1.0, true},
{API_VULKAN, OS_OSX, VENDOR_INTEL, DRIVER_PORTABILITY, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS,
-1.0, -1.0, true},
{API_METAL, OS_OSX, VENDOR_ATI, DRIVER_APPLE, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS, -1.0,
-1.0, true},
{API_METAL, OS_OSX, VENDOR_INTEL, DRIVER_APPLE, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS, -1.0,
-1.0, true},
{API_OPENGL, OS_ANDROID, VENDOR_ALL, DRIVER_ALL, Family::UNKNOWN,
BUG_BROKEN_MULTITHREADED_SHADER_PRECOMPILATION, -1.0, -1.0, true},
{API_VULKAN, OS_ANDROID, VENDOR_ALL, DRIVER_ALL, Family::UNKNOWN,
Expand Down
9 changes: 7 additions & 2 deletions Source/Core/VideoCommon/DriverDetails.h
Expand Up @@ -306,10 +306,15 @@ enum Bug
BUG_BROKEN_VECTOR_BITWISE_AND,

// BUG: Accessing gl_SubgroupInvocationID causes the Metal shader compiler to crash.
// Affected devices: AMD (macOS)
// Affected devices: AMD (older macOS)
// BUG: gl_HelperInvocation always returns true, even for non-helper invocations
// Affected devices: AMD (newer macOS)
// BUG: Using subgroupMax in a shader that can discard results in garbage data
// (For some reason, this only happens at 4x+ IR on Metal, but 2x+ IR on MoltenVK)
// Affected devices: Intel (macOS)
// Started version: -1
// Ended version: -1
BUG_BROKEN_SUBGROUP_INVOCATION_ID,
BUG_BROKEN_SUBGROUP_OPS,

// BUG: Multi-threaded shader pre-compilation sometimes crashes
// Used primarily in Videoconfig.cpp's GetNumAutoShaderPreCompilerThreads()
Expand Down
2 changes: 2 additions & 0 deletions Source/Core/VideoCommon/VideoConfig.cpp
Expand Up @@ -55,6 +55,8 @@ void VideoConfig::Refresh()

bVSync = Config::Get(Config::GFX_VSYNC);
iAdapter = Config::Get(Config::GFX_ADAPTER);
iManuallyUploadBuffers = Config::Get(Config::GFX_MTL_MANUALLY_UPLOAD_BUFFERS);
bUsePresentDrawable = Config::Get(Config::GFX_MTL_USE_PRESENT_DRAWABLE);

bWidescreenHack = Config::Get(Config::GFX_WIDESCREEN_HACK);
aspect_mode = Config::Get(Config::GFX_ASPECT_RATIO);
Expand Down
11 changes: 11 additions & 0 deletions Source/Core/VideoCommon/VideoConfig.h
Expand Up @@ -45,6 +45,13 @@ enum class ShaderCompilationMode : int
AsynchronousSkipRendering
};

// Three-way setting: explicitly disabled, explicitly enabled, or left for the code that consumes
// the setting to decide. The numeric values are spelled out so they stay fixed.
enum class TriState : int
{
  Off = 0,   // Force the feature off
  On = 1,    // Force the feature on
  Auto = 2,  // Let the consumer choose
};

// NEVER inherit from this class.
struct VideoConfig final
{
Expand Down Expand Up @@ -149,6 +156,10 @@ struct VideoConfig final
// D3D only config, mostly to be merged into the above
int iAdapter = 0;

// Metal only config
TriState iManuallyUploadBuffers = TriState::Auto;
bool bUsePresentDrawable = false;

// Enable API validation layers, currently only supported with Vulkan.
bool bEnableValidationLayer = false;

Expand Down