Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Merge pull request #11208 from TellowKrinkle/CPUCull
Cull vertices on the CPU
  • Loading branch information
delroth committed Jan 26, 2023
2 parents 09a8d95 + 7413be1 commit 9c9310b
Show file tree
Hide file tree
Showing 21 changed files with 1,101 additions and 91 deletions.
Expand Up @@ -214,6 +214,7 @@ public enum BooleanSetting implements AbstractBooleanSetting
"SaveTextureCacheToState", true),
GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION(Settings.FILE_GFX, Settings.SECTION_GFX_SETTINGS,
"PreferVSForLinePointExpansion", false),
GFX_CPU_CULL(Settings.FILE_GFX, Settings.SECTION_GFX_SETTINGS, "CPUCull", false),
GFX_MODS_ENABLE(Settings.FILE_GFX, Settings.SECTION_GFX_SETTINGS, "EnableMods", false),

GFX_ENHANCE_FORCE_FILTERING(Settings.FILE_GFX, Settings.SECTION_GFX_ENHANCEMENTS,
Expand Down
Expand Up @@ -938,6 +938,8 @@ private void addAdvancedGraphicsSettings(ArrayList<SettingsItem> sl)
sl.add(new SwitchSetting(mContext, BooleanSetting.GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION,
R.string.prefer_vs_for_point_line_expansion,
R.string.prefer_vs_for_point_line_expansion_description));
sl.add(new SwitchSetting(mContext, BooleanSetting.GFX_CPU_CULL, R.string.cpu_cull,
R.string.cpu_cull_description));
sl.add(new SwitchSetting(mContext, BooleanSetting.GFX_HACK_EFB_DEFER_INVALIDATION,
R.string.defer_efb_invalidation, R.string.defer_efb_invalidation_description));
sl.add(new InvertedSwitchSetting(mContext, BooleanSetting.GFX_HACK_FAST_TEXTURE_SAMPLING,
Expand Down
2 changes: 2 additions & 0 deletions Source/Android/app/src/main/res/values/strings.xml
Expand Up @@ -361,6 +361,8 @@
<string name="backend_multithreading_description">Enables graphics backend multithreading (Vulkan only). May affect performance. If unsure, leave this checked.</string>
<string name="prefer_vs_for_point_line_expansion">Prefer VS for Point/Line Expansion</string>
<string name="prefer_vs_for_point_line_expansion_description">On backends that support both using the geometry shader and the vertex shader for expanding points and lines, selects the vertex shader for the job. May affect performance.</string>
<string name="cpu_cull">Cull Vertices on the CPU</string>
<string name="cpu_cull_description">Cull vertices on the CPU to reduce the number of draw calls required. May affect performance. If unsure, leave this unchecked.</string>
<string name="defer_efb_invalidation">Defer EFB Cache Invalidation</string>
<string name="defer_efb_invalidation_description">Defers invalidation of the EFB access cache until a GPU synchronization command is executed. May improve performance in some games at the cost of stability. If unsure, leave this unchecked.</string>
<string name="manual_texture_sampling">Manual Texture Sampling</string>
Expand Down
1 change: 1 addition & 0 deletions Source/Core/Core/Config/GraphicsSettings.cpp
Expand Up @@ -93,6 +93,7 @@ const Info<bool> GFX_SAVE_TEXTURE_CACHE_TO_STATE{
{System::GFX, "Settings", "SaveTextureCacheToState"}, true};
const Info<bool> GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION{
{System::GFX, "Settings", "PreferVSForLinePointExpansion"}, false};
const Info<bool> GFX_CPU_CULL{{System::GFX, "Settings", "CPUCull"}, false};

const Info<TriState> GFX_MTL_MANUALLY_UPLOAD_BUFFERS{
{System::GFX, "Settings", "ManuallyUploadBuffers"}, TriState::Auto};
Expand Down
1 change: 1 addition & 0 deletions Source/Core/Core/Config/GraphicsSettings.h
Expand Up @@ -82,6 +82,7 @@ extern const Info<int> GFX_SHADER_COMPILER_THREADS;
extern const Info<int> GFX_SHADER_PRECOMPILER_THREADS;
extern const Info<bool> GFX_SAVE_TEXTURE_CACHE_TO_STATE;
extern const Info<bool> GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION;
extern const Info<bool> GFX_CPU_CULL;

extern const Info<TriState> GFX_MTL_MANUALLY_UPLOAD_BUFFERS;
extern const Info<bool> GFX_MTL_USE_PRESENT_DRAWABLE;
Expand Down
3 changes: 3 additions & 0 deletions Source/Core/DolphinLib.props
Expand Up @@ -632,6 +632,8 @@
<ClInclude Include="VideoCommon\CommandProcessor.h" />
<ClInclude Include="VideoCommon\ConstantManager.h" />
<ClInclude Include="VideoCommon\CPMemory.h" />
<ClInclude Include="VideoCommon\CPUCull.h" />
<ClInclude Include="VideoCommon\CPUCullImpl.h" />
<ClInclude Include="VideoCommon\DataReader.h" />
<ClInclude Include="VideoCommon\DriverDetails.h" />
<ClInclude Include="VideoCommon\Fifo.h" />
Expand Down Expand Up @@ -1226,6 +1228,7 @@
<ClCompile Include="VideoCommon\BPStructs.cpp" />
<ClCompile Include="VideoCommon\CommandProcessor.cpp" />
<ClCompile Include="VideoCommon\CPMemory.cpp" />
<ClCompile Include="VideoCommon\CPUCull.cpp" />
<ClCompile Include="VideoCommon\DriverDetails.cpp" />
<ClCompile Include="VideoCommon\Fifo.cpp" />
<ClCompile Include="VideoCommon\FramebufferManager.cpp" />
Expand Down
9 changes: 8 additions & 1 deletion Source/Core/DolphinQt/Config/Graphics/AdvancedWidget.cpp
Expand Up @@ -159,16 +159,18 @@ void AdvancedWidget::CreateWidgets()
m_prefer_vs_for_point_line_expansion = new GraphicsBool(
// i18n: VS is short for vertex shaders.
tr("Prefer VS for Point/Line Expansion"), Config::GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION);
m_cpu_cull = new GraphicsBool(tr("Cull Vertices on the CPU"), Config::GFX_CPU_CULL);

misc_layout->addWidget(m_enable_cropping, 0, 0);
misc_layout->addWidget(m_enable_prog_scan, 0, 1);
misc_layout->addWidget(m_backend_multithreading, 1, 0);
misc_layout->addWidget(m_prefer_vs_for_point_line_expansion, 1, 1);
misc_layout->addWidget(m_cpu_cull, 2, 0);
#ifdef _WIN32
m_borderless_fullscreen =
new GraphicsBool(tr("Borderless Fullscreen"), Config::GFX_BORDERLESS_FULLSCREEN);

misc_layout->addWidget(m_borderless_fullscreen, 2, 0);
misc_layout->addWidget(m_borderless_fullscreen, 2, 1);
#endif

// Experimental.
Expand Down Expand Up @@ -369,6 +371,10 @@ void AdvancedWidget::AddDescriptions()
"for expanding points and lines, selects the vertex shader for the job. May "
"affect performance."
"<br><br>%1");
static const char TR_CPU_CULL_DESCRIPTION[] =
QT_TR_NOOP("Cull vertices on the CPU to reduce the number of draw calls required. "
"May affect performance and draw statistics.<br><br>"
"<dolphin_emphasis>If unsure, leave this unchecked.</dolphin_emphasis>");
static const char TR_DEFER_EFB_ACCESS_INVALIDATION_DESCRIPTION[] = QT_TR_NOOP(
"Defers invalidation of the EFB access cache until a GPU synchronization command "
"is executed. If disabled, the cache will be invalidated with every draw call. "
Expand Down Expand Up @@ -441,6 +447,7 @@ void AdvancedWidget::AddDescriptions()
vsexpand_extra = tr(IF_UNSURE_UNCHECKED);
m_prefer_vs_for_point_line_expansion->SetDescription(
tr(TR_PREFER_VS_FOR_POINT_LINE_EXPANSION_DESCRIPTION).arg(vsexpand_extra));
m_cpu_cull->SetDescription(tr(TR_CPU_CULL_DESCRIPTION));
#ifdef _WIN32
m_borderless_fullscreen->SetDescription(tr(TR_BORDERLESS_FULLSCREEN_DESCRIPTION));
#endif
Expand Down
1 change: 1 addition & 0 deletions Source/Core/DolphinQt/Config/Graphics/AdvancedWidget.h
Expand Up @@ -69,6 +69,7 @@ class AdvancedWidget final : public GraphicsWidget
ToolTipCheckBox* m_enable_prog_scan;
GraphicsBool* m_backend_multithreading;
GraphicsBool* m_prefer_vs_for_point_line_expansion;
GraphicsBool* m_cpu_cull;
GraphicsBool* m_borderless_fullscreen;

// Experimental
Expand Down
3 changes: 3 additions & 0 deletions Source/Core/VideoCommon/CMakeLists.txt
Expand Up @@ -23,6 +23,9 @@ add_library(videocommon
ConstantManager.h
CPMemory.cpp
CPMemory.h
CPUCull.cpp
CPUCull.h
CPUCullImpl.h
DriverDetails.cpp
DriverDetails.h
Fifo.cpp
Expand Down
174 changes: 174 additions & 0 deletions Source/Core/VideoCommon/CPUCull.cpp
@@ -0,0 +1,174 @@
// Copyright 2022 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#include "VideoCommon/CPUCull.h"

#include "Common/Assert.h"
#include "Common/CPUDetect.h"
#include "Common/MathUtil.h"
#include "Common/MemoryUtil.h"
#include "Core/System.h"

#include "VideoCommon/CPMemory.h"
#include "VideoCommon/VertexManagerBase.h"
#include "VideoCommon/VertexShaderManager.h"
#include "VideoCommon/VideoConfig.h"
#include "VideoCommon/XFMemory.h"

// We really want things like c.w * a.x - a.w * c.x to stay symmetric, so they cancel to zero on
// degenerate triangles. Make sure the compiler doesn't optimize in fmas where not requested.
#ifdef _MSC_VER
#pragma fp_contract(off)
#else
// GCC doesn't support any in-file way to turn off fp contract yet
// Not ideal, but worst case scenario its cpu cull is worse at detecting degenerate triangles
// (Most likely to happen on arm, as we don't compile the cull code for x86 fma)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
#pragma STDC FP_CONTRACT OFF
#pragma GCC diagnostic pop
#endif

#if defined(_M_X86) || defined(_M_X86_64)
#define USE_SSE
#elif defined(_M_ARM_64)
#define USE_NEON
#else
#define NO_SIMD
#endif

#if defined(USE_SSE)
#include <immintrin.h>
#elif defined(USE_NEON)
#include <arm_neon.h>
#endif

#include "VideoCommon/CPUCullImpl.h"
#ifdef USE_SSE
#define USE_SSE3
#include "VideoCommon/CPUCullImpl.h"
#define USE_SSE41
#include "VideoCommon/CPUCullImpl.h"
#define USE_AVX
#include "VideoCommon/CPUCullImpl.h"
#define USE_FMA
#include "VideoCommon/CPUCullImpl.h"
#endif

#if defined(USE_SSE)
#if defined(__AVX__) && defined(__FMA__)
static constexpr int MIN_SSE = 51;
#elif defined(__AVX__)
static constexpr int MIN_SSE = 50;
#elif defined(__SSE4_1__)
static constexpr int MIN_SSE = 41;
#elif defined(__SSE3__)
static constexpr int MIN_SSE = 30;
#else
static constexpr int MIN_SSE = 0;
#endif
#endif

template <bool PositionHas3Elems, bool PerVertexPosMtx>
static CPUCull::TransformFunction GetTransformFunction()
{
#if defined(USE_SSE)
if (MIN_SSE >= 51 || (cpu_info.bAVX && cpu_info.bFMA))
return CPUCull_FMA::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
else if (MIN_SSE >= 50 || cpu_info.bAVX)
return CPUCull_AVX::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
else if (PositionHas3Elems && PerVertexPosMtx && (MIN_SSE >= 41 || cpu_info.bSSE4_1))
return CPUCull_SSE41::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
else if (PositionHas3Elems && (MIN_SSE >= 30 || cpu_info.bSSE3))
return CPUCull_SSE3::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
else
return CPUCull_SSE::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
#elif defined(USE_NEON)
return CPUCull_NEON::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
#else
return CPUCull_Scalar::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
#endif
}

template <OpcodeDecoder::Primitive Primitive, CullMode Mode>
static CPUCull::CullFunction GetCullFunction0()
{
#if defined(USE_SSE)
// Note: AVX version only actually AVX on compilers that support __attribute__((target))
// Sorry, MSVC + Sandy Bridge. (Ivy+ and AMD see very little benefit thanks to mov elimination)
if (MIN_SSE >= 50 || cpu_info.bAVX)
return CPUCull_AVX::AreAllVerticesCulled<Primitive, Mode>;
else if (MIN_SSE >= 30 || cpu_info.bSSE3)
return CPUCull_SSE3::AreAllVerticesCulled<Primitive, Mode>;
else
return CPUCull_SSE::AreAllVerticesCulled<Primitive, Mode>;
#elif defined(USE_NEON)
return CPUCull_NEON::AreAllVerticesCulled<Primitive, Mode>;
#else
return CPUCull_Scalar::AreAllVerticesCulled<Primitive, Mode>;
#endif
}

template <OpcodeDecoder::Primitive Primitive>
static Common::EnumMap<CPUCull::CullFunction, CullMode::All> GetCullFunction1()
{
return {
GetCullFunction0<Primitive, CullMode::None>(),
GetCullFunction0<Primitive, CullMode::Back>(),
GetCullFunction0<Primitive, CullMode::Front>(),
GetCullFunction0<Primitive, CullMode::All>(),
};
}

CPUCull::~CPUCull() = default;

void CPUCull::Init()
{
m_transform_table[false][false] = GetTransformFunction<false, false>();
m_transform_table[false][true] = GetTransformFunction<false, true>();
m_transform_table[true][false] = GetTransformFunction<true, false>();
m_transform_table[true][true] = GetTransformFunction<true, true>();
using Prim = OpcodeDecoder::Primitive;
m_cull_table[Prim::GX_DRAW_QUADS] = GetCullFunction1<Prim::GX_DRAW_QUADS>();
m_cull_table[Prim::GX_DRAW_QUADS_2] = GetCullFunction1<Prim::GX_DRAW_QUADS>();
m_cull_table[Prim::GX_DRAW_TRIANGLES] = GetCullFunction1<Prim::GX_DRAW_TRIANGLES>();
m_cull_table[Prim::GX_DRAW_TRIANGLE_STRIP] = GetCullFunction1<Prim::GX_DRAW_TRIANGLE_STRIP>();
m_cull_table[Prim::GX_DRAW_TRIANGLE_FAN] = GetCullFunction1<Prim::GX_DRAW_TRIANGLE_FAN>();
}

bool CPUCull::AreAllVerticesCulled(VertexLoaderBase* loader, OpcodeDecoder::Primitive primitive,
const u8* src, u32 count)
{
ASSERT_MSG(VIDEO, primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES,
"CPUCull should not be called on lines or points");
const u32 stride = loader->m_native_vtx_decl.stride;
const bool posHas3Elems = loader->m_native_vtx_decl.position.components >= 3;
const bool perVertexPosMtx = loader->m_native_vtx_decl.posmtx.enable;
if (m_transform_buffer_size < count) [[unlikely]]
{
u32 new_size = MathUtil::NextPowerOf2(count);
m_transform_buffer_size = new_size;
m_transform_buffer.reset(static_cast<TransformedVertex*>(
Common::AllocateAlignedMemory(new_size * sizeof(TransformedVertex), 32)));
}

// transform functions need the projection matrix to tranform to clip space
Core::System::GetInstance().GetVertexShaderManager().SetProjectionMatrix();

static constexpr Common::EnumMap<CullMode, CullMode::All> cullmode_invert = {
CullMode::None, CullMode::Front, CullMode::Back, CullMode::All};

CullMode cullmode = bpmem.genMode.cullmode;
if (xfmem.viewport.ht > 0) // See videosoftware Clipper.cpp:IsBackface
cullmode = cullmode_invert[cullmode];
const TransformFunction transform = m_transform_table[posHas3Elems][perVertexPosMtx];
transform(m_transform_buffer.get(), src, stride, count);
const CullFunction cull = m_cull_table[primitive][cullmode];
return cull(m_transform_buffer.get(), count);
}

template <typename T>
void CPUCull::BufferDeleter<T>::operator()(T* ptr)
{
Common::FreeAlignedMemory(ptr);
}
38 changes: 38 additions & 0 deletions Source/Core/VideoCommon/CPUCull.h
@@ -0,0 +1,38 @@
// Copyright 2022 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#pragma once

#include "VideoCommon/BPMemory.h"
#include "VideoCommon/DataReader.h"
#include "VideoCommon/OpcodeDecoding.h"

class CPUCull
{
public:
~CPUCull();
void Init();
bool AreAllVerticesCulled(VertexLoaderBase* loader, OpcodeDecoder::Primitive primitive,
const u8* src, u32 count);

struct alignas(16) TransformedVertex
{
float x, y, z, w;
};

using TransformFunction = void (*)(void*, const void*, u32, int);
using CullFunction = bool (*)(const CPUCull::TransformedVertex*, int);

private:
template <typename T>
struct BufferDeleter
{
void operator()(T* ptr);
};
std::unique_ptr<TransformedVertex[], BufferDeleter<TransformedVertex>> m_transform_buffer;
u32 m_transform_buffer_size = 0;
std::array<std::array<TransformFunction, 2>, 2> m_transform_table;
Common::EnumMap<Common::EnumMap<CullFunction, CullMode::All>,
OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_FAN>
m_cull_table;
};

0 comments on commit 9c9310b

Please sign in to comment.