Permalink
Show file tree
Hide file tree
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Merge pull request #11208 from TellowKrinkle/CPUCull
Cull vertices on the CPU
- Loading branch information
Showing
21 changed files
with
1,101 additions
and
91 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,174 @@ | ||
| // Copyright 2022 Dolphin Emulator Project | ||
| // SPDX-License-Identifier: GPL-2.0-or-later | ||
|
|
||
| #include "VideoCommon/CPUCull.h" | ||
|
|
||
| #include "Common/Assert.h" | ||
| #include "Common/CPUDetect.h" | ||
| #include "Common/MathUtil.h" | ||
| #include "Common/MemoryUtil.h" | ||
| #include "Core/System.h" | ||
|
|
||
| #include "VideoCommon/CPMemory.h" | ||
| #include "VideoCommon/VertexManagerBase.h" | ||
| #include "VideoCommon/VertexShaderManager.h" | ||
| #include "VideoCommon/VideoConfig.h" | ||
| #include "VideoCommon/XFMemory.h" | ||
|
|
||
| // We really want things like c.w * a.x - a.w * c.x to stay symmetric, so they cancel to zero on | ||
| // degenerate triangles. Make sure the compiler doesn't optimize in fmas where not requested. | ||
| #ifdef _MSC_VER | ||
| #pragma fp_contract(off) | ||
| #else | ||
| // GCC doesn't support any in-file way to turn off fp contract yet | ||
| // Not ideal, but worst case scenario its cpu cull is worse at detecting degenerate triangles | ||
| // (Most likely to happen on arm, as we don't compile the cull code for x86 fma) | ||
| #pragma GCC diagnostic push | ||
| #pragma GCC diagnostic ignored "-Wunknown-pragmas" | ||
| #pragma STDC FP_CONTRACT OFF | ||
| #pragma GCC diagnostic pop | ||
| #endif | ||
|
|
||
| #if defined(_M_X86) || defined(_M_X86_64) | ||
| #define USE_SSE | ||
| #elif defined(_M_ARM_64) | ||
| #define USE_NEON | ||
| #else | ||
| #define NO_SIMD | ||
| #endif | ||
|
|
||
| #if defined(USE_SSE) | ||
| #include <immintrin.h> | ||
| #elif defined(USE_NEON) | ||
| #include <arm_neon.h> | ||
| #endif | ||
|
|
||
| #include "VideoCommon/CPUCullImpl.h" | ||
| #ifdef USE_SSE | ||
| #define USE_SSE3 | ||
| #include "VideoCommon/CPUCullImpl.h" | ||
| #define USE_SSE41 | ||
| #include "VideoCommon/CPUCullImpl.h" | ||
| #define USE_AVX | ||
| #include "VideoCommon/CPUCullImpl.h" | ||
| #define USE_FMA | ||
| #include "VideoCommon/CPUCullImpl.h" | ||
| #endif | ||
|
|
||
| #if defined(USE_SSE) | ||
| #if defined(__AVX__) && defined(__FMA__) | ||
| static constexpr int MIN_SSE = 51; | ||
| #elif defined(__AVX__) | ||
| static constexpr int MIN_SSE = 50; | ||
| #elif defined(__SSE4_1__) | ||
| static constexpr int MIN_SSE = 41; | ||
| #elif defined(__SSE3__) | ||
| static constexpr int MIN_SSE = 30; | ||
| #else | ||
| static constexpr int MIN_SSE = 0; | ||
| #endif | ||
| #endif | ||
|
|
||
| template <bool PositionHas3Elems, bool PerVertexPosMtx> | ||
| static CPUCull::TransformFunction GetTransformFunction() | ||
| { | ||
| #if defined(USE_SSE) | ||
| if (MIN_SSE >= 51 || (cpu_info.bAVX && cpu_info.bFMA)) | ||
| return CPUCull_FMA::TransformVertices<PositionHas3Elems, PerVertexPosMtx>; | ||
| else if (MIN_SSE >= 50 || cpu_info.bAVX) | ||
| return CPUCull_AVX::TransformVertices<PositionHas3Elems, PerVertexPosMtx>; | ||
| else if (PositionHas3Elems && PerVertexPosMtx && (MIN_SSE >= 41 || cpu_info.bSSE4_1)) | ||
| return CPUCull_SSE41::TransformVertices<PositionHas3Elems, PerVertexPosMtx>; | ||
| else if (PositionHas3Elems && (MIN_SSE >= 30 || cpu_info.bSSE3)) | ||
| return CPUCull_SSE3::TransformVertices<PositionHas3Elems, PerVertexPosMtx>; | ||
| else | ||
| return CPUCull_SSE::TransformVertices<PositionHas3Elems, PerVertexPosMtx>; | ||
| #elif defined(USE_NEON) | ||
| return CPUCull_NEON::TransformVertices<PositionHas3Elems, PerVertexPosMtx>; | ||
| #else | ||
| return CPUCull_Scalar::TransformVertices<PositionHas3Elems, PerVertexPosMtx>; | ||
| #endif | ||
| } | ||
|
|
||
| template <OpcodeDecoder::Primitive Primitive, CullMode Mode> | ||
| static CPUCull::CullFunction GetCullFunction0() | ||
| { | ||
| #if defined(USE_SSE) | ||
| // Note: AVX version only actually AVX on compilers that support __attribute__((target)) | ||
| // Sorry, MSVC + Sandy Bridge. (Ivy+ and AMD see very little benefit thanks to mov elimination) | ||
| if (MIN_SSE >= 50 || cpu_info.bAVX) | ||
| return CPUCull_AVX::AreAllVerticesCulled<Primitive, Mode>; | ||
| else if (MIN_SSE >= 30 || cpu_info.bSSE3) | ||
| return CPUCull_SSE3::AreAllVerticesCulled<Primitive, Mode>; | ||
| else | ||
| return CPUCull_SSE::AreAllVerticesCulled<Primitive, Mode>; | ||
| #elif defined(USE_NEON) | ||
| return CPUCull_NEON::AreAllVerticesCulled<Primitive, Mode>; | ||
| #else | ||
| return CPUCull_Scalar::AreAllVerticesCulled<Primitive, Mode>; | ||
| #endif | ||
| } | ||
|
|
||
| template <OpcodeDecoder::Primitive Primitive> | ||
| static Common::EnumMap<CPUCull::CullFunction, CullMode::All> GetCullFunction1() | ||
| { | ||
| return { | ||
| GetCullFunction0<Primitive, CullMode::None>(), | ||
| GetCullFunction0<Primitive, CullMode::Back>(), | ||
| GetCullFunction0<Primitive, CullMode::Front>(), | ||
| GetCullFunction0<Primitive, CullMode::All>(), | ||
| }; | ||
| } | ||
|
|
||
| CPUCull::~CPUCull() = default; | ||
|
|
||
| void CPUCull::Init() | ||
| { | ||
| m_transform_table[false][false] = GetTransformFunction<false, false>(); | ||
| m_transform_table[false][true] = GetTransformFunction<false, true>(); | ||
| m_transform_table[true][false] = GetTransformFunction<true, false>(); | ||
| m_transform_table[true][true] = GetTransformFunction<true, true>(); | ||
| using Prim = OpcodeDecoder::Primitive; | ||
| m_cull_table[Prim::GX_DRAW_QUADS] = GetCullFunction1<Prim::GX_DRAW_QUADS>(); | ||
| m_cull_table[Prim::GX_DRAW_QUADS_2] = GetCullFunction1<Prim::GX_DRAW_QUADS>(); | ||
| m_cull_table[Prim::GX_DRAW_TRIANGLES] = GetCullFunction1<Prim::GX_DRAW_TRIANGLES>(); | ||
| m_cull_table[Prim::GX_DRAW_TRIANGLE_STRIP] = GetCullFunction1<Prim::GX_DRAW_TRIANGLE_STRIP>(); | ||
| m_cull_table[Prim::GX_DRAW_TRIANGLE_FAN] = GetCullFunction1<Prim::GX_DRAW_TRIANGLE_FAN>(); | ||
| } | ||
|
|
||
| bool CPUCull::AreAllVerticesCulled(VertexLoaderBase* loader, OpcodeDecoder::Primitive primitive, | ||
| const u8* src, u32 count) | ||
| { | ||
| ASSERT_MSG(VIDEO, primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES, | ||
| "CPUCull should not be called on lines or points"); | ||
| const u32 stride = loader->m_native_vtx_decl.stride; | ||
| const bool posHas3Elems = loader->m_native_vtx_decl.position.components >= 3; | ||
| const bool perVertexPosMtx = loader->m_native_vtx_decl.posmtx.enable; | ||
| if (m_transform_buffer_size < count) [[unlikely]] | ||
| { | ||
| u32 new_size = MathUtil::NextPowerOf2(count); | ||
| m_transform_buffer_size = new_size; | ||
| m_transform_buffer.reset(static_cast<TransformedVertex*>( | ||
| Common::AllocateAlignedMemory(new_size * sizeof(TransformedVertex), 32))); | ||
| } | ||
|
|
||
| // transform functions need the projection matrix to tranform to clip space | ||
| Core::System::GetInstance().GetVertexShaderManager().SetProjectionMatrix(); | ||
|
|
||
| static constexpr Common::EnumMap<CullMode, CullMode::All> cullmode_invert = { | ||
| CullMode::None, CullMode::Front, CullMode::Back, CullMode::All}; | ||
|
|
||
| CullMode cullmode = bpmem.genMode.cullmode; | ||
| if (xfmem.viewport.ht > 0) // See videosoftware Clipper.cpp:IsBackface | ||
| cullmode = cullmode_invert[cullmode]; | ||
| const TransformFunction transform = m_transform_table[posHas3Elems][perVertexPosMtx]; | ||
| transform(m_transform_buffer.get(), src, stride, count); | ||
| const CullFunction cull = m_cull_table[primitive][cullmode]; | ||
| return cull(m_transform_buffer.get(), count); | ||
| } | ||
|
|
||
| template <typename T> | ||
| void CPUCull::BufferDeleter<T>::operator()(T* ptr) | ||
| { | ||
| Common::FreeAlignedMemory(ptr); | ||
| } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
| // Copyright 2022 Dolphin Emulator Project | ||
| // SPDX-License-Identifier: GPL-2.0-or-later | ||
|
|
||
| #pragma once | ||
|
|
||
| #include "VideoCommon/BPMemory.h" | ||
| #include "VideoCommon/DataReader.h" | ||
| #include "VideoCommon/OpcodeDecoding.h" | ||
|
|
||
| class CPUCull | ||
| { | ||
| public: | ||
| ~CPUCull(); | ||
| void Init(); | ||
| bool AreAllVerticesCulled(VertexLoaderBase* loader, OpcodeDecoder::Primitive primitive, | ||
| const u8* src, u32 count); | ||
|
|
||
| struct alignas(16) TransformedVertex | ||
| { | ||
| float x, y, z, w; | ||
| }; | ||
|
|
||
| using TransformFunction = void (*)(void*, const void*, u32, int); | ||
| using CullFunction = bool (*)(const CPUCull::TransformedVertex*, int); | ||
|
|
||
| private: | ||
| template <typename T> | ||
| struct BufferDeleter | ||
| { | ||
| void operator()(T* ptr); | ||
| }; | ||
| std::unique_ptr<TransformedVertex[], BufferDeleter<TransformedVertex>> m_transform_buffer; | ||
| u32 m_transform_buffer_size = 0; | ||
| std::array<std::array<TransformFunction, 2>, 2> m_transform_table; | ||
| Common::EnumMap<Common::EnumMap<CullFunction, CullMode::All>, | ||
| OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_FAN> | ||
| m_cull_table; | ||
| }; |
Oops, something went wrong.