Skip to content

Commit

Permalink
Destroy OpenMP
Browse files Browse the repository at this point in the history
  • Loading branch information
magcius committed Nov 7, 2014
1 parent a1b44a9 commit 44b879d
Show file tree
Hide file tree
Showing 5 changed files with 0 additions and 68 deletions.
14 changes: 0 additions & 14 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ option(ENABLE_PCH "Use PCH to speed up compilation" ON)
option(ENABLE_LTO "Enables Link Time Optimization" OFF)
option(ENABLE_GENERIC "Enables generic build that should run on any little-endian host" OFF)

option(OPENMP "Enable OpenMP parallelization" ON)
option(ENCODE_FRAMEDUMPS "Encode framedumps in AVI format" ON)

option(FASTLOG "Enable all logs" OFF)
Expand Down Expand Up @@ -345,19 +344,6 @@ add_definitions(-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE)
include(CheckLib)
include(CheckCXXSourceRuns)

# Optional OpenMP support, controlled by the OPENMP option.
if(OPENMP)
  # Find modules are designed to be loaded through find_package(), which
  # prepares the package-search state they rely on. OpenMP is not REQUIRED:
  # a missing implementation simply falls through to the "disabled" branch.
  find_package(OpenMP)
  if(OPENMP_FOUND)
    message(STATUS "OpenMP parallelization enabled")
    # Hand the OpenMP flag(s) to the compiler and to the executable link step.
    add_definitions("${OpenMP_CXX_FLAGS}")
    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
  endif()
endif()
if(NOT OPENMP_FOUND)
  # Built without OpenMP (option off or not found): sources that contain
  # "#pragma omp" would otherwise trigger unknown-pragma warnings.
  add_definitions(-Wno-unknown-pragmas)
  message(STATUS "OpenMP parallelization disabled")
endif()

if(NOT ANDROID)

include(FindOpenGL)
Expand Down
2 changes: 0 additions & 2 deletions Source/Core/DolphinWX/VideoConfigDiag.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,6 @@ static wxString use_ffv1_desc = wxTRANSLATE("Encode frame dumps using the FFV1 c
#endif
static wxString free_look_desc = wxTRANSLATE("This feature allows you to change the game's camera.\nMove the mouse while holding the right mouse button to pan and while holding the middle button to move.\nHold SHIFT and press one of the WASD keys to move the camera by a certain step distance (SHIFT+0 to move faster and SHIFT+9 to move slower). Press SHIFT+R to reset the camera.\n\nIf unsure, leave this unchecked.");
static wxString crop_desc = wxTRANSLATE("Crop the picture from 4:3 to 5:4 or from 16:9 to 16:10.\n\nIf unsure, leave this unchecked.");
static wxString omp_desc = wxTRANSLATE("Use multiple threads to decode textures.\nMight result in a speedup (especially on CPUs with more than two cores).\n\nIf unsure, leave this unchecked.");
static wxString ppshader_desc = wxTRANSLATE("Apply a post-processing effect after finishing a frame.\n\nIf unsure, select (off).");
static wxString cache_efb_copies_desc = wxTRANSLATE("Slightly speeds up EFB to RAM copies by sacrificing emulation accuracy.\nSometimes also increases visual quality.\nIf you're experiencing any issues, try raising texture cache accuracy or disable this option.\n\nIf unsure, leave this unchecked.");
static wxString shader_errors_desc = wxTRANSLATE("Usually if shader compilation fails, an error message is displayed.\nHowever, one may skip the popups to allow interruption free gameplay by checking this option.\n\nIf unsure, leave this unchecked.");
Expand Down Expand Up @@ -515,7 +514,6 @@ VideoConfigDiag::VideoConfigDiag(wxWindow* parent, const std::string &title, con
{
wxGridSizer* const szr_other = new wxGridSizer(2, 5, 5);
szr_other->Add(CreateCheckBox(page_hacks, _("Disable Destination Alpha"), wxGetTranslation(disable_dstalpha_desc), vconfig.bDstAlphaPass));
szr_other->Add(CreateCheckBox(page_hacks, _("OpenMP Texture Decoder"), wxGetTranslation(omp_desc), vconfig.bOMPDecoder));
szr_other->Add(CreateCheckBox(page_hacks, _("Fast Depth Calculation"), wxGetTranslation(fast_depth_calc_desc), vconfig.bFastDepthCalc));

wxStaticBoxSizer* const group_other = new wxStaticBoxSizer(wxVERTICAL, page_hacks, _("Other"));
Expand Down
46 changes: 0 additions & 46 deletions Source/Core/VideoCommon/TextureDecoder_x64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,6 @@
#include "VideoCommon/TextureDecoder.h"
#include "VideoCommon/VideoConfig.h"

#ifdef _OPENMP
#include <omp.h>
#elif defined __GNUC__
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
#endif

#if _M_SSE >= 0x401
#include <smmintrin.h>
#include <emmintrin.h>
Expand Down Expand Up @@ -234,22 +228,6 @@ static void DecodeDXTBlock(u32 *dst, const DXTBlock *src, int pitch)
}
#endif

// Sets the number of OpenMP threads to use for decoding a texture of the
// given dimensions. Does nothing when the build has no OpenMP support.
static inline void SetOpenMPThreadCount(int width, int height)
{
#ifdef _OPENMP
	// Only go multithreaded when the OMP decoder option is enabled and the
	// texture is larger than 127 texels in both dimensions.
	const bool parallel_decode = g_ActiveConfig.bOMPDecoder && width > 127 && height > 127;

	// Don't spawn too many threads, or they will starve the rest of the
	// emulator: use a third of the processors (rounded up), otherwise one.
	omp_set_num_threads(parallel_decode ? (omp_get_num_procs() + 2) / 3 : 1);
#endif
}

// JSD 01/06/11:
// TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte boundaries to
// squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower than _mm_load_si128/_mm_store_si128
Expand All @@ -260,8 +238,6 @@ static inline void SetOpenMPThreadCount(int width, int height)

PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int height, int texformat, const u8* tlut, TlutFormat tlutfmt)
{
SetOpenMPThreadCount(width, height);

const int Wsteps4 = (width + 3) / 4;
const int Wsteps8 = (width + 7) / 8;

Expand All @@ -270,15 +246,13 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
case GX_TF_C4:
if (tlutfmt == GX_TL_RGB5A3)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++)
DecodeBytes_C4_RGB5A3(dst + (y + iy) * width + x, src + 4 * xStep, tlut);
}
else if (tlutfmt == GX_TL_IA8)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++)
Expand All @@ -287,7 +261,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
}
else if (tlutfmt == GX_TL_RGB565)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++,xStep++)
Expand All @@ -307,7 +280,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
const __m128i maskB3A2 = _mm_set_epi8(11,11,11,11,3,3,3,3,10,10,10,10,2,2,2,2);
const __m128i maskD5C4 = _mm_set_epi8(13,13,13,13,5,5,5,5,12,12,12,12,4,4,4,4);
const __m128i maskF7E6 = _mm_set_epi8(15,15,15,15,7,7,7,7,14,14,14,14,6,6,6,6);
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2,xStep++)
Expand Down Expand Up @@ -343,7 +315,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// JSD optimized with SSE2 intrinsics.
// Produces a ~76% speed improvement over reference C implementation.
{
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8 ; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2, xStep++)
Expand Down Expand Up @@ -415,7 +386,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// Produces a ~10% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8,yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; ++iy, xStep++)
Expand All @@ -441,7 +411,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// JSD optimized with SSE2 intrinsics.
// Produces an ~86% speed improvement over reference C implementation.
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8,yStep++)
{
Expand Down Expand Up @@ -527,15 +496,13 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
case GX_TF_C8:
if (tlutfmt == GX_TL_RGB5A3)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
DecodeBytes_C8_RGB5A3((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlut);
}
else if (tlutfmt == GX_TL_IA8)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
Expand All @@ -544,7 +511,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
}
else if (tlutfmt == GX_TL_RGB565)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
Expand All @@ -554,7 +520,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
break;
case GX_TF_IA4:
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
Expand All @@ -568,7 +533,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// Produces an ~50% speed improvement over SSE2 implementation.
if (cpu_info.bSSSE3)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
Expand All @@ -590,7 +554,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
const __m128i kMask_x0f = _mm_set_epi32(0x00000000L, 0x00000000L, 0x00ff00ffL, 0x00ff00ffL);
const __m128i kMask_xf000 = _mm_set_epi32(0xff000000L, 0xff000000L, 0xff000000L, 0xff000000L);
const __m128i kMask_x0fff = _mm_set_epi32(0x00ffffffL, 0x00ffffffL, 0x00ffffffL, 0x00ffffffL);
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
Expand Down Expand Up @@ -639,23 +602,20 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
case GX_TF_C14X2:
if (tlutfmt == GX_TL_RGB5A3)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
DecodeBytes_C14X2_RGB5A3(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlut);
}
else if (tlutfmt == GX_TL_IA8)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
DecodeBytes_C14X2_IA8(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlut);
}
else if (tlutfmt == GX_TL_RGB565)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
Expand All @@ -671,7 +631,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
const __m128i kMaskG1 = _mm_set1_epi32(0x00000300);
const __m128i kMaskB0 = _mm_set1_epi32(0x00F80000);
const __m128i kAlpha = _mm_set1_epi32(0xFF000000);
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
Expand Down Expand Up @@ -748,7 +707,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// Produces a ~10% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
Expand Down Expand Up @@ -841,7 +799,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// JSD optimized with SSE2 intrinsics (2 in 4 cases)
// Produces a ~25% speed improvement over reference C implementation.
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
Expand Down Expand Up @@ -955,7 +912,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// Produces a ~30% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3)
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{
Expand Down Expand Up @@ -987,7 +943,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// JSD optimized with SSE2 intrinsics
// Produces a ~68% speed improvement over reference C implementation.
{
#pragma omp parallel for
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{
Expand Down Expand Up @@ -1091,7 +1046,6 @@ PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int he
// Produces a ~50% improvement for x86 and a ~40% improvement for x64 in speed over reference C implementation.
// The x64 compiled reference C code is faster than the x86 compiled reference C code, but the SSE2 is
// faster than both.
#pragma omp parallel for
for (int y = 0; y < height; y += 8)
{
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8,yStep++)
Expand Down
3 changes: 0 additions & 3 deletions Source/Core/VideoCommon/VideoConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ void VideoConfig::Load(const std::string& ini_file)
settings->Get("TexFmtOverlayCenter", &bTexFmtOverlayCenter, 0);
settings->Get("WireFrame", &bWireFrame, 0);
settings->Get("DisableFog", &bDisableFog, 0);
settings->Get("OMPDecoder", &bOMPDecoder, false);
settings->Get("EnableShaderDebugging", &bEnableShaderDebugging, false);
settings->Get("BorderlessFullscreen", &bBorderlessFullscreen, false);

Expand Down Expand Up @@ -176,7 +175,6 @@ void VideoConfig::GameIniLoad()

CHECK_SETTING("Video_Settings", "DstAlphaPass", bDstAlphaPass);
CHECK_SETTING("Video_Settings", "DisableFog", bDisableFog);
CHECK_SETTING("Video_Settings", "OMPDecoder", bOMPDecoder);

CHECK_SETTING("Video_Enhancements", "ForceFiltering", bForceFiltering);
CHECK_SETTING("Video_Enhancements", "MaxAnisotropy", iMaxAnisotropy); // NOTE - this is x in (1 << x)
Expand Down Expand Up @@ -245,7 +243,6 @@ void VideoConfig::Save(const std::string& ini_file)
settings->Set("Wireframe", bWireFrame);
settings->Set("DstAlphaPass", bDstAlphaPass);
settings->Set("DisableFog", bDisableFog);
settings->Set("OMPDecoder", bOMPDecoder);
settings->Set("EnableShaderDebugging", bEnableShaderDebugging);
settings->Set("BorderlessFullscreen", bBorderlessFullscreen);

Expand Down
3 changes: 0 additions & 3 deletions Source/Core/VideoCommon/VideoConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,6 @@ struct VideoConfig final
bool bUseXFB;
bool bUseRealXFB;

// OpenMP
bool bOMPDecoder;

// Enhancements
int iMultisampleMode;
int iEFBScale;
Expand Down

0 comments on commit 44b879d

Please sign in to comment.