diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..e06cf47 --- /dev/null +++ b/.clang-format @@ -0,0 +1,4 @@ +# Run manually to reformat a file: +# clang-format -i --style=file +BasedOnStyle: Google +DerivePointerAlignment: false diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..19e7b5c --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,397 @@ +name: Build and Run Tests + +on: + workflow_call: + inputs: + run-tests: + description: 'Whether to also run unit tests where possible.' + default: true + required: false + type: boolean + update-caches: + description: 'Whether to update the `ccache` or `bazel` caches, where possible.' + default: false + required: false + type: boolean + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}-${{ inputs.update-caches }} + cancel-in-progress: true + +jobs: + cmake-linux-x86_64: + runs-on: ubuntu-22.04-8core + timeout-minutes: 60 + steps: + - uses: actions/checkout@v4 + - name: Update apt + run: sudo apt update + - name: Install ninja + run: sudo apt install ninja-build + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ github.job }} + max-size: "500M" + save: ${{ inputs.update-caches }} + - name: Create output directory + run: mkdir -p build + working-directory: ${{ github.workspace }} + - name: Configure and build + run: | + cmake .. -DCMAKE_BUILD_TYPE=Release -GNinja -DPTHREADPOOL_BUILD_TESTS=ON + cmake --build . 
-- "-j$((2*$(nproc)))" + working-directory: ${{ github.workspace }}/build + - name: Run tests + if: ${{ inputs.run-tests }} + run: ctest --output-on-failure --parallel $(nproc) + working-directory: ${{ github.workspace }}/build + + cmake-linux-aarch64: + runs-on: arm-ubuntu-arm-22.04-8core + timeout-minutes: 60 + steps: + - uses: actions/checkout@v4 + - name: Update apt + run: sudo apt update + - name: Install ninja + run: sudo apt install ninja-build + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ github.job }} + max-size: "500M" + save: ${{ inputs.update-caches }} + - name: Create output directory + run: mkdir -p build + working-directory: ${{ github.workspace }} + - name: Configure and build + run: | + set -e + cmake .. -DCMAKE_BUILD_TYPE=Release -GNinja -DPTHREADPOOL_BUILD_TESTS=ON + cmake --build . -- "-j$((2*$(nproc)))" + working-directory: ${{ github.workspace }}/build + - name: Run tests + if: ${{ inputs.run-tests }} + run: ctest --output-on-failure --parallel $(nproc) + working-directory: ${{ github.workspace }}/build + + cmake-windows-arm64: + runs-on: windows-2022-32core + timeout-minutes: 60 + steps: + - uses: actions/checkout@v4 + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ github.job }} + max-size: "500M" + save: ${{ inputs.update-caches }} + - name: Install Ninja + uses: seanmiddleditch/gha-setup-ninja@master + - name: Setup build environment + shell: bash + run: | + echo "VCVARSALL=$(vswhere -products \* -latest -property installationPath)\\VC\\Auxiliary\\Build\\vcvarsall.bat" >> $GITHUB_ENV + - name: Configure and build + run: scripts/build-windows-arm64.cmd + shell: cmd + working-directory: ${{ github.workspace }} + env: + CFLAGS: "/UNDEBUG" + CXXFLAGS: "/UNDEBUG" + + cmake-windows-x64: + runs-on: windows-2022-32core + timeout-minutes: 60 + steps: + - uses: actions/checkout@v4 + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ github.job }} + 
max-size: "500M" + save: ${{ inputs.update-caches }} + - name: Install Ninja + uses: seanmiddleditch/gha-setup-ninja@master + - name: Setup build environment + shell: bash + run: | + echo "VCVARSALL=$(vswhere -products \* -latest -property installationPath)\\VC\\Auxiliary\\Build\\vcvarsall.bat" >> $GITHUB_ENV + - name: Configure and build + run: scripts/build-windows-x64.cmd + shell: cmd + working-directory: ${{ github.workspace }} + env: + CFLAGS: "/UNDEBUG" + CXXFLAGS: "/UNDEBUG" + - name: Run tests + if: ${{ inputs.run-tests }} + run: ctest -C Release --output-on-failure --parallel $NUMBER_OF_PROCESSORS + working-directory: ${{ github.workspace }}/build + + cmake-windows-x86: + runs-on: windows-2022-32core + timeout-minutes: 60 + steps: + - uses: actions/checkout@v4 + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ github.job }} + max-size: "500M" + save: ${{ inputs.update-caches }} + - name: Install Ninja + uses: seanmiddleditch/gha-setup-ninja@master + - name: Setup build environment + shell: bash + run: | + echo "VCVARSALL=$(vswhere -products \* -latest -property installationPath)\\VC\\Auxiliary\\Build\\vcvarsall.bat" >> $GITHUB_ENV + - name: Configure and build + run: scripts/build-windows-x86.cmd + shell: cmd + working-directory: ${{ github.workspace }} + env: + CFLAGS: "/UNDEBUG" + CXXFLAGS: "/UNDEBUG" + - name: Run tests + if: ${{ inputs.run-tests }} + run: ctest -C Release --output-on-failure --parallel $NUMBER_OF_PROCESSORS + working-directory: ${{ github.workspace }}/build + + cmake-macos-arm64: + runs-on: macos-latest + timeout-minutes: 60 + steps: + - uses: actions/checkout@v4 + - name: Install ninja + run: brew install ninja + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ github.job }} + max-size: "500M" + save: ${{ inputs.update-caches }} + - name: Create output directory + run: mkdir -p build + working-directory: ${{ github.workspace }} + - name: Generate CMake project + run: | + 
cmake \ + -G Ninja \ + -DCMAKE_CONFIGURATION_TYPES=Release \ + -DCMAKE_OSX_ARCHITECTURES=arm64 \ + -DHAVE_STD_REGEX=TRUE \ + .. + working-directory: ${{ github.workspace }}/build + - name: Build with Xcode + run: | + cmake \ + --build . \ + -j$((2*$(sysctl -n hw.ncpu))) + working-directory: ${{ github.workspace }}/build + + cmake-android: + strategy: + matrix: + arch: [arm64, armv7, x86] + runs-on: ubuntu-22.04-8core + timeout-minutes: 60 + steps: + - uses: actions/checkout@v4 + - name: Update apt + run: sudo apt update + - name: Install ninja + run: sudo apt install ninja-build + - name: Setup Android NDK + id: setup-ndk + uses: nttld/setup-ndk@v1 + with: + ndk-version: r23b + add-to-path: false + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ github.job }}-${{ matrix.arch }} + max-size: "500M" + save: ${{ inputs.update-caches }} + - name: Force compiler binary mtime + # The nttld/setup-ndk action downloads the compiler binaries and copies them to the + # tools-cache, where their mtimes are set to the current time. This is bad since ccache + # uses the compiler binary mtime to determine whether two compilations match. We solve + # this problem by coercing the mtime of the compiler binaries to a fixed value. Note that + # if the compiler does indeed change, this will also cause the path to change as it would + # imply using a different NDK version. 
+ run: | + find ${{ steps.setup-ndk.outputs.ndk-path }} -wholename '*/bin/clang*' -executable -type f,l -exec touch -h -t 202408130000 {} + + - name: Configure and build + run: scripts/build-android-${{ matrix.arch }}.sh + working-directory: ${{ github.workspace }} + env: + ANDROID_NDK: ${{ steps.setup-ndk.outputs.ndk-path }} + + cmake-ios-arm64: + runs-on: macos-latest + timeout-minutes: 60 + steps: + - uses: actions/checkout@v4 + - name: Create output directory + run: mkdir -p build + working-directory: ${{ github.workspace }} + - name: Generate CMake project + run: | + cmake \ + -G Xcode \ + -DCMAKE_SYSTEM_NAME=iOS \ + -DCMAKE_OSX_ARCHITECTURES=arm64 \ + -DPTHREADPOOL_BUILD_BENCHMARKS=OFF \ + -DPTHREADPOOL_BUILD_TESTS=OFF \ + .. + working-directory: ${{ github.workspace }}/build + - name: Build with Xcode + run: cmake --build . --parallel $(sysctl -n hw.ncpu) -- -quiet + working-directory: ${{ github.workspace }}/build + + cmake-ios-x86_64: + runs-on: macos-latest + timeout-minutes: 60 + steps: + - uses: actions/checkout@v4 + - name: Create output directory + run: mkdir -p build + working-directory: ${{ github.workspace }} + - name: Generate CMake project + run: | + cmake \ + -G Xcode \ + -DCMAKE_SYSTEM_NAME=iOS \ + -DCMAKE_OSX_ARCHITECTURES=x86_64 \ + -DPTHREADPOOL_BUILD_BENCHMARKS=OFF \ + -DPTHREADPOOL_BUILD_TESTS=OFF \ + .. + working-directory: ${{ github.workspace }}/build + - name: Build with Xcode + run: cmake --build . 
--parallel $(sysctl -n hw.ncpu) -- -sdk iphonesimulator -quiet + working-directory: ${{ github.workspace }}/build + + bazel-linux-x86_64-clang-18: + runs-on: ubuntu-22.04-8core + timeout-minutes: 60 + steps: + - uses: actions/checkout@v4 + - name: Update apt + run: sudo apt update + - name: Install clang-18 + working-directory: ${{ github.workspace }} + run: | + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 18 + - name: Restore bazel cache + uses: actions/cache/restore@v4 + with: + path: "/home/runner/.cache/bazel" + key: ${{ github.job }} + restore-keys: | + ${{ github.job }}- + - name: Build and run tests + if: ${{ inputs.run-tests }} + env: + CC: clang-18 + CXX: clang++-18 + run: | + bazel test --test_output=errors :pthreadpool_test :pthreadpool_cxx_test + working-directory: ${{ github.workspace }} + - name: Compress disk cache + # Bazel's `--disk-cache` currently grows without bounds, so we remove files + # that haven't been accessed in 7+ days manually. + if: ${{ inputs.update-caches }} + run: find $HOME/.cache/bazel -type f -atime +7 -delete + - name: Save bazel cache + if: ${{ inputs.update-caches }} + uses: actions/cache/save@v4 + with: + path: "/home/runner/.cache/bazel" + key: ${{ github.job }}-${{ github.sha }} + + bazel-linux-aarch64-clang18: + runs-on: arm-ubuntu-arm-22.04-4core + timeout-minutes: 60 + steps: + - uses: actions/checkout@v4 + - name: Update apt + run: sudo apt update + - name: Install clang-18 + working-directory: ${{ github.workspace }} + run: | + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 18 + - name: Restore bazel cache + uses: actions/cache/restore@v4 + with: + path: "/home/runner/.cache/bazel" + key: ${{ github.job }} + restore-keys: | + ${{ github.job }}- + - name: Build and run tests + if: ${{ inputs.run-tests }} + env: + CC: clang-18 + CXX: clang++-18 + run: | + bazel test --verbose_failures --test_output=errors :pthreadpool_test :pthreadpool_cxx_test + working-directory: ${{ 
github.workspace }} + - name: Compress disk cache + # Bazel's `--disk-cache` currently grows without bounds, so we remove files + # that haven't been accessed in 7+ days manually. + if: ${{ inputs.update-caches }} + run: find $HOME/.cache/bazel -type f -atime +7 -delete + - name: Save bazel cache + if: ${{ inputs.update-caches }} + uses: actions/cache/save@v4 + with: + path: "/home/runner/.cache/bazel" + key: ${{ github.job }}-${{ github.sha }} + + bazel-linux-x86_64-gcc-9: + runs-on: ubuntu-22.04-8core + timeout-minutes: 60 + env: + CC: gcc-9 + CXX: g++-9 + steps: + - uses: actions/checkout@v4 + - name: Update apt + run: sudo apt update + - name: Install gcc-9 + working-directory: ${{ github.workspace }} + run: | + sudo apt install gcc-9 g++-9 + - name: Restore bazel cache + uses: actions/cache/restore@v4 + with: + path: "/home/runner/.cache/bazel" + key: ${{ github.job }} + restore-keys: | + ${{ github.job }}- + - name: Build tests + run: | + bazel build ${BAZEL_DEFINES} :pthreadpool_test :pthreadpool_cxx_test + working-directory: ${{ github.workspace }} + - name: Run tests + if: ${{ inputs.run-tests }} + run: | + bazel test ${BAZEL_DEFINES} --test_output=errors :pthreadpool_test :pthreadpool_cxx_test + working-directory: ${{ github.workspace }} + - name: Compress disk cache + # Bazel's `--disk-cache` currently grows without bounds, so we remove files + # that haven't been accessed in 7+ days manually. 
+ if: ${{ inputs.update-caches }} + run: find $HOME/.cache/bazel -type f -atime +7 -delete + - name: Save bazel cache + if: ${{ inputs.update-caches }} + uses: actions/cache/save@v4 + with: + path: "/home/runner/.cache/bazel" + key: ${{ github.job }}-${{ github.sha }} diff --git a/.github/workflows/on-pr-approved.yml b/.github/workflows/on-pr-approved.yml new file mode 100644 index 0000000..d765cda --- /dev/null +++ b/.github/workflows/on-pr-approved.yml @@ -0,0 +1,9 @@ +name: Test Pull Request +run-name: ${{ github.event.pull_request.title }} +on: + pull_request_review: + types: [submitted] +jobs: + build-and-test: + if: github.event.review.state == 'APPROVED' + uses: ./.github/workflows/build.yml diff --git a/.github/workflows/on-pr-merge-to-main.yml b/.github/workflows/on-pr-merge-to-main.yml new file mode 100644 index 0000000..23e8abc --- /dev/null +++ b/.github/workflows/on-pr-merge-to-main.yml @@ -0,0 +1,13 @@ +name: Build and update caches +on: + push: + branches: + - main + workflow_dispatch: + +jobs: + build-and-test: + uses: ./.github/workflows/build.yml + with: + run-tests: false + update-caches: true diff --git a/.github/workflows/on-push.yml b/.github/workflows/on-push.yml new file mode 100644 index 0000000..64d6f69 --- /dev/null +++ b/.github/workflows/on-push.yml @@ -0,0 +1,11 @@ +name: Test Latest Push + +on: + push: + branches-ignore: + - main + workflow_dispatch: + +jobs: + build-and-test: + uses: ./.github/workflows/build.yml diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 0d8a9fd..0000000 --- a/.gitignore +++ /dev/null @@ -1,27 +0,0 @@ -# Ninja files -build.ninja - -# Build objects and artifacts -bazel-bin -bazel-genfiles -bazel-out -bazel-testlogs -bazel-pthreadpool -bin/ -build/ -build-*/ -deps/ -lib/ -libs/ -obj/ -*.pyc -*.pyo - -# System files -.DS_Store -.DS_Store? 
-._* -.Spotlight-V100 -.Trashes -ehthumbs.db -Thumbs.db diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index faff03c..0000000 --- a/.travis.yml +++ /dev/null @@ -1,12 +0,0 @@ -dist: xenial -language: c -script: - - mkdir build - - cd build - - cmake .. -G Ninja - - ninja - - ctest --verbose -addons: - apt: - packages: - - ninja-build diff --git a/BUILD.bazel b/BUILD.bazel index adea02a..396537f 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -64,8 +64,11 @@ cc_library( ":tvos_x86_64": ARCH_SPECIFIC_SRCS, "//conditions:default": [], }), + hdrs = [ + "include/pthreadpool.h", + ], copts = [ - "-std=gnu11", + "-std=c11", ] + select({ ":optimized_build": ["-O2"], "//conditions:default": [], @@ -120,9 +123,6 @@ cc_library( ":tvos_x86_64": ["-DPTHREADPOOL_USE_FASTPATH=1"], "//conditions:default": ["-DPTHREADPOOL_USE_FASTPATH=0"], }), - hdrs = [ - "include/pthreadpool.h", - ], defines = [ "PTHREADPOOL_NO_DEPRECATED_API", ], @@ -137,6 +137,7 @@ cc_library( "//conditions:default": [], }), strip_include_prefix = "include", + visibility = ["//visibility:public"], deps = [ "@FXdiv", ] + select({ @@ -149,7 +150,6 @@ cc_library( ":android_arm64": ["@cpuinfo"], "//conditions:default": [], }), - visibility = ["//visibility:public"], ) ################################## Unit tests ################################## @@ -349,7 +349,6 @@ config_setting( config_setting( name = "ios", values = { - "crosstool_top": "@bazel_tools//tools/cpp:toolchain", "apple_platform_type": "ios", }, ) @@ -373,7 +372,6 @@ config_setting( config_setting( name = "watchos", values = { - "crosstool_top": "@bazel_tools//tools/cpp:toolchain", "apple_platform_type": "watchos", }, ) @@ -397,7 +395,6 @@ config_setting( config_setting( name = "tvos", values = { - "crosstool_top": "@bazel_tools//tools/cpp:toolchain", "apple_platform_type": "tvos", }, ) @@ -428,7 +425,7 @@ config_setting( name = "emscripten", values = { "crosstool_top": "//toolchain:emscripten", - } + }, ) config_setting( @@ -436,5 
+433,5 @@ config_setting( values = { "crosstool_top": "//toolchain:emscripten", "copt": "-pthread", - } + }, ) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7fa4285..efff8cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,45 @@ +# Copyright (c) 2017 Facebook Inc. +# Copyright (c) 2015-2017 Georgia Institute of Technology +# All rights reserved. +# +# Copyright 2019 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + CMAKE_MINIMUM_REQUIRED(VERSION 3.5 FATAL_ERROR) # ---[ Project PROJECT(pthreadpool C) +# --[ Use ccache if available +FIND_PROGRAM(CCACHE_BINARY "ccache") +IF(CCACHE_BINARY) + MESSAGE(STATUS "Using ccache: ${CCACHE_BINARY}") + SET(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_BINARY}" CACHE STRING "CXX compiler launcher" FORCE) + SET(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_BINARY}" CACHE STRING "C compiler launcher" FORCE) + IF(CMAKE_C_COMPILER_ID STREQUAL "MSVC") + STRING(REPLACE "/Zi" "/Z7" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") + STRING(REPLACE "/Zi" "/Z7" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") + STRING(REPLACE "/Zi" "/Z7" CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO}") + STRING(REPLACE "/Zi" "/Z7" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") + STRING(REPLACE "/Zi" "/Z7" CMAKE_C_FLAGS_DEBUG_INIT "${CMAKE_C_FLAGS_DEBUG_INIT}") + STRING(REPLACE "/Zi" "/Z7" CMAKE_CXX_FLAGS_DEBUG_INIT "${CMAKE_CXX_FLAGS_DEBUG_INIT}") + STRING(REPLACE "/Zi" "/Z7" CMAKE_C_FLAGS_RELWITHDEBINFO_INIT "${CMAKE_C_FLAGS_RELWITHDEBINFO_INIT}") + STRING(REPLACE "/Zi" "/Z7" CMAKE_CXX_FLAGS_RELWITHDEBINFO_INIT "${CMAKE_CXX_FLAGS_RELWITHDEBINFO_INIT}") + ENDIF() +ENDIF() + +# ---[ Language options.
+SET(CMAKE_C_STANDARD 11) +SET(CMAKE_C_EXTENSIONS NO) +SET(CMAKE_CXX_STANDARD 11) +SET(CMAKE_CXX_EXTENSIONS NO) +IF(CMAKE_C_COMPILER_ID STREQUAL "MSVC") + STRING(APPEND CMAKE_C_FLAGS " /experimental:c11atomics") + STRING(APPEND CMAKE_CXX_FLAGS " /experimental:c11atomics") +ENDIF() + # ---[ Options. SET(PTHREADPOOL_LIBRARY_TYPE "default" CACHE STRING "Type of library (shared, static, or default) to build") SET_PROPERTY(CACHE PTHREADPOOL_LIBRARY_TYPE PROPERTY STRINGS default static shared) @@ -33,12 +70,6 @@ IF(PTHREADPOOL_BUILD_TESTS) ENABLE_TESTING() ENDIF() -MACRO(PTHREADPOOL_TARGET_ENABLE_CXX11 target) - SET_TARGET_PROPERTIES(${target} PROPERTIES - CXX_STANDARD 11 - CXX_EXTENSIONS NO) -ENDMACRO() - # ---[ Download deps IF(NOT DEFINED FXDIV_SOURCE_DIR) MESSAGE(STATUS "Downloading FXdiv to ${CMAKE_BINARY_DIR}/FXdiv-source (define FXDIV_SOURCE_DIR to avoid it)") @@ -132,9 +163,6 @@ ELSE() TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_FASTPATH=0) ENDIF() -SET_TARGET_PROPERTIES(pthreadpool PROPERTIES - C_STANDARD 11 - C_EXTENSIONS NO) TARGET_LINK_LIBRARIES(pthreadpool PUBLIC pthreadpool_interface) TARGET_INCLUDE_DIRECTORIES(pthreadpool PRIVATE src) IF(NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") @@ -177,16 +205,10 @@ IF(PTHREADPOOL_BUILD_TESTS) ENDIF() ADD_EXECUTABLE(pthreadpool-test test/pthreadpool.cc) - SET_TARGET_PROPERTIES(pthreadpool-test PROPERTIES - CXX_STANDARD 11 - CXX_EXTENSIONS NO) TARGET_LINK_LIBRARIES(pthreadpool-test pthreadpool gtest gtest_main) ADD_TEST(pthreadpool pthreadpool-test) ADD_EXECUTABLE(pthreadpool-cxx-test test/pthreadpool-cxx.cc) - SET_TARGET_PROPERTIES(pthreadpool-cxx-test PROPERTIES - CXX_STANDARD 11 - CXX_EXTENSIONS NO) TARGET_LINK_LIBRARIES(pthreadpool-cxx-test pthreadpool gtest gtest_main) ADD_TEST(pthreadpool-cxx pthreadpool-cxx-test) ENDIF() @@ -201,14 +223,8 @@ IF(PTHREADPOOL_BUILD_BENCHMARKS) ENDIF() ADD_EXECUTABLE(latency-bench bench/latency.cc) - SET_TARGET_PROPERTIES(latency-bench PROPERTIES - CXX_STANDARD 11 
- CXX_EXTENSIONS NO) TARGET_LINK_LIBRARIES(latency-bench pthreadpool benchmark) ADD_EXECUTABLE(throughput-bench bench/throughput.cc) - SET_TARGET_PROPERTIES(throughput-bench PROPERTIES - CXX_STANDARD 11 - CXX_EXTENSIONS NO) TARGET_LINK_LIBRARIES(throughput-bench pthreadpool benchmark) ENDIF() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..939e534 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,28 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to <https://cla.developers.google.com/> to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use GitHub pull requests for this purpose. Consult +[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more +information on using pull requests. + +## Community Guidelines + +This project follows [Google's Open Source Community +Guidelines](https://opensource.google.com/conduct/). 
diff --git a/MODULE.bazel b/MODULE.bazel new file mode 100644 index 0000000..2123731 --- /dev/null +++ b/MODULE.bazel @@ -0,0 +1,72 @@ +## MODULE.bazel +module( + name = "pthreadpool", +) + +# Bazel rule definitions +bazel_dep(name = "rules_cc", version = "0.1.1") +bazel_dep(name = "rules_python", version = "1.0.0") + +pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip") +pip.parse( + hub_name = "pip", + python_version = "3.11", + requirements_lock = "//:requirements_lock.txt", +) +use_repo(pip, "pip") + +# Bazel Skylib. +bazel_dep(name = "bazel_skylib", version = "1.7.1") + +# Bazel Platforms +bazel_dep(name = "platforms", version = "0.0.10") + +# TODO: some (most? all?) of the http_archive() calls below could become bazel_dep() calls, +# but it would require verifying that the semver provided by the Bazel registry matches the hash +# that we expect in CMake; it's not clear that it is a big win to do so given the modest +# complexity of our deps, so I'm leaving it like this for now to ensure that the Bazel and CMake +# builds are using identical dependencies. + +http_archive = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +# LINT.IfChange +# Google Test framework, used by most unit-tests. +http_archive( + name = "com_google_googletest", + sha256 = "648b9430fca63acc68c59ee98f624dcbcd9c24ea6b278c306ab6b7f49f62034a", + strip_prefix = "googletest-d144031940543e15423a25ae5a8a74141044862f", + urls = ["https://github.com/google/googletest/archive/d144031940543e15423a25ae5a8a74141044862f.zip"], +) +# LINT.ThenChange(cmake/DownloadGoogleTest.cmake) + +# LINT.IfChange +# Google Benchmark library, used in micro-benchmarks. 
+http_archive( + name = "com_google_benchmark", + sha256 = "1ba14374fddcd9623f126b1a60945e4deac4cdc4fb25a5f25e7f779e36f2db52", + strip_prefix = "benchmark-d2a8a4ee41b923876c034afb939c4fc03598e622", + urls = ["https://github.com/google/benchmark/archive/d2a8a4ee41b923876c034afb939c4fc03598e622.zip"], +) +# LINT.ThenChange(cmake/DownloadGoogleBenchmark.cmake) + +# LINT.IfChange +# FXdiv library, used for repeated integer division by the same factor +http_archive( + name = "FXdiv", + sha256 = "ab7dfb08829bee33dca38405d647868fb214ac685e379ec7ef2bebcd234cd44d", + strip_prefix = "FXdiv-b408327ac2a15ec3e43352421954f5b1967701d1", + urls = ["https://github.com/Maratyszcza/FXdiv/archive/b408327ac2a15ec3e43352421954f5b1967701d1.zip"], +) +# LINT.ThenChange(cmake/DownloadFXdiv.cmake) + +# LINT.IfChange +# cpuinfo library, used for detecting processor characteristics +http_archive( + name = "cpuinfo", + sha256 = "52e0ffd7998d8cb3a927d8a6e1145763744d866d2be09c4eccea27fc157b6bb0", + strip_prefix = "cpuinfo-cebb0933058d7f181c979afd50601dc311e1bf8c", + urls = [ + "https://github.com/pytorch/cpuinfo/archive/cebb0933058d7f181c979afd50601dc311e1bf8c.zip", + ], +) +# LINT.ThenChange(cmake/DownloadCpuinfo.cmake) diff --git a/README.md b/README.md index 57ed3d4..6888d74 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,12 @@ # pthreadpool -[![BSD (2 clause) License](https://img.shields.io/badge/License-BSD%202--Clause%20%22Simplified%22%20License-blue.svg)](https://github.com/Maratyszcza/pthreadpool/blob/master/LICENSE) -[![Build Status](https://img.shields.io/travis/Maratyszcza/pthreadpool.svg)](https://travis-ci.org/Maratyszcza/pthreadpool) +[![BSD (2 clause) License](https://img.shields.io/badge/License-BSD%202--Clause%20%22Simplified%22%20License-blue.svg)](https://github.com/google/pthreadpool/blob/main/LICENSE) **pthreadpool** is a portable and efficient thread pool implementation. It provides similar functionality to `#pragma omp parallel for`, but with additional features. 
+This is a Google-maintained fork of the original http://github.com/Maratyszcza/pthreadpool repository. + ## Features: * C interface (C++-compatible). @@ -19,7 +20,7 @@ It provides similar functionality to `#pragma omp parallel for`, but with additi ## Example - The following example demonstates using the thread pool for parallel addition of two arrays: + The following example demonstrates using the thread pool for parallel addition of two arrays: ```c static void add_arrays(struct array_addition_context* context, size_t i) { diff --git a/bench/latency.cc b/bench/latency.cc index 4fb59ee..eb0570f 100644 --- a/bench/latency.cc +++ b/bench/latency.cc @@ -1,92 +1,141 @@ -#include +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include // NOLINT +#include #include -#include - static void SetNumberOfThreads(benchmark::internal::Benchmark* benchmark) { - const int max_threads = std::thread::hardware_concurrency(); - for (int t = 1; t <= max_threads; t++) { - benchmark->Arg(t); - } + const int max_threads = std::thread::hardware_concurrency(); + for (int t = 1; t <= max_threads; t++) { + benchmark->Arg(t); + } } - -static void compute_1d(void*, size_t x) { -} +static void compute_1d(void*, size_t x) {} static void pthreadpool_parallelize_1d(benchmark::State& state) { - const uint32_t threads = static_cast(state.range(0)); - pthreadpool_t threadpool = pthreadpool_create(threads); - while (state.KeepRunning()) { - pthreadpool_parallelize_1d( - threadpool, - compute_1d, - nullptr /* context */, - threads, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); + const uint32_t threads = static_cast(state.range(0)); + pthreadpool_t threadpool = pthreadpool_create(threads); + while (state.KeepRunning()) { + 
pthreadpool_parallelize_1d(threadpool, compute_1d, nullptr /* context */, + threads, /*flags=*/0); + } + pthreadpool_destroy(threadpool); } + BENCHMARK(pthreadpool_parallelize_1d)->UseRealTime()->Apply(SetNumberOfThreads); +static void compute_1d_tile_1d(void*, size_t, size_t) {} -static void compute_1d_tile_1d(void*, size_t, size_t) { +static void pthreadpool_parallelize_1d_tile_1d(benchmark::State& state) { + const uint32_t threads = static_cast(state.range(0)); + pthreadpool_t threadpool = pthreadpool_create(threads); + while (state.KeepRunning()) { + pthreadpool_parallelize_1d_tile_1d(threadpool, compute_1d_tile_1d, + nullptr /* context */, threads, 1, + /*flags=*/0); + } + pthreadpool_destroy(threadpool); } -static void pthreadpool_parallelize_1d_tile_1d(benchmark::State& state) { - const uint32_t threads = static_cast(state.range(0)); - pthreadpool_t threadpool = pthreadpool_create(threads); - while (state.KeepRunning()) { - pthreadpool_parallelize_1d_tile_1d( - threadpool, - compute_1d_tile_1d, - nullptr /* context */, - threads, 1, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); +BENCHMARK(pthreadpool_parallelize_1d_tile_1d) + ->UseRealTime() + ->Apply(SetNumberOfThreads); + +static void compute_1d_tile_1d_dynamic(void*, size_t, size_t) {} + +static void pthreadpool_parallelize_1d_tile_1d_dynamic( + benchmark::State& state) { + const uint32_t threads = static_cast(state.range(0)); + pthreadpool_t threadpool = pthreadpool_create(threads); + while (state.KeepRunning()) { + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool, compute_1d_tile_1d_dynamic, nullptr /* context */, threads, + 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); } -BENCHMARK(pthreadpool_parallelize_1d_tile_1d)->UseRealTime()->Apply(SetNumberOfThreads); +BENCHMARK(pthreadpool_parallelize_1d_tile_1d_dynamic) + ->UseRealTime() + ->Apply(SetNumberOfThreads); -static void compute_2d(void*, size_t, size_t) { -} +static void compute_2d(void*, size_t, size_t) {} static void 
pthreadpool_parallelize_2d(benchmark::State& state) { - const uint32_t threads = static_cast(state.range(0)); - pthreadpool_t threadpool = pthreadpool_create(threads); - while (state.KeepRunning()) { - pthreadpool_parallelize_2d( - threadpool, - compute_2d, - nullptr /* context */, - 1, threads, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); + const uint32_t threads = static_cast(state.range(0)); + pthreadpool_t threadpool = pthreadpool_create(threads); + while (state.KeepRunning()) { + pthreadpool_parallelize_2d(threadpool, compute_2d, nullptr /* context */, 1, + threads, /*flags=*/0); + } + pthreadpool_destroy(threadpool); } + BENCHMARK(pthreadpool_parallelize_2d)->UseRealTime()->Apply(SetNumberOfThreads); +static void compute_2d_tile_2d(void*, size_t, size_t, size_t, size_t) {} -static void compute_2d_tile_2d(void*, size_t, size_t, size_t, size_t) { +static void pthreadpool_parallelize_2d_tile_2d(benchmark::State& state) { + const uint32_t threads = static_cast(state.range(0)); + pthreadpool_t threadpool = pthreadpool_create(threads); + while (state.KeepRunning()) { + pthreadpool_parallelize_2d_tile_2d(threadpool, compute_2d_tile_2d, + nullptr /* context */, 1, threads, 1, 1, + /*flags=*/0); + } + pthreadpool_destroy(threadpool); } -static void pthreadpool_parallelize_2d_tile_2d(benchmark::State& state) { - const uint32_t threads = static_cast(state.range(0)); - pthreadpool_t threadpool = pthreadpool_create(threads); - while (state.KeepRunning()) { - pthreadpool_parallelize_2d_tile_2d( - threadpool, - compute_2d_tile_2d, - nullptr /* context */, - 1, threads, - 1, 1, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); +BENCHMARK(pthreadpool_parallelize_2d_tile_2d) + ->UseRealTime() + ->Apply(SetNumberOfThreads); + +static void compute_2d_tile_2d_dynamic(void*, size_t, size_t, size_t, size_t) {} + +static void pthreadpool_parallelize_2d_tile_2d_dynamic( + benchmark::State& state) { + const uint32_t threads = static_cast(state.range(0)); + 
pthreadpool_t threadpool = pthreadpool_create(threads); + while (state.KeepRunning()) { + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool, compute_2d_tile_2d_dynamic, nullptr /* context */, 1, + threads, 1, 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); +} + +BENCHMARK(pthreadpool_parallelize_2d_tile_2d_dynamic) + ->UseRealTime() + ->Apply(SetNumberOfThreads); + +static void compute_3d_tile_2d_dynamic(void*, size_t, size_t, size_t, size_t, + size_t) {} + +static void pthreadpool_parallelize_3d_tile_2d_dynamic( + benchmark::State& state) { + const uint32_t threads = static_cast(state.range(0)); + pthreadpool_t threadpool = pthreadpool_create(threads); + while (state.KeepRunning()) { + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool, compute_3d_tile_2d_dynamic, nullptr /* context */, 1, 1, + threads, 1, 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); } -BENCHMARK(pthreadpool_parallelize_2d_tile_2d)->UseRealTime()->Apply(SetNumberOfThreads); +BENCHMARK(pthreadpool_parallelize_3d_tile_2d_dynamic) + ->UseRealTime() + ->Apply(SetNumberOfThreads); BENCHMARK_MAIN(); diff --git a/bench/throughput.cc b/bench/throughput.cc index 47c8da7..fc60c94 100644 --- a/bench/throughput.cc +++ b/bench/throughput.cc @@ -1,407 +1,461 @@ -#include +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include +#include #include - -static void compute_1d(void*, size_t) { -} +static void compute_1d(void*, size_t) {} static void pthreadpool_parallelize_1d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_1d( - threadpool, - compute_1d, - nullptr /* context */, - items * threads, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); -} -BENCHMARK(pthreadpool_parallelize_1d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_1d(threadpool, compute_1d, nullptr /* context */, + items * threads, /*flags=*/0); + } + pthreadpool_destroy(threadpool); - -static void compute_1d_tile_1d(void*, size_t, size_t) { -} - -static void pthreadpool_parallelize_1d_tile_1d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_1d_tile_1d( - threadpool, - compute_1d_tile_1d, - nullptr /* context */, - items * threads, 1, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); } -BENCHMARK(pthreadpool_parallelize_1d_tile_1d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); +BENCHMARK(pthreadpool_parallelize_1d) + ->UseRealTime() + 
->RangeMultiplier(10) + ->Range(10, 1000000); +static void compute_1d_tile_1d(void*, size_t, size_t) {} -static void compute_2d(void*, size_t, size_t) { -} +static void pthreadpool_parallelize_1d_tile_1d(benchmark::State& state) { + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_1d_tile_1d(threadpool, compute_1d_tile_1d, + nullptr /* context */, items * threads, + 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_1d_tile_1d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_1d_tile_1d_dynamic(void*, size_t, size_t) {} + +static void pthreadpool_parallelize_1d_tile_1d_dynamic( + benchmark::State& state) { + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool, compute_1d_tile_1d_dynamic, nullptr /* context */, + items * threads, 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_1d_tile_1d_dynamic) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_2d(void*, size_t, size_t) {} static void pthreadpool_parallelize_2d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_2d( - threadpool, - compute_2d, - nullptr /* 
context */, - threads, items, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); -} -BENCHMARK(pthreadpool_parallelize_2d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); - + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_2d(threadpool, compute_2d, nullptr /* context */, + threads, items, /*flags=*/0); + } + pthreadpool_destroy(threadpool); -static void compute_2d_tile_1d(void*, size_t, size_t, size_t) { + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); } +BENCHMARK(pthreadpool_parallelize_2d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); -static void pthreadpool_parallelize_2d_tile_1d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_2d_tile_1d( - threadpool, - compute_2d_tile_1d, - nullptr /* context */, - threads, items, - 1, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); -} -BENCHMARK(pthreadpool_parallelize_2d_tile_1d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); +static void compute_2d_tile_1d(void*, size_t, size_t, size_t) {} - -static void compute_2d_tile_2d(void*, size_t, size_t, size_t, size_t) { -} +static void pthreadpool_parallelize_2d_tile_1d(benchmark::State& state) { + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while 
(state.KeepRunning()) { + pthreadpool_parallelize_2d_tile_1d(threadpool, compute_2d_tile_1d, + nullptr /* context */, threads, items, 1, + /*flags=*/0); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_2d_tile_1d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_2d_tile_1d_dynamic(void*, size_t, size_t, size_t) {} + +static void pthreadpool_parallelize_2d_tile_1d_dynamic( + benchmark::State& state) { + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_2d_tile_1d_dynamic( + threadpool, compute_2d_tile_1d_dynamic, nullptr /* context */, threads, + items, 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_2d_tile_1d_dynamic) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_2d_tile_2d(void*, size_t, size_t, size_t, size_t) {} static void pthreadpool_parallelize_2d_tile_2d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_2d_tile_2d( - threadpool, - compute_2d_tile_2d, - nullptr /* context */, - threads, items, - 1, 1, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); -} -BENCHMARK(pthreadpool_parallelize_2d_tile_2d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); - - -static void compute_3d(void*, size_t, size_t, size_t) { 
-} + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_2d_tile_2d(threadpool, compute_2d_tile_2d, + nullptr /* context */, threads, items, 1, + 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_2d_tile_2d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_2d_tile_2d_dynamic(void*, size_t, size_t, size_t, size_t) {} + +static void pthreadpool_parallelize_2d_tile_2d_dynamic( + benchmark::State& state) { + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool, compute_2d_tile_2d_dynamic, nullptr /* context */, threads, + items, 1, 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_2d_tile_2d_dynamic) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_3d(void*, size_t, size_t, size_t) {} static void pthreadpool_parallelize_3d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_3d( - threadpool, - compute_3d, - nullptr /* context */, - 1, threads, items, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); -} 
-BENCHMARK(pthreadpool_parallelize_3d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); - - -static void compute_3d_tile_1d(void*, size_t, size_t, size_t, size_t) { -} + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_3d(threadpool, compute_3d, nullptr /* context */, 1, + threads, items, /*flags=*/0); + } + pthreadpool_destroy(threadpool); -static void pthreadpool_parallelize_3d_tile_1d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_3d_tile_1d( - threadpool, - compute_3d_tile_1d, - nullptr /* context */, - 1, threads, items, - 1, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); } -BENCHMARK(pthreadpool_parallelize_3d_tile_1d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); +BENCHMARK(pthreadpool_parallelize_3d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); +static void compute_3d_tile_1d(void*, size_t, size_t, size_t, size_t) {} -static void compute_3d_tile_2d(void*, size_t, size_t, size_t, size_t, size_t) { -} +static void pthreadpool_parallelize_3d_tile_1d(benchmark::State& state) { + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_3d_tile_1d(threadpool, compute_3d_tile_1d, + nullptr /* context */, 1, threads, items, + 1, /*flags=*/0); + } + 
pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_3d_tile_1d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_3d_tile_2d(void*, size_t, size_t, size_t, size_t, size_t) {} static void pthreadpool_parallelize_3d_tile_2d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_3d_tile_2d( - threadpool, - compute_3d_tile_2d, - nullptr /* context */, - 1, threads, items, - 1, 1, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); -} -BENCHMARK(pthreadpool_parallelize_3d_tile_2d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); - - -static void compute_4d(void*, size_t, size_t, size_t, size_t) { -} + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_3d_tile_2d(threadpool, compute_3d_tile_2d, + nullptr /* context */, 1, threads, items, + 1, 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_3d_tile_2d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_3d_tile_2d_dynamic(void*, size_t, size_t, size_t, size_t, + size_t) {} + +static void pthreadpool_parallelize_3d_tile_2d_dynamic( + benchmark::State& state) { + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t 
items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool, compute_3d_tile_2d_dynamic, nullptr /* context */, 1, + threads, items, 1, 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_3d_tile_2d_dynamic) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_4d(void*, size_t, size_t, size_t, size_t) {} static void pthreadpool_parallelize_4d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_4d( - threadpool, - compute_4d, - nullptr /* context */, - 1, 1, threads, items, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); -} -BENCHMARK(pthreadpool_parallelize_4d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); - + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_4d(threadpool, compute_4d, nullptr /* context */, 1, + 1, threads, items, /*flags=*/0); + } + pthreadpool_destroy(threadpool); -static void compute_4d_tile_1d(void*, size_t, size_t, size_t, size_t, size_t) { -} - -static void pthreadpool_parallelize_4d_tile_1d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_4d_tile_1d( - threadpool, - compute_4d_tile_1d, - nullptr 
/* context */, - 1, 1, threads, items, - 1, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); } -BENCHMARK(pthreadpool_parallelize_4d_tile_1d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); +BENCHMARK(pthreadpool_parallelize_4d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); +static void compute_4d_tile_1d(void*, size_t, size_t, size_t, size_t, size_t) {} -static void compute_4d_tile_2d(void*, size_t, size_t, size_t, size_t, size_t, size_t) { -} +static void pthreadpool_parallelize_4d_tile_1d(benchmark::State& state) { + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_4d_tile_1d(threadpool, compute_4d_tile_1d, + nullptr /* context */, 1, 1, threads, + items, 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_4d_tile_1d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_4d_tile_2d(void*, size_t, size_t, size_t, size_t, size_t, + size_t) {} static void pthreadpool_parallelize_4d_tile_2d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_4d_tile_2d( - threadpool, - compute_4d_tile_2d, - nullptr /* context */, - 1, 1, threads, items, - 1, 1, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - 
state.SetItemsProcessed(int64_t(state.iterations()) * items); -} -BENCHMARK(pthreadpool_parallelize_4d_tile_2d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); - - -static void compute_5d(void*, size_t, size_t, size_t, size_t, size_t) { -} + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_4d_tile_2d(threadpool, compute_4d_tile_2d, + nullptr /* context */, 1, 1, threads, + items, 1, 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_4d_tile_2d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_5d(void*, size_t, size_t, size_t, size_t, size_t) {} static void pthreadpool_parallelize_5d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_5d( - threadpool, - compute_5d, - nullptr /* context */, - 1, 1, 1, threads, items, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); -} -BENCHMARK(pthreadpool_parallelize_5d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); - - -static void compute_5d_tile_1d(void*, size_t, size_t, size_t, size_t, size_t, size_t) { -} + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_5d(threadpool, compute_5d, nullptr /* context */, 1, + 1, 1, threads, items, /*flags=*/0); + } + 
pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_5d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_5d_tile_1d(void*, size_t, size_t, size_t, size_t, size_t, + size_t) {} static void pthreadpool_parallelize_5d_tile_1d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_5d_tile_1d( - threadpool, - compute_5d_tile_1d, - nullptr /* context */, - 1, 1, 1, threads, items, - 1, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); -} -BENCHMARK(pthreadpool_parallelize_5d_tile_1d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); - - -static void compute_5d_tile_2d(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t) { -} + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_5d_tile_1d(threadpool, compute_5d_tile_1d, + nullptr /* context */, 1, 1, 1, threads, + items, 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_5d_tile_1d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_5d_tile_2d(void*, size_t, size_t, size_t, size_t, size_t, + size_t, size_t) {} static void pthreadpool_parallelize_5d_tile_2d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = 
pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_5d_tile_2d( - threadpool, - compute_5d_tile_2d, - nullptr /* context */, - 1, 1, 1, threads, items, - 1, 1, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); -} -BENCHMARK(pthreadpool_parallelize_5d_tile_2d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); - - -static void compute_6d(void*, size_t, size_t, size_t, size_t, size_t, size_t) { -} + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_5d_tile_2d(threadpool, compute_5d_tile_2d, + nullptr /* context */, 1, 1, 1, threads, + items, 1, 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_5d_tile_2d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_6d(void*, size_t, size_t, size_t, size_t, size_t, size_t) {} static void pthreadpool_parallelize_6d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_6d( - threadpool, - compute_6d, - nullptr /* context */, - 1, 1, 1, 1, threads, items, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); -} -BENCHMARK(pthreadpool_parallelize_6d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); - - -static void compute_6d_tile_1d(void*, size_t, 
size_t, size_t, size_t, size_t, size_t, size_t) { -} + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_6d(threadpool, compute_6d, nullptr /* context */, 1, + 1, 1, 1, threads, items, /*flags=*/0); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_6d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_6d_tile_1d(void*, size_t, size_t, size_t, size_t, size_t, + size_t, size_t) {} static void pthreadpool_parallelize_6d_tile_1d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_6d_tile_1d( - threadpool, - compute_6d_tile_1d, - nullptr /* context */, - 1, 1, 1, 1, threads, items, - 1, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); -} -BENCHMARK(pthreadpool_parallelize_6d_tile_1d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); - - -static void compute_6d_tile_2d(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t) { -} + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_6d_tile_1d(threadpool, compute_6d_tile_1d, + nullptr /* context */, 1, 1, 1, 1, + threads, items, 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) 
* items); +} +BENCHMARK(pthreadpool_parallelize_6d_tile_1d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); + +static void compute_6d_tile_2d(void*, size_t, size_t, size_t, size_t, size_t, + size_t, size_t, size_t) {} static void pthreadpool_parallelize_6d_tile_2d(benchmark::State& state) { - pthreadpool_t threadpool = pthreadpool_create(2); - const size_t threads = pthreadpool_get_threads_count(threadpool); - const size_t items = static_cast(state.range(0)); - while (state.KeepRunning()) { - pthreadpool_parallelize_6d_tile_2d( - threadpool, - compute_6d_tile_2d, - nullptr /* context */, - 1, 1, 1, 1, threads, items, - 1, 1, - 0 /* flags */); - } - pthreadpool_destroy(threadpool); - - /* Do not normalize by thread */ - state.SetItemsProcessed(int64_t(state.iterations()) * items); -} -BENCHMARK(pthreadpool_parallelize_6d_tile_2d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); - + pthreadpool_t threadpool = pthreadpool_create(2); + const size_t threads = pthreadpool_get_threads_count(threadpool); + const size_t items = static_cast(state.range(0)); + while (state.KeepRunning()) { + pthreadpool_parallelize_6d_tile_2d(threadpool, compute_6d_tile_2d, + nullptr /* context */, 1, 1, 1, 1, + threads, items, 1, 1, /*flags=*/0); + } + pthreadpool_destroy(threadpool); + + /* Do not normalize by thread */ + state.SetItemsProcessed(static_cast(state.iterations()) * items); +} +BENCHMARK(pthreadpool_parallelize_6d_tile_2d) + ->UseRealTime() + ->RangeMultiplier(10) + ->Range(10, 1000000); BENCHMARK_MAIN(); diff --git a/cmake/DownloadCpuinfo.cmake b/cmake/DownloadCpuinfo.cmake index e6f2893..9866a75 100644 --- a/cmake/DownloadCpuinfo.cmake +++ b/cmake/DownloadCpuinfo.cmake @@ -1,3 +1,12 @@ +# Copyright (c) 2017 Facebook Inc. +# Copyright (c) 2015-2017 Georgia Institute of Technology +# All rights reserved. 
+# +# Copyright 2019 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + CMAKE_MINIMUM_REQUIRED(VERSION 3.5 FATAL_ERROR) PROJECT(cpuinfo-download NONE) diff --git a/cmake/DownloadFXdiv.cmake b/cmake/DownloadFXdiv.cmake index cbda7d0..889505a 100644 --- a/cmake/DownloadFXdiv.cmake +++ b/cmake/DownloadFXdiv.cmake @@ -1,3 +1,12 @@ +# Copyright (c) 2017 Facebook Inc. +# Copyright (c) 2015-2017 Georgia Institute of Technology +# All rights reserved. +# +# Copyright 2019 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + CMAKE_MINIMUM_REQUIRED(VERSION 3.5 FATAL_ERROR) PROJECT(fxdiv-download NONE) diff --git a/cmake/DownloadGoogleBenchmark.cmake b/cmake/DownloadGoogleBenchmark.cmake index bae6b0e..331d406 100644 --- a/cmake/DownloadGoogleBenchmark.cmake +++ b/cmake/DownloadGoogleBenchmark.cmake @@ -1,3 +1,12 @@ +# Copyright (c) 2017 Facebook Inc. +# Copyright (c) 2015-2017 Georgia Institute of Technology +# All rights reserved. +# +# Copyright 2019 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + CMAKE_MINIMUM_REQUIRED(VERSION 3.5 FATAL_ERROR) PROJECT(googlebenchmark-download NONE) diff --git a/cmake/DownloadGoogleTest.cmake b/cmake/DownloadGoogleTest.cmake index d2d9ce4..9075934 100644 --- a/cmake/DownloadGoogleTest.cmake +++ b/cmake/DownloadGoogleTest.cmake @@ -1,3 +1,12 @@ +# Copyright (c) 2017 Facebook Inc. +# Copyright (c) 2015-2017 Georgia Institute of Technology +# All rights reserved. +# +# Copyright 2019 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ CMAKE_MINIMUM_REQUIRED(VERSION 3.5 FATAL_ERROR) PROJECT(googletest-download NONE) diff --git a/cmake/x64_arm64.toolchain b/cmake/x64_arm64.toolchain new file mode 100644 index 0000000..ff21b6a --- /dev/null +++ b/cmake/x64_arm64.toolchain @@ -0,0 +1,8 @@ +# Copyright 2024 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +SET(CMAKE_SYSTEM_NAME Windows) +SET(CMAKE_SYSTEM_PROCESSOR arm64) +SET(CMAKE_CROSSCOMPILING TRUE) diff --git a/examples/addition.c b/examples/addition.c index de806df..b36844a 100644 --- a/examples/addition.c +++ b/examples/addition.c @@ -1,48 +1,55 @@ -#include -#include +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + #include +#include +#include #include struct array_addition_context { - double *augend; - double *addend; - double *sum; + double *augend; + double *addend; + double *sum; }; -static void add_arrays(struct array_addition_context* context, size_t i) { - context->sum[i] = context->augend[i] + context->addend[i]; +static void add_arrays(struct array_addition_context *context, size_t i) { + context->sum[i] = context->augend[i] + context->addend[i]; } #define ARRAY_SIZE 4 int main() { - double augend[ARRAY_SIZE] = { 1.0, 2.0, 4.0, -5.0 }; - double addend[ARRAY_SIZE] = { 0.25, -1.75, 0.0, 0.5 }; - double sum[ARRAY_SIZE]; - - pthreadpool_t threadpool = pthreadpool_create(0); - assert(threadpool != NULL); - - const size_t threads_count = pthreadpool_get_threads_count(threadpool); - printf("Created thread pool with %zu threads\n", threads_count); - - struct array_addition_context context = { augend, addend, sum }; - pthreadpool_parallelize_1d(threadpool, - (pthreadpool_task_1d_t) add_arrays, - 
(void**) &context, - ARRAY_SIZE, - PTHREADPOOL_FLAG_DISABLE_DENORMALS /* flags */); - - pthreadpool_destroy(threadpool); - threadpool = NULL; - - printf("%8s\t%.2lf\t%.2lf\t%.2lf\t%.2lf\n", "Augend", - augend[0], augend[1], augend[2], augend[3]); - printf("%8s\t%.2lf\t%.2lf\t%.2lf\t%.2lf\n", "Addend", - addend[0], addend[1], addend[2], addend[3]); - printf("%8s\t%.2lf\t%.2lf\t%.2lf\t%.2lf\n", "Sum", - sum[0], sum[1], sum[2], sum[3]); - - return 0; + double augend[ARRAY_SIZE] = {1.0, 2.0, 4.0, -5.0}; + double addend[ARRAY_SIZE] = {0.25, -1.75, 0.0, 0.5}; + double sum[ARRAY_SIZE]; + + pthreadpool_t threadpool = pthreadpool_create(0); + assert(threadpool != NULL); + + const size_t threads_count = pthreadpool_get_threads_count(threadpool); + printf("Created thread pool with %zu threads\n", threads_count); + + struct array_addition_context context = {augend, addend, sum}; + pthreadpool_parallelize_1d(threadpool, (pthreadpool_task_1d_t)add_arrays, + (void **)&context, ARRAY_SIZE, + PTHREADPOOL_FLAG_DISABLE_DENORMALS /* flags */); + + pthreadpool_destroy(threadpool); + threadpool = NULL; + + printf("%8s\t%.2lf\t%.2lf\t%.2lf\t%.2lf\n", "Augend", augend[0], augend[1], + augend[2], augend[3]); + printf("%8s\t%.2lf\t%.2lf\t%.2lf\t%.2lf\n", "Addend", addend[0], addend[1], + addend[2], addend[3]); + printf("%8s\t%.2lf\t%.2lf\t%.2lf\t%.2lf\n", "Sum", sum[0], sum[1], sum[2], + sum[3]); + + return 0; } diff --git a/include/pthreadpool.h b/include/pthreadpool.h index 953ccc4..e84c009 100644 --- a/include/pthreadpool.h +++ b/include/pthreadpool.h @@ -1,5 +1,14 @@ -#ifndef PTHREADPOOL_H_ -#define PTHREADPOOL_H_ +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#ifndef __PTHREADPOOL_INCLUDE_PTHREADPOOL_H_ +#define __PTHREADPOOL_INCLUDE_PTHREADPOOL_H_ #include #include @@ -9,34 +18,78 @@ typedef struct pthreadpool* pthreadpool_t; typedef void (*pthreadpool_task_1d_t)(void*, size_t); typedef void (*pthreadpool_task_1d_with_thread_t)(void*, size_t, size_t); typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t); +typedef void (*pthreadpool_task_1d_tile_1d_dynamic_t)(void*, size_t, size_t); typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t); -typedef void (*pthreadpool_task_2d_with_thread_t)(void*, size_t, size_t, size_t); +typedef void (*pthreadpool_task_2d_with_thread_t)(void*, size_t, size_t, + size_t); typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t); -typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t); +typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, + size_t); +typedef void (*pthreadpool_task_2d_tile_2d_dynamic_t)(void*, size_t, size_t, + size_t, size_t); +typedef void (*pthreadpool_task_2d_tile_1d_dynamic_t)(void*, size_t, size_t, + size_t); typedef void (*pthreadpool_task_3d_t)(void*, size_t, size_t, size_t); -typedef void (*pthreadpool_task_3d_tile_1d_t)(void*, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_task_3d_tile_1d_with_thread_t)(void*, size_t, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t); +typedef void (*pthreadpool_task_3d_tile_1d_t)(void*, size_t, size_t, size_t, + size_t); +typedef void (*pthreadpool_task_3d_tile_1d_with_thread_t)(void*, size_t, size_t, + size_t, size_t, + size_t); +typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t, + size_t, size_t); +typedef void (*pthreadpool_task_3d_tile_2d_dynamic_t)(void*, size_t, size_t, + size_t, size_t, size_t); typedef void (*pthreadpool_task_4d_t)(void*, size_t, size_t, size_t, size_t); -typedef void 
(*pthreadpool_task_4d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_task_4d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_task_5d_t)(void*, size_t, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_task_5d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_task_5d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_task_6d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_task_6d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_task_6d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t); +typedef void (*pthreadpool_task_4d_tile_1d_t)(void*, size_t, size_t, size_t, + size_t, size_t); +typedef void (*pthreadpool_task_4d_tile_2d_t)(void*, size_t, size_t, size_t, + size_t, size_t, size_t); +typedef void (*pthreadpool_task_4d_tile_2d_dynamic_t)(void*, size_t, size_t, + size_t, size_t, size_t, + size_t); +typedef void (*pthreadpool_task_5d_t)(void*, size_t, size_t, size_t, size_t, + size_t); +typedef void (*pthreadpool_task_5d_tile_1d_t)(void*, size_t, size_t, size_t, + size_t, size_t, size_t); +typedef void (*pthreadpool_task_5d_tile_2d_t)(void*, size_t, size_t, size_t, + size_t, size_t, size_t, size_t); +typedef void (*pthreadpool_task_6d_t)(void*, size_t, size_t, size_t, size_t, + size_t, size_t); +typedef void (*pthreadpool_task_6d_tile_1d_t)(void*, size_t, size_t, size_t, + size_t, size_t, size_t, size_t); +typedef void (*pthreadpool_task_6d_tile_2d_t)(void*, size_t, size_t, size_t, + size_t, size_t, size_t, size_t, + size_t); typedef void (*pthreadpool_task_1d_with_id_t)(void*, uint32_t, size_t); -typedef void (*pthreadpool_task_2d_tile_1d_with_id_t)(void*, uint32_t, size_t, size_t, size_t); -typedef void (*pthreadpool_task_2d_tile_2d_with_id_t)(void*, 
uint32_t, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_task_3d_tile_1d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_task_3d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_task_4d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t, size_t); - -typedef void (*pthreadpool_task_2d_tile_1d_with_id_with_thread_t)(void*, uint32_t, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_task_3d_tile_1d_with_id_with_thread_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t); - +typedef void (*pthreadpool_task_2d_tile_1d_with_id_t)(void*, uint32_t, size_t, + size_t, size_t); +typedef void (*pthreadpool_task_2d_tile_2d_with_id_t)(void*, uint32_t, size_t, + size_t, size_t, size_t); +typedef void (*pthreadpool_task_2d_tile_2d_dynamic_with_id_t)(void*, uint32_t, + size_t, size_t, + size_t, size_t); +typedef void (*pthreadpool_task_3d_tile_1d_with_id_t)(void*, uint32_t, size_t, + size_t, size_t, size_t); +typedef void (*pthreadpool_task_3d_tile_2d_with_id_t)(void*, uint32_t, size_t, + size_t, size_t, size_t, + size_t); +typedef void (*pthreadpool_task_3d_tile_2d_dynamic_with_id_t)(void*, uint32_t, + size_t, size_t, + size_t, size_t, + size_t); +typedef void (*pthreadpool_task_4d_tile_2d_with_id_t)(void*, uint32_t, size_t, + size_t, size_t, size_t, + size_t, size_t); +typedef void (*pthreadpool_task_4d_tile_2d_dynamic_with_id_t)(void*, uint32_t, + size_t, size_t, + size_t, size_t, + size_t, size_t); + +typedef void (*pthreadpool_task_2d_tile_1d_with_id_with_thread_t)( + void*, uint32_t, size_t, size_t, size_t, size_t); +typedef void (*pthreadpool_task_3d_tile_1d_with_id_with_thread_t)( + void*, uint32_t, size_t, size_t, size_t, size_t, size_t); /** * Disable support for denormalized numbers to the maximum extent possible for @@ -116,12 +169,9 @@ size_t pthreadpool_get_threads_count(pthreadpool_t threadpool); * @param flags a 
bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_1d( - pthreadpool_t threadpool, - pthreadpool_task_1d_t function, - void* context, - size_t range, - uint32_t flags); +void pthreadpool_parallelize_1d(pthreadpool_t threadpool, + pthreadpool_task_1d_t function, void* context, + size_t range, uint32_t flags); /** * Process items on a 1D grid passing along the current thread id. @@ -147,11 +197,8 @@ void pthreadpool_parallelize_1d( * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_1d_with_thread( - pthreadpool_t threadpool, - pthreadpool_task_1d_with_thread_t function, - void* context, - size_t range, - uint32_t flags); + pthreadpool_t threadpool, pthreadpool_task_1d_with_thread_t function, + void* context, size_t range, uint32_t flags); /** * Process items on a 1D grid using a microarchitecture-aware task function. @@ -191,13 +238,9 @@ void pthreadpool_parallelize_1d_with_thread( * PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_1d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_1d_with_id_t function, - void* context, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range, - uint32_t flags); + pthreadpool_t threadpool, pthreadpool_task_1d_with_id_t function, + void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, + size_t range, uint32_t flags); /** * Process items on a 1D grid with specified maximum tile size. 
@@ -223,13 +266,44 @@ void pthreadpool_parallelize_1d_with_uarch( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_1d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_1d_tile_1d_t function, - void* context, - size_t range, - size_t tile, - uint32_t flags); +void pthreadpool_parallelize_1d_tile_1d(pthreadpool_t threadpool, + pthreadpool_task_1d_tile_1d_t function, + void* context, size_t range, + size_t tile, uint32_t flags); + +/** + * Process items on a 1D grid with specified prefered tile size. + * + * The function repeatedly calls + * + * function(context, i, count) + * + * in parallel where `i` is in the range `[0, range)` and a multiple of the + * provided @a tile and `count` is an integer multiple of @a tile unless `i + * + count == range`. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range the number of items on the 1D grid to process. + * @param tile the preferred multiple number of items on the 1D grid to + * process in each function call. 
+ * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +void pthreadpool_parallelize_1d_tile_1d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_1d_tile_1d_dynamic_t function, + void* context, size_t range, size_t tile, uint32_t flags); /** * Process items on a 2D grid. @@ -257,13 +331,9 @@ void pthreadpool_parallelize_1d_tile_1d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_2d( - pthreadpool_t threadpool, - pthreadpool_task_2d_t function, - void* context, - size_t range_i, - size_t range_j, - uint32_t flags); +void pthreadpool_parallelize_2d(pthreadpool_t threadpool, + pthreadpool_task_2d_t function, void* context, + size_t range_i, size_t range_j, uint32_t flags); /** * Process items on a 2D grid passing along the current thread id. @@ -292,12 +362,8 @@ void pthreadpool_parallelize_2d( * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_2d_with_thread( - pthreadpool_t threadpool, - pthreadpool_task_2d_with_thread_t function, - void* context, - size_t range_i, - size_t range_j, - uint32_t flags); + pthreadpool_t threadpool, pthreadpool_task_2d_with_thread_t function, + void* context, size_t range_i, size_t range_j, uint32_t flags); /** * Process items on a 2D grid with the specified maximum tile size along the @@ -328,14 +394,50 @@ void pthreadpool_parallelize_2d_with_thread( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_2d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_2d_tile_1d_t function, - void* context, - size_t range_i, - size_t range_j, - size_t tile_j, - uint32_t flags); +void pthreadpool_parallelize_2d_tile_1d(pthreadpool_t threadpool, + 
pthreadpool_task_2d_tile_1d_t function, + void* context, size_t range_i, + size_t range_j, size_t tile_j, + uint32_t flags); + +/** + * Process items on a 2D grid with specified prefered tile size along the + * last grid dimension. + * + * The function repeatedly calls + * + * function(context, i, j, count_j) + * + * in parallel where `i` is in the range `[0, range_i)`, `j` is in the range + * `[0, range_j)` and a multiple of the provided @a tile_j, and `count_j` is an + * integer multiple of @a tile_j unless `j + count_j == range_j`. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range_i the number of items on the first dimension of the 2D + * grid to process. + * @param range_j the number of items on the second dimension of the 2D + * grid to process. + * @param tile_j the preferred multiple number of items on the second + * dimension of the 2D grid to process in each function call. 
+ * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +void pthreadpool_parallelize_2d_tile_1d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_1d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t tile_j, + uint32_t flags); /** * Process items on a 2D grid with the specified maximum tile size along the @@ -378,15 +480,9 @@ void pthreadpool_parallelize_2d_tile_1d( * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_2d_tile_1d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_2d_tile_1d_with_id_t function, - void* context, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t tile_j, - uint32_t flags); + pthreadpool_t threadpool, pthreadpool_task_2d_tile_1d_with_id_t function, + void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, + size_t range_i, size_t range_j, size_t tile_j, uint32_t flags); /** * Process items on a 2D grid with the specified maximum tile size along the @@ -400,7 +496,8 @@ void pthreadpool_parallelize_2d_tile_1d_with_uarch( * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index; * for (size_t i = 0; i < range_i; i++) * for (size_t j = 0; j < range_j; j += tile_j) - * function(context, uarch_index, thread_index, i, j, min(range_j - j, tile_j)); + * function(context, uarch_index, thread_index, i, j, min(range_j - j, + * tile_j)); * * When the function returns, all items have been processed and the thread pool * is ready for a new task. 
@@ -430,15 +527,10 @@ void pthreadpool_parallelize_2d_tile_1d_with_uarch( * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - pthreadpool_t threadpool, - pthreadpool_task_2d_tile_1d_with_id_with_thread_t function, - void* context, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t tile_j, - uint32_t flags); + pthreadpool_t threadpool, + pthreadpool_task_2d_tile_1d_with_id_with_thread_t function, void* context, + uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i, + size_t range_j, size_t tile_j, uint32_t flags); /** * Process items on a 2D grid with the specified maximum tile size along each @@ -472,15 +564,116 @@ void pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_2d_tile_2d( - pthreadpool_t threadpool, - pthreadpool_task_2d_tile_2d_t function, - void* context, - size_t range_i, - size_t range_j, - size_t tile_i, - size_t tile_j, - uint32_t flags); +void pthreadpool_parallelize_2d_tile_2d(pthreadpool_t threadpool, + pthreadpool_task_2d_tile_2d_t function, + void* context, size_t range_i, + size_t range_j, size_t tile_i, + size_t tile_j, uint32_t flags); + +/** + * Process items on a 2D grid with specified preferred tile size along each grid + * dimension. + * + * The function repeatedly calls + * + * function(context, i, j, count_i, count_j) + * + * in parallel where `i` is in the range `[0, range_i)` and a multiple of the + * provided @a tile_i, `j` is in the range `[0, range_j)` and a multiple of the + * provided @a tile_j, and `count_i` and `count_j` are integer multiples of @a + * tile_i and @a tile_j, unless `i + count_i == range_i` or `j + count_j == + * range_j`, respectively. 
+ * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling + * thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range_i the number of items on the first dimension of the 2D + * grid to process. + * @param range_j the number of items on the second dimension of the 2D + * grid to process. + * @param tile_i the preferred multiple number of items on the first + * dimension of the 2D grid to process in each function call. + * @param tile_j the preferred multiple number of items on the second + * dimension of the 2D grid to process in each function call. + * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or + * PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +void pthreadpool_parallelize_2d_tile_2d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t tile_i, size_t tile_j, + uint32_t flags); + +/** + * Process items on a 2D grid with specified prefered tile size along each grid + * dimension using a microarchitecture-aware task function. 
+ * + * The function repeatedly calls + * + * function(context, uarch_index, i, j, count_i, count_j) + * + * in parallel where `i` is in the range `[0, range_i)` and a multiple of the + * provided @a tile_i, `j` is in the range `[0, range_j)` and a multiple of the + * provided @a tile_j, and `count_i` and `count_j` are integer multiples of @a + * tile_i and @a tile_j, unless `i + count_i == range_i` or `j + count_j == + * range_j`, respectively. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If + * threadpool is NULL, all items are processed + * serially on the calling thread. + * @param function the function to call for each interval of the + * given range. + * @param context the first argument passed to the specified + * function. + * @param default_uarch_index the microarchitecture index to use when + * pthreadpool is configured without cpuinfo, + * cpuinfo initialization failed, or index returned + * by cpuinfo_get_current_uarch_index() exceeds + * the max_uarch_index value. + * @param max_uarch_index the maximum microarchitecture index expected + * by the specified function. If the index returned + * by cpuinfo_get_current_uarch_index() exceeds this + * value, default_uarch_index will be used instead. + * default_uarch_index can exceed max_uarch_index. + * @param range_i the number of items on the first dimension of the + * 2D grid to process. + * @param range_j the number of items on the second dimension of + * the 2D grid to process. + * @param tile_i the preferred multiple number of items on the + * first dimension of the 2D grid to process in each + * function call. 
+ * @param tile_j the preferred multiple number of items on the + * second dimension of the 2D grid to process in + * each function call. + * @param flags a bitwise combination of zero or more optional + * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or + * PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +void pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + pthreadpool_t threadpool, + pthreadpool_task_2d_tile_2d_dynamic_with_id_t function, void* context, + uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i, + size_t range_j, size_t tile_i, size_t tile_j, uint32_t flags); /** * Process items on a 2D grid with the specified maximum tile size along each @@ -531,16 +724,10 @@ void pthreadpool_parallelize_2d_tile_2d( * PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_2d_tile_2d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_2d_tile_2d_with_id_t function, - void* context, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t tile_i, - size_t tile_j, - uint32_t flags); + pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_with_id_t function, + void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, + size_t range_i, size_t range_j, size_t tile_i, size_t tile_j, + uint32_t flags); /** * Process items on a 3D grid. 
@@ -571,14 +758,10 @@ void pthreadpool_parallelize_2d_tile_2d_with_uarch( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_3d( - pthreadpool_t threadpool, - pthreadpool_task_3d_t function, - void* context, - size_t range_i, - size_t range_j, - size_t range_k, - uint32_t flags); +void pthreadpool_parallelize_3d(pthreadpool_t threadpool, + pthreadpool_task_3d_t function, void* context, + size_t range_i, size_t range_j, size_t range_k, + uint32_t flags); /** * Process items on a 3D grid with the specified maximum tile size along the @@ -612,15 +795,11 @@ void pthreadpool_parallelize_3d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_3d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_1d_t function, - void* context, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_k, - uint32_t flags); +void pthreadpool_parallelize_3d_tile_1d(pthreadpool_t threadpool, + pthreadpool_task_3d_tile_1d_t function, + void* context, size_t range_i, + size_t range_j, size_t range_k, + size_t tile_k, uint32_t flags); /** * Process items on a 3D grid with the specified maximum tile size along the @@ -655,14 +834,10 @@ void pthreadpool_parallelize_3d_tile_1d( * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_3d_tile_1d_with_thread( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_1d_with_thread_t function, - void* context, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_k, - uint32_t flags); + pthreadpool_t threadpool, + pthreadpool_task_3d_tile_1d_with_thread_t function, void* context, + size_t range_i, size_t range_j, size_t range_k, size_t tile_k, + uint32_t flags); /** * Process items on a 3D grid with the specified maximum tile 
size along the @@ -711,16 +886,10 @@ void pthreadpool_parallelize_3d_tile_1d_with_thread( * PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_3d_tile_1d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_1d_with_id_t function, - void* context, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_k, - uint32_t flags); + pthreadpool_t threadpool, pthreadpool_task_3d_tile_1d_with_id_t function, + void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, + size_t range_i, size_t range_j, size_t range_k, size_t tile_k, + uint32_t flags); /** * Process items on a 3D grid with the specified maximum tile size along the @@ -735,7 +904,8 @@ void pthreadpool_parallelize_3d_tile_1d_with_uarch( * for (size_t i = 0; i < range_i; i++) * for (size_t j = 0; j < range_j; j++) * for (size_t k = 0; k < range_k; k += tile_k) - * function(context, uarch_index, thread_index, i, j, k, min(range_k - k, tile_k)); + * function(context, uarch_index, thread_index, i, j, k, min(range_k - + * k, tile_k)); * * When the function returns, all items have been processed and the thread pool * is ready for a new task. 
@@ -770,16 +940,10 @@ void pthreadpool_parallelize_3d_tile_1d_with_uarch( * PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_1d_with_id_with_thread_t function, - void* context, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_k, - uint32_t flags); + pthreadpool_t threadpool, + pthreadpool_task_3d_tile_1d_with_id_with_thread_t function, void* context, + uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i, + size_t range_j, size_t range_k, size_t tile_k, uint32_t flags); /** * Process items on a 3D grid with the specified maximum tile size along the @@ -816,16 +980,128 @@ void pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_3d_tile_2d( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_2d_t function, - void* context, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_j, - size_t tile_k, - uint32_t flags); +void pthreadpool_parallelize_3d_tile_2d(pthreadpool_t threadpool, + pthreadpool_task_3d_tile_2d_t function, + void* context, size_t range_i, + size_t range_j, size_t range_k, + size_t tile_j, size_t tile_k, + uint32_t flags); + +/** + * Process items on a 3D grid with specified prefered tile size along the last + * two grid dimensions. 
+ * + * The function repeatedly calls + * + * function(context, i, j, k, count_j, count_k) + * + * in parallel where: + * - `i` is in the range `[0, range_i)`, + * - `j` is in the range `[0, range_j)` and a multiple of the provided @a + * tile_j, + * - `k` is in the range `[0, range_k)` and a multiple of the provided @a + * tile_k, + * - `count_j` and `count_k` are integer multiples of @a tile_j and @a tile_k, + * unless `j + count_j == range_j` or `k + count_k == range_k`, respectively. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling + * thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range_i the number of items on the first dimension of the 3D + * grid to process. + * @param range_j the number of items on the second dimension of the 3D + * grid to process. + * @param range_k the number of items on the third dimension of the 3D + * grid to process. + * @param tile_j the preferred multiple number of items on the second + * dimension of the 3D grid to process in each function call. + * @param tile_k the preferred multiple number of items on the third + * dimension of the 3D grid to process in each function call. 
+ * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or + * PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +void pthreadpool_parallelize_3d_tile_2d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_3d_tile_2d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t tile_j, size_t tile_k, uint32_t flags); + +/** + * Process items on a 3D grid with specified preferred tile size along the last + * two grid dimensions using a microarchitecture-aware task function. + * + * The function repeatedly calls + * + * function(context, uarch_index, i, j, k, count_j, count_k) + * + * in parallel where: + * - `i` is in the range `[0, range_i)`, + * - `j` is in the range `[0, range_j)` and a multiple of the provided @a + * tile_j, + * - `k` is in the range `[0, range_k)` and a multiple of the provided @a + * tile_k, + * - `count_j` and `count_k` are integer multiples of @a tile_j and @a tile_k, + * unless `j + count_j == range_j` or `k + count_k == range_k`, respectively. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If + * threadpool is NULL, all items are processed + * serially on the calling thread. + * @param function the function to call for each interval of the + * given range. + * @param context the first argument passed to the specified + * function. + * @param default_uarch_index the microarchitecture index to use when + * pthreadpool is configured without cpuinfo, + * cpuinfo initialization failed, or index returned + * by cpuinfo_get_current_uarch_index() exceeds + * the max_uarch_index value. 
+ * @param max_uarch_index the maximum microarchitecture index expected + * by the specified function. If the index returned + * by cpuinfo_get_current_uarch_index() exceeds this + * value, default_uarch_index will be used instead. + * default_uarch_index can exceed max_uarch_index. + * @param range_i the number of items on the first dimension of the + * 3D grid to process. + * @param range_j the number of items on the second dimension of + * the 3D grid to process. + * @param range_k the number of items on the third dimension of the + * 3D grid to process. + * @param tile_j the preferred multiple number of items on the + * second dimension of the 3D grid to process in + * each function call. + * @param tile_k the preferred multiple number of items on the + * third dimension of the 3D grid to process in each + * function call. + * @param flags a bitwise combination of zero or more optional + * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or + * PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +void pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + pthreadpool_t threadpool, + pthreadpool_task_3d_tile_2d_dynamic_with_id_t function, void* context, + uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i, + size_t range_j, size_t range_k, size_t tile_j, size_t tile_k, + uint32_t flags); /** * Process items on a 3D grid with the specified maximum tile size along the @@ -877,17 +1153,10 @@ void pthreadpool_parallelize_3d_tile_2d( * PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_3d_tile_2d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_2d_with_id_t function, - void* context, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_j, - size_t tile_k, - uint32_t flags); + pthreadpool_t threadpool, pthreadpool_task_3d_tile_2d_with_id_t function, + void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, + size_t range_i, size_t range_j, size_t 
range_k, size_t tile_j, + size_t tile_k, uint32_t flags); /** * Process items on a 4D grid. @@ -921,15 +1190,10 @@ void pthreadpool_parallelize_3d_tile_2d_with_uarch( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_4d( - pthreadpool_t threadpool, - pthreadpool_task_4d_t function, - void* context, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - uint32_t flags); +void pthreadpool_parallelize_4d(pthreadpool_t threadpool, + pthreadpool_task_4d_t function, void* context, + size_t range_i, size_t range_j, size_t range_k, + size_t range_l, uint32_t flags); /** * Process items on a 4D grid with the specified maximum tile size along the @@ -966,16 +1230,12 @@ void pthreadpool_parallelize_4d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_4d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_4d_tile_1d_t function, - void* context, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t tile_l, - uint32_t flags); +void pthreadpool_parallelize_4d_tile_1d(pthreadpool_t threadpool, + pthreadpool_task_4d_tile_1d_t function, + void* context, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, size_t tile_l, + uint32_t flags); /** * Process items on a 4D grid with the specified maximum tile size along the @@ -1015,17 +1275,12 @@ void pthreadpool_parallelize_4d_tile_1d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_4d_tile_2d( - pthreadpool_t threadpool, - pthreadpool_task_4d_tile_2d_t function, - void* context, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t tile_k, - size_t tile_l, - uint32_t flags); 
+void pthreadpool_parallelize_4d_tile_2d(pthreadpool_t threadpool, + pthreadpool_task_4d_tile_2d_t function, + void* context, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, size_t tile_k, + size_t tile_l, uint32_t flags); /** * Process items on a 4D grid with the specified maximum tile size along the @@ -1080,18 +1335,132 @@ void pthreadpool_parallelize_4d_tile_2d( * PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_4d_tile_2d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_4d_tile_2d_with_id_t function, - void* context, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t tile_k, - size_t tile_l, - uint32_t flags); + pthreadpool_t threadpool, pthreadpool_task_4d_tile_2d_with_id_t function, + void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, + size_t range_i, size_t range_j, size_t range_k, size_t range_l, + size_t tile_k, size_t tile_l, uint32_t flags); + +/** + * Process items on a 4D grid with specified prefered tile size along the last + * two grid dimensions. + * + * The function repeatedly calls + * + * function(context, i, j, k, l, count_k, count_l) + * + * in parallel where: + * - `i` is in the range `[0, range_i)`, + * - `j` is in the range `[0, range_j)`, + * - `k` is in the range `[0, range_k)` and a multiple of the provided @a + * tile_k, + * - `l` is in the range `[0, range_l)` and a multiple of the provided @a + * tile_l, + * - `count_k` and `count_l` are integer multiples of @a tile_k and @a tile_l, + * unless `k + count_k == range_k` or `l + count_l == range_l`, respectivly. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. 
+ * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling + * thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range_i the number of items on the first dimension of the 4D + * grid to process. + * @param range_j the number of items on the second dimension of the 4D + * grid to process. + * @param range_k the number of items on the third dimension of the 4D + * grid to process. + * @param range_l the number of items on the fourth dimension of the 4D + * grid to process. + * @param tile_k the preferred multiple number of items on the third + * dimension of the 4D grid to process in each function call. + * @param tile_l the preferred multiple number of items on the fourth + * dimension of the 4D grid to process in each function call. + * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or + * PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +void pthreadpool_parallelize_4d_tile_2d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_4d_tile_2d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t range_l, size_t tile_k, size_t tile_l, uint32_t flags); + +/** + * Process items on a 4D grid with specified preferred tile size along the last + * two grid dimensions using a microarchitecture-aware task function.
+ * + * The function repeatedly calls + * + * function(context, uarch_index, i, j, k, l, count_k, count_l) + * + * in parallel where: + * - `i` is in the range `[0, range_i)`, + * - `j` is in the range `[0, range_j)`, + * - `k` is in the range `[0, range_k)` and a multiple of the provided @a + * tile_k, + * - `l` is in the range `[0, range_l)` and a multiple of the provided @a + * tile_l, + * - `count_k` and `count_l` are integer multiples of @a tile_k and @a tile_l, + * unless `k + count_k == range_k` or `l + count_l == range_l`, respectivly. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If + * threadpool is NULL, all items are processed + * serially on the calling thread. + * @param function the function to call for each interval of the + * given range. + * @param context the first argument passed to the specified + * function. + * @param default_uarch_index the microarchitecture index to use when + * pthreadpool is configured without cpuinfo, + * cpuinfo initialization failed, or index returned + * by cpuinfo_get_current_uarch_index() exceeds + * the max_uarch_index value. + * @param max_uarch_index the maximum microarchitecture index expected + * by the specified function. If the index returned + * by cpuinfo_get_current_uarch_index() exceeds this + * value, default_uarch_index will be used instead. + * default_uarch_index can exceed max_uarch_index. + * @param range_i the number of items on the first dimension of the + * 4D grid to process. + * @param range_j the number of items on the second dimension of + * the 4D grid to process. 
+ * @param range_k the number of items on the third dimension of the + * 4D grid to process. + * @param range_l the number of items on the fourth dimension of + * the 4D grid to process. + * @param tile_k the preferred multiple number of items on the + * third dimension of the 4D grid to process in each + * function call. + * @param tile_l the preferred multiple number of items on the + * fourth dimension of the 4D grid to process in + * each function call. + * @param flags a bitwise combination of zero or more optional + * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or + * PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +void pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + pthreadpool_t threadpool, + pthreadpool_task_4d_tile_2d_dynamic_with_id_t function, void* context, + uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i, + size_t range_j, size_t range_k, size_t range_l, size_t tile_k, + size_t tile_l, uint32_t flags); /** * Process items on a 5D grid. @@ -1128,16 +1497,10 @@ void pthreadpool_parallelize_4d_tile_2d_with_uarch( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_5d( - pthreadpool_t threadpool, - pthreadpool_task_5d_t function, - void* context, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - uint32_t flags); +void pthreadpool_parallelize_5d(pthreadpool_t threadpool, + pthreadpool_task_5d_t function, void* context, + size_t range_i, size_t range_j, size_t range_k, + size_t range_l, size_t range_m, uint32_t flags); /** * Process items on a 5D grid with the specified maximum tile size along the @@ -1177,17 +1540,12 @@ void pthreadpool_parallelize_5d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_5d_tile_1d( - pthreadpool_t threadpool, - 
pthreadpool_task_5d_tile_1d_t function, - void* context, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t tile_m, - uint32_t flags); +void pthreadpool_parallelize_5d_tile_1d(pthreadpool_t threadpool, + pthreadpool_task_5d_tile_1d_t function, + void* context, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, size_t range_m, + size_t tile_m, uint32_t flags); /** * Process items on a 5D grid with the specified maximum tile size along the @@ -1230,18 +1588,13 @@ void pthreadpool_parallelize_5d_tile_1d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_5d_tile_2d( - pthreadpool_t threadpool, - pthreadpool_task_5d_tile_2d_t function, - void* context, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t tile_l, - size_t tile_m, - uint32_t flags); +void pthreadpool_parallelize_5d_tile_2d(pthreadpool_t threadpool, + pthreadpool_task_5d_tile_2d_t function, + void* context, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, size_t range_m, + size_t tile_l, size_t tile_m, + uint32_t flags); /** * Process items on a 6D grid. 
@@ -1283,17 +1636,11 @@ void pthreadpool_parallelize_5d_tile_2d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_6d( - pthreadpool_t threadpool, - pthreadpool_task_6d_t function, - void* context, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t range_n, - uint32_t flags); +void pthreadpool_parallelize_6d(pthreadpool_t threadpool, + pthreadpool_task_6d_t function, void* context, + size_t range_i, size_t range_j, size_t range_k, + size_t range_l, size_t range_m, size_t range_n, + uint32_t flags); /** * Process items on a 6D grid with the specified maximum tile size along the @@ -1336,18 +1683,13 @@ void pthreadpool_parallelize_6d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_6d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_6d_tile_1d_t function, - void* context, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t range_n, - size_t tile_n, - uint32_t flags); +void pthreadpool_parallelize_6d_tile_1d(pthreadpool_t threadpool, + pthreadpool_task_6d_tile_1d_t function, + void* context, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, size_t range_m, + size_t range_n, size_t tile_n, + uint32_t flags); /** * Process items on a 6D grid with the specified maximum tile size along the @@ -1393,19 +1735,13 @@ void pthreadpool_parallelize_6d_tile_1d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -void pthreadpool_parallelize_6d_tile_2d( - pthreadpool_t threadpool, - pthreadpool_task_6d_tile_2d_t function, - void* context, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, 
- size_t range_n, - size_t tile_m, - size_t tile_n, - uint32_t flags); +void pthreadpool_parallelize_6d_tile_2d(pthreadpool_t threadpool, + pthreadpool_task_6d_tile_2d_t function, + void* context, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, size_t range_m, + size_t range_n, size_t tile_m, + size_t tile_n, uint32_t flags); /** * Terminates threads in the thread pool and releases associated resources. @@ -1421,70 +1757,56 @@ void pthreadpool_destroy(pthreadpool_t threadpool); /* Legacy API for compatibility with pre-existing users (e.g. NNPACK) */ #if defined(__GNUC__) - #define PTHREADPOOL_DEPRECATED __attribute__((__deprecated__)) +#define PTHREADPOOL_DEPRECATED __attribute__((__deprecated__)) #else - #define PTHREADPOOL_DEPRECATED +#define PTHREADPOOL_DEPRECATED #endif typedef void (*pthreadpool_function_1d_t)(void*, size_t); typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t); typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t); -typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_function_3d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_function_4d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t); - -void pthreadpool_compute_1d( - pthreadpool_t threadpool, - pthreadpool_function_1d_t function, - void* argument, - size_t range) PTHREADPOOL_DEPRECATED; - -void pthreadpool_compute_1d_tiled( - pthreadpool_t threadpool, - pthreadpool_function_1d_tiled_t function, - void* argument, - size_t range, - size_t tile) PTHREADPOOL_DEPRECATED; - -void pthreadpool_compute_2d( - pthreadpool_t threadpool, - pthreadpool_function_2d_t function, - void* argument, - size_t range_i, - size_t range_j) PTHREADPOOL_DEPRECATED; - -void pthreadpool_compute_2d_tiled( - pthreadpool_t threadpool, - pthreadpool_function_2d_tiled_t function, - void* argument, - size_t range_i, - size_t range_j, - 
size_t tile_i, - size_t tile_j) PTHREADPOOL_DEPRECATED; - -void pthreadpool_compute_3d_tiled( - pthreadpool_t threadpool, - pthreadpool_function_3d_tiled_t function, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_i, - size_t tile_j, - size_t tile_k) PTHREADPOOL_DEPRECATED; - -void pthreadpool_compute_4d_tiled( - pthreadpool_t threadpool, - pthreadpool_function_4d_tiled_t function, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t tile_i, - size_t tile_j, - size_t tile_k, - size_t tile_l) PTHREADPOOL_DEPRECATED; +typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, + size_t); +typedef void (*pthreadpool_function_3d_tiled_t)(void*, size_t, size_t, size_t, + size_t, size_t, size_t); +typedef void (*pthreadpool_function_4d_tiled_t)(void*, size_t, size_t, size_t, + size_t, size_t, size_t, size_t, + size_t); + +void pthreadpool_compute_1d(pthreadpool_t threadpool, + pthreadpool_function_1d_t function, void* argument, + size_t range) PTHREADPOOL_DEPRECATED; + +void pthreadpool_compute_1d_tiled(pthreadpool_t threadpool, + pthreadpool_function_1d_tiled_t function, + void* argument, size_t range, + size_t tile) PTHREADPOOL_DEPRECATED; + +void pthreadpool_compute_2d(pthreadpool_t threadpool, + pthreadpool_function_2d_t function, void* argument, + size_t range_i, + size_t range_j) PTHREADPOOL_DEPRECATED; + +void pthreadpool_compute_2d_tiled(pthreadpool_t threadpool, + pthreadpool_function_2d_tiled_t function, + void* argument, size_t range_i, + size_t range_j, size_t tile_i, + size_t tile_j) PTHREADPOOL_DEPRECATED; + +void pthreadpool_compute_3d_tiled(pthreadpool_t threadpool, + pthreadpool_function_3d_tiled_t function, + void* argument, size_t range_i, + size_t range_j, size_t range_k, size_t tile_i, + size_t tile_j, + size_t tile_k) PTHREADPOOL_DEPRECATED; + +void pthreadpool_compute_4d_tiled(pthreadpool_t threadpool, + pthreadpool_function_4d_tiled_t function, + 
void* argument, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, size_t tile_i, size_t tile_j, + size_t tile_k, + size_t tile_l) PTHREADPOOL_DEPRECATED; #endif /* PTHREADPOOL_NO_DEPRECATED_API */ @@ -1496,125 +1818,144 @@ void pthreadpool_compute_4d_tiled( namespace libpthreadpool { namespace detail { -namespace { +namespace { // NOLINT: Naming this namespace would expose it. -template +template void call_wrapper_1d(void* arg, size_t i) { - (*static_cast(arg))(i); + (*static_cast(arg))(i); } -template +template void call_wrapper_1d_tile_1d(void* arg, size_t range_i, size_t tile_i) { - (*static_cast(arg))(range_i, tile_i); + (*static_cast(arg))(range_i, tile_i); } -template +template +void call_wrapper_1d_tile_1d_dynamic(void* arg, size_t range_i, size_t tile_i) { + (*static_cast(arg))(range_i, tile_i); +} + +template void call_wrapper_2d(void* functor, size_t i, size_t j) { - (*static_cast(functor))(i, j); + (*static_cast(functor))(i, j); +} + +template +void call_wrapper_2d_tile_1d(void* functor, size_t i, size_t range_j, + size_t tile_j) { + (*static_cast(functor))(i, range_j, tile_j); } -template -void call_wrapper_2d_tile_1d(void* functor, - size_t i, size_t range_j, size_t tile_j) -{ - (*static_cast(functor))(i, range_j, tile_j); +template +void call_wrapper_2d_tile_1d_dynamic(void* functor, size_t i, size_t range_j, + size_t tile_j) { + (*static_cast(functor))(i, range_j, tile_j); } -template -void call_wrapper_2d_tile_2d(void* functor, - size_t range_i, size_t range_j, - size_t tile_i, size_t tile_j) -{ - (*static_cast(functor))(range_i, range_j, tile_i, tile_j); +template +void call_wrapper_2d_tile_2d(void* functor, size_t range_i, size_t range_j, + size_t tile_i, size_t tile_j) { + (*static_cast(functor))(range_i, range_j, tile_i, tile_j); } -template +template +void call_wrapper_2d_tile_2d_dynamic(void* functor, size_t range_i, + size_t range_j, size_t tile_i, + size_t tile_j) { + (*static_cast(functor))(range_i, range_j, tile_i, 
tile_j); +} + +template void call_wrapper_3d(void* functor, size_t i, size_t j, size_t k) { - (*static_cast(functor))(i, j, k); + (*static_cast(functor))(i, j, k); } -template -void call_wrapper_3d_tile_1d(void* functor, - size_t i, size_t j, size_t range_k, - size_t tile_k) -{ - (*static_cast(functor))(i, j, range_k, tile_k); +template +void call_wrapper_3d_tile_1d(void* functor, size_t i, size_t j, size_t range_k, + size_t tile_k) { + (*static_cast(functor))(i, j, range_k, tile_k); } -template -void call_wrapper_3d_tile_2d(void* functor, - size_t i, size_t range_j, size_t range_k, - size_t tile_j, size_t tile_k) -{ - (*static_cast(functor))(i, range_j, range_k, tile_j, tile_k); +template +void call_wrapper_3d_tile_2d(void* functor, size_t i, size_t range_j, + size_t range_k, size_t tile_j, size_t tile_k) { + (*static_cast(functor))(i, range_j, range_k, tile_j, tile_k); } -template +template +void call_wrapper_3d_tile_2d_dynamic(void* functor, size_t i, size_t range_j, + size_t range_k, size_t tile_j, + size_t tile_k) { + (*static_cast(functor))(i, range_j, range_k, tile_j, tile_k); +} + +template void call_wrapper_4d(void* functor, size_t i, size_t j, size_t k, size_t l) { - (*static_cast(functor))(i, j, k, l); + (*static_cast(functor))(i, j, k, l); } -template -void call_wrapper_4d_tile_1d(void* functor, - size_t i, size_t j, size_t k, size_t range_l, - size_t tile_l) -{ - (*static_cast(functor))(i, j, k, range_l, tile_l); +template +void call_wrapper_4d_tile_1d(void* functor, size_t i, size_t j, size_t k, + size_t range_l, size_t tile_l) { + (*static_cast(functor))(i, j, k, range_l, tile_l); } -template -void call_wrapper_4d_tile_2d(void* functor, - size_t i, size_t j, size_t range_k, size_t range_l, - size_t tile_k, size_t tile_l) -{ - (*static_cast(functor))(i, j, range_k, range_l, tile_k, tile_l); +template +void call_wrapper_4d_tile_2d(void* functor, size_t i, size_t j, size_t range_k, + size_t range_l, size_t tile_k, size_t tile_l) { + 
(*static_cast(functor))(i, j, range_k, range_l, tile_k, tile_l); } -template -void call_wrapper_5d(void* functor, size_t i, size_t j, size_t k, size_t l, size_t m) { - (*static_cast(functor))(i, j, k, l, m); +template +void call_wrapper_4d_tile_2d_dynamic(void* functor, size_t i, size_t j, + size_t range_k, size_t range_l, + size_t tile_k, size_t tile_l) { + (*static_cast(functor))(i, j, range_k, range_l, tile_k, tile_l); } -template -void call_wrapper_5d_tile_1d(void* functor, - size_t i, size_t j, size_t k, size_t l, size_t range_m, - size_t tile_m) -{ - (*static_cast(functor))(i, j, k, l, range_m, tile_m); +template +void call_wrapper_5d(void* functor, size_t i, size_t j, size_t k, size_t l, + size_t m) { + (*static_cast(functor))(i, j, k, l, m); } -template -void call_wrapper_5d_tile_2d(void* functor, - size_t i, size_t j, size_t k, size_t range_l, size_t range_m, - size_t tile_l, size_t tile_m) -{ - (*static_cast(functor))(i, j, k, range_l, range_m, tile_l, tile_m); +template +void call_wrapper_5d_tile_1d(void* functor, size_t i, size_t j, size_t k, + size_t l, size_t range_m, size_t tile_m) { + (*static_cast(functor))(i, j, k, l, range_m, tile_m); } -template -void call_wrapper_6d(void* functor, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) { - (*static_cast(functor))(i, j, k, l, m, n); +template +void call_wrapper_5d_tile_2d(void* functor, size_t i, size_t j, size_t k, + size_t range_l, size_t range_m, size_t tile_l, + size_t tile_m) { + (*static_cast(functor))(i, j, k, range_l, range_m, tile_l, tile_m); } -template -void call_wrapper_6d_tile_1d(void* functor, - size_t i, size_t j, size_t k, size_t l, size_t m, size_t range_n, - size_t tile_n) -{ - (*static_cast(functor))(i, j, k, l, m, range_n, tile_n); +template +void call_wrapper_6d(void* functor, size_t i, size_t j, size_t k, size_t l, + size_t m, size_t n) { + (*static_cast(functor))(i, j, k, l, m, n); } -template -void call_wrapper_6d_tile_2d(void* functor, - size_t i, size_t j, size_t k, 
size_t l, size_t range_m, size_t range_n, - size_t tile_m, size_t tile_n) -{ - (*static_cast(functor))(i, j, k, l, range_m, range_n, tile_m, tile_n); +template +void call_wrapper_6d_tile_1d(void* functor, size_t i, size_t j, size_t k, + size_t l, size_t m, size_t range_n, + size_t tile_n) { + (*static_cast(functor))(i, j, k, l, m, range_n, tile_n); } -} /* namespace */ -} /* namespace detail */ -} /* namespace libpthreadpool */ +template +void call_wrapper_6d_tile_2d(void* functor, size_t i, size_t j, size_t k, + size_t l, size_t range_m, size_t range_n, + size_t tile_m, size_t tile_n) { + (*static_cast(functor))(i, j, k, l, range_m, range_n, tile_m, + tile_n); +} + +} /* namespace */ +} /* namespace detail */ +} /* namespace libpthreadpool */ /** * Process items on a 1D grid. @@ -1638,19 +1979,13 @@ void call_wrapper_6d_tile_2d(void* functor, * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_1d( - pthreadpool_t threadpool, - const T& functor, - size_t range, - uint32_t flags = 0) -{ - pthreadpool_parallelize_1d( - threadpool, - &libpthreadpool::detail::call_wrapper_1d, - const_cast(static_cast(&functor)), - range, - flags); +template +inline void pthreadpool_parallelize_1d(pthreadpool_t threadpool, + const T& functor, size_t range, + uint32_t flags = 0) { + pthreadpool_parallelize_1d( + threadpool, &libpthreadpool::detail::call_wrapper_1d, + const_cast(static_cast(&functor)), range, flags); } /** @@ -1676,21 +2011,58 @@ inline void pthreadpool_parallelize_1d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_1d_tile_1d( - pthreadpool_t threadpool, - const T& functor, - size_t range, - size_t tile, - uint32_t flags = 0) -{ - pthreadpool_parallelize_1d_tile_1d( - threadpool, - 
&libpthreadpool::detail::call_wrapper_1d_tile_1d, - const_cast(static_cast(&functor)), - range, - tile, - flags); +template +inline void pthreadpool_parallelize_1d_tile_1d(pthreadpool_t threadpool, + const T& functor, size_t range, + size_t tile, + uint32_t flags = 0) { + pthreadpool_parallelize_1d_tile_1d( + threadpool, &libpthreadpool::detail::call_wrapper_1d_tile_1d, + const_cast(static_cast(&functor)), range, tile, + flags); +} + +/** + * Process items on a 1D grid with specified prefered tile size. + * + * The function repeatedly calls + * + * function(context, i, count) + * + * in parallel where `i` is in the range `[0, range)` and a multiple of the + * provided @a tile and `count` is an integer multiple of @a tile unless `i + * + count == range`. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range the number of items on the 1D grid to process. + * @param tile the preferred multiple number of items on the 1D grid to + * process in each function call. 
+ * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +template +inline void pthreadpool_parallelize_1d_tile_1d_dynamic(pthreadpool_t threadpool, + const T& functor, + size_t range, + size_t tile, + uint32_t flags = 0) { + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool, + &libpthreadpool::detail::call_wrapper_1d_tile_1d_dynamic, + const_cast(static_cast(&functor)), range, tile, + flags); } /** @@ -1718,21 +2090,14 @@ inline void pthreadpool_parallelize_1d_tile_1d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_2d( - pthreadpool_t threadpool, - const T& functor, - size_t range_i, - size_t range_j, - uint32_t flags = 0) -{ - pthreadpool_parallelize_2d( - threadpool, - &libpthreadpool::detail::call_wrapper_2d, - const_cast(static_cast(&functor)), - range_i, - range_j, - flags); +template +inline void pthreadpool_parallelize_2d(pthreadpool_t threadpool, + const T& functor, size_t range_i, + size_t range_j, uint32_t flags = 0) { + pthreadpool_parallelize_2d( + threadpool, &libpthreadpool::detail::call_wrapper_2d, + const_cast(static_cast(&functor)), range_i, range_j, + flags); } /** @@ -1763,23 +2128,60 @@ inline void pthreadpool_parallelize_2d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_2d_tile_1d( - pthreadpool_t threadpool, - const T& functor, - size_t range_i, - size_t range_j, - size_t tile_j, - uint32_t flags = 0) -{ - pthreadpool_parallelize_2d_tile_1d( - threadpool, - &libpthreadpool::detail::call_wrapper_2d_tile_1d, - const_cast(static_cast(&functor)), - range_i, - range_j, - tile_j, - flags); +template +inline void pthreadpool_parallelize_2d_tile_1d(pthreadpool_t 
threadpool, + const T& functor, size_t range_i, + size_t range_j, size_t tile_j, + uint32_t flags = 0) { + pthreadpool_parallelize_2d_tile_1d( + threadpool, &libpthreadpool::detail::call_wrapper_2d_tile_1d, + const_cast(static_cast(&functor)), range_i, range_j, + tile_j, flags); +} + +/** + * Process items on a 2D grid with specified prefered tile size along the + * last grid dimension. + * + * The function repeatedly calls + * + * function(context, i, j, count_j) + * + * in parallel where `i` is in the range `[0, range_i)`, `j` is in the range + * `[0, range_j)` and a multiple of the provided @a tile_j, and `count_j` is an + * integer multiple of @a tile_j unless `j + count_j == range_j`. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range_i the number of items on the first dimension of the 2D + * grid to process. + * @param range_j the number of items on the second dimension of the 2D + * grid to process. + * @param tile_j the preferred multiple number of items on the second + * dimension of the 2D grid to process in each function call. 
+ * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +template +inline void pthreadpool_parallelize_2d_tile_1d_dynamic( + pthreadpool_t threadpool, const T& functor, size_t range_i, size_t range_j, + size_t tile_j, uint32_t flags = 0) { + pthreadpool_parallelize_2d_tile_1d_dynamic( + threadpool, + &libpthreadpool::detail::call_wrapper_2d_tile_1d_dynamic, + const_cast(static_cast(&functor)), range_i, range_j, + tile_j, flags); } /** @@ -1813,25 +2215,65 @@ inline void pthreadpool_parallelize_2d_tile_1d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_2d_tile_2d( - pthreadpool_t threadpool, - const T& functor, - size_t range_i, - size_t range_j, - size_t tile_i, - size_t tile_j, - uint32_t flags = 0) -{ - pthreadpool_parallelize_2d_tile_2d( - threadpool, - &libpthreadpool::detail::call_wrapper_2d_tile_2d, - const_cast(static_cast(&functor)), - range_i, - range_j, - tile_i, - tile_j, - flags); +template +inline void pthreadpool_parallelize_2d_tile_2d(pthreadpool_t threadpool, + const T& functor, size_t range_i, + size_t range_j, size_t tile_i, + size_t tile_j, + uint32_t flags = 0) { + pthreadpool_parallelize_2d_tile_2d( + threadpool, &libpthreadpool::detail::call_wrapper_2d_tile_2d, + const_cast(static_cast(&functor)), range_i, range_j, + tile_i, tile_j, flags); +} + +/** + * Process items on a 2D grid with specified prefered tile size along each grid + * dimension. 
+ * + * The function repeatedly calls + * + * function(context, i, j, count_i, count_j) + * + * in parallel where `i` is in the range `[0, range_i)` and a multiple of the + * provided @a tile_i, `j` is in the range `[0, range_j)` and a multiple of the + * provided @a tile_j, and `count_i` and `count_j` are integer multiples of @a + * tile_i and @a tile_j, unless `i + count_i == range_i` or `j + count_j == + * range_j`, respectively. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range_i the number of items on the first dimension of the 2D + * grid to process. + * @param range_j the number of items on the second dimension of the 2D + * grid to process. + * @param tile_i the preferred multiple number of items on the first + * dimension of the 2D grid to process in each function call. + * @param tile_j the preferred multiple number of items on the second + * dimension of the 2D grid to process in each function call.
+ * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +template +inline void pthreadpool_parallelize_2d_tile_2d_dynamic( + pthreadpool_t threadpool, const T& functor, size_t range_i, size_t range_j, + size_t tile_i, size_t tile_j, uint32_t flags = 0) { + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool, + &libpthreadpool::detail::call_wrapper_2d_tile_2d_dynamic, + const_cast(static_cast(&functor)), range_i, range_j, + tile_i, tile_j, flags); } /** @@ -1862,23 +2304,15 @@ inline void pthreadpool_parallelize_2d_tile_2d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_3d( - pthreadpool_t threadpool, - const T& functor, - size_t range_i, - size_t range_j, - size_t range_k, - uint32_t flags = 0) -{ - pthreadpool_parallelize_3d( - threadpool, - &libpthreadpool::detail::call_wrapper_3d, - const_cast(static_cast(&functor)), - range_i, - range_j, - range_k, - flags); +template +inline void pthreadpool_parallelize_3d(pthreadpool_t threadpool, + const T& functor, size_t range_i, + size_t range_j, size_t range_k, + uint32_t flags = 0) { + pthreadpool_parallelize_3d( + threadpool, &libpthreadpool::detail::call_wrapper_3d, + const_cast(static_cast(&functor)), range_i, range_j, + range_k, flags); } /** @@ -1912,25 +2346,16 @@ inline void pthreadpool_parallelize_3d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_3d_tile_1d( - pthreadpool_t threadpool, - const T& functor, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_k, - uint32_t flags = 0) -{ - pthreadpool_parallelize_3d_tile_1d( - threadpool, - &libpthreadpool::detail::call_wrapper_3d_tile_1d, - 
const_cast(static_cast(&functor)), - range_i, - range_j, - range_k, - tile_k, - flags); +template +inline void pthreadpool_parallelize_3d_tile_1d(pthreadpool_t threadpool, + const T& functor, size_t range_i, + size_t range_j, size_t range_k, + size_t tile_k, + uint32_t flags = 0) { + pthreadpool_parallelize_3d_tile_1d( + threadpool, &libpthreadpool::detail::call_wrapper_3d_tile_1d, + const_cast(static_cast(&functor)), range_i, range_j, + range_k, tile_k, flags); } /** @@ -1967,27 +2392,70 @@ inline void pthreadpool_parallelize_3d_tile_1d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_3d_tile_2d( - pthreadpool_t threadpool, - const T& functor, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_j, - size_t tile_k, - uint32_t flags = 0) -{ - pthreadpool_parallelize_3d_tile_2d( - threadpool, - &libpthreadpool::detail::call_wrapper_3d_tile_2d, - const_cast(static_cast(&functor)), - range_i, - range_j, - range_k, - tile_j, - tile_k, - flags); +template +inline void pthreadpool_parallelize_3d_tile_2d(pthreadpool_t threadpool, + const T& functor, size_t range_i, + size_t range_j, size_t range_k, + size_t tile_j, size_t tile_k, + uint32_t flags = 0) { + pthreadpool_parallelize_3d_tile_2d( + threadpool, &libpthreadpool::detail::call_wrapper_3d_tile_2d, + const_cast(static_cast(&functor)), range_i, range_j, + range_k, tile_j, tile_k, flags); +} + +/** + * Process items on a 3D grid with specified prefered tile size along the last + * two grid dimensions. 
+ * + * The function repeatedly calls + * + * function(context, i, j, k, count_j, count_k) + * + * in parallel where: + * - `i` is in the range `[0, range_i)`, + * - `j` is in the range `[0, range_j)` and a multiple of the provided @a + * tile_j, + * - `k` is in the range `[0, range_k)` and a multiple of the provided @a + * tile_k, + * - `count_j` and `count_k` are integer multiples of @a tile_j and @a tile_k, + * unless `j + count_j == range_j` or `k + count_k == range_k`, respectively. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range_i the number of items on the first dimension of the 3D + * grid to process. + * @param range_j the number of items on the second dimension of the 3D + * grid to process. + * @param range_k the number of items on the third dimension of the 3D + * grid to process. + * @param tile_j the preferred multiple number of items on the second + * dimension of the 3D grid to process in each function call. + * @param tile_k the preferred multiple number of items on the third + * dimension of the 3D grid to process in each function call. 
+ * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +template +inline void pthreadpool_parallelize_3d_tile_2d_dynamic( + pthreadpool_t threadpool, const T& functor, size_t range_i, size_t range_j, + size_t range_k, size_t tile_j, size_t tile_k, uint32_t flags = 0) { + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool, + &libpthreadpool::detail::call_wrapper_3d_tile_2d_dynamic, + const_cast(static_cast(&functor)), range_i, range_j, + range_k, tile_j, tile_k, flags); } /** @@ -2021,25 +2489,15 @@ inline void pthreadpool_parallelize_3d_tile_2d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_4d( - pthreadpool_t threadpool, - const T& functor, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - uint32_t flags = 0) -{ - pthreadpool_parallelize_4d( - threadpool, - &libpthreadpool::detail::call_wrapper_4d, - const_cast(static_cast(&functor)), - range_i, - range_j, - range_k, - range_l, - flags); +template +inline void pthreadpool_parallelize_4d(pthreadpool_t threadpool, + const T& functor, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, uint32_t flags = 0) { + pthreadpool_parallelize_4d( + threadpool, &libpthreadpool::detail::call_wrapper_4d, + const_cast(static_cast(&functor)), range_i, range_j, + range_k, range_l, flags); } /** @@ -2076,27 +2534,16 @@ inline void pthreadpool_parallelize_4d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_4d_tile_1d( - pthreadpool_t threadpool, - const T& functor, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t tile_l, - uint32_t flags = 0) -{ - pthreadpool_parallelize_4d_tile_1d( 
- threadpool, - &libpthreadpool::detail::call_wrapper_4d_tile_1d, - const_cast(static_cast(&functor)), - range_i, - range_j, - range_k, - range_l, - tile_l, - flags); +template +inline void pthreadpool_parallelize_4d_tile_1d(pthreadpool_t threadpool, + const T& functor, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, size_t tile_l, + uint32_t flags = 0) { + pthreadpool_parallelize_4d_tile_1d( + threadpool, &libpthreadpool::detail::call_wrapper_4d_tile_1d, + const_cast(static_cast(&functor)), range_i, range_j, + range_k, range_l, tile_l, flags); } /** @@ -2136,29 +2583,77 @@ inline void pthreadpool_parallelize_4d_tile_1d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_4d_tile_2d( - pthreadpool_t threadpool, - const T& functor, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t tile_k, - size_t tile_l, - uint32_t flags = 0) -{ - pthreadpool_parallelize_4d_tile_2d( - threadpool, - &libpthreadpool::detail::call_wrapper_4d_tile_2d, - const_cast(static_cast(&functor)), - range_i, - range_j, - range_k, - range_l, - tile_k, - tile_l, - flags); +template +inline void pthreadpool_parallelize_4d_tile_2d(pthreadpool_t threadpool, + const T& functor, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, size_t tile_k, + size_t tile_l, + uint32_t flags = 0) { + pthreadpool_parallelize_4d_tile_2d( + threadpool, &libpthreadpool::detail::call_wrapper_4d_tile_2d, + const_cast(static_cast(&functor)), range_i, range_j, + range_k, range_l, tile_k, tile_l, flags); +} + +/** + * Process items on a 4D grid with specified prefered tile size along the last + * two grid dimensions. 
+ * + * The function repeatedly calls + * + * function(context, i, j, k, l, count_k, count_l) + * + * in parallel where: + * - `i` is in the range `[0, range_i)`, + * - `j` is in the range `[0, range_j)`, + * - `k` is in the range `[0, range_k)` and a multiple of the provided @a + * tile_k, + * - `l` is in the range `[0, range_l)` and a multiple of the provided @a + * tile_l, + * - `count_k` and `count_l` are integer multiples of @a tile_k and @a tile_l, + * unless `k + count_k == range_k` or `l + count_l == range_l`, respectively. + * + * The `count`s are chosen such as to minimize the number of calls to @a + * function while keeping the computation load balanced across all threads. + * + * When the call returns, all items have been processed and the thread pool is + * ready for a new task. + * + * @note If multiple threads call this function with the same thread pool, + * the calls are serialized. + * + * @param threadpool the thread pool to use for parallelisation. If threadpool + * is NULL, all items are processed serially on the calling + * thread. + * @param function the function to call for each interval of the given range. + * @param context the first argument passed to the specified function. + * @param range_i the number of items on the first dimension of the 4D + * grid to process. + * @param range_j the number of items on the second dimension of the 4D + * grid to process. + * @param range_k the number of items on the third dimension of the 4D + * grid to process. + * @param range_l the number of items on the fourth dimension of the 4D + * grid to process. + * @param tile_k the preferred multiple number of items on the third + * dimension of the 4D grid to process in each function call. + * @param tile_l the preferred multiple number of items on the fourth + * dimension of the 4D grid to process in each function call. 
+ * @param flags a bitwise combination of zero or more optional flags + * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or + * PTHREADPOOL_FLAG_YIELD_WORKERS) + */ +template +inline void pthreadpool_parallelize_4d_tile_2d_dynamic( + pthreadpool_t threadpool, const T& functor, size_t range_i, size_t range_j, + size_t range_k, size_t range_l, size_t tile_k, size_t tile_l, + uint32_t flags = 0) { + pthreadpool_parallelize_4d_tile_2d_dynamic( + threadpool, + &libpthreadpool::detail::call_wrapper_4d_tile_2d_dynamic, + const_cast(static_cast(&functor)), range_i, range_j, + range_k, range_l, tile_k, tile_l, flags); } /** @@ -2195,27 +2690,16 @@ inline void pthreadpool_parallelize_4d_tile_2d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_5d( - pthreadpool_t threadpool, - const T& functor, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - uint32_t flags = 0) -{ - pthreadpool_parallelize_5d( - threadpool, - &libpthreadpool::detail::call_wrapper_5d, - const_cast(static_cast(&functor)), - range_i, - range_j, - range_k, - range_l, - range_m, - flags); +template +inline void pthreadpool_parallelize_5d(pthreadpool_t threadpool, + const T& functor, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, size_t range_m, + uint32_t flags = 0) { + pthreadpool_parallelize_5d( + threadpool, &libpthreadpool::detail::call_wrapper_5d, + const_cast(static_cast(&functor)), range_i, range_j, + range_k, range_l, range_m, flags); } /** @@ -2255,29 +2739,17 @@ inline void pthreadpool_parallelize_5d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_5d_tile_1d( - pthreadpool_t threadpool, - const T& functor, - size_t range_i, - size_t range_j, - size_t range_k, - size_t 
range_l, - size_t range_m, - size_t tile_m, - uint32_t flags = 0) -{ - pthreadpool_parallelize_5d_tile_1d( - threadpool, - &libpthreadpool::detail::call_wrapper_5d_tile_1d, - const_cast(static_cast(&functor)), - range_i, - range_j, - range_k, - range_l, - range_m, - tile_m, - flags); +template +inline void pthreadpool_parallelize_5d_tile_1d(pthreadpool_t threadpool, + const T& functor, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, size_t range_m, + size_t tile_m, + uint32_t flags = 0) { + pthreadpool_parallelize_5d_tile_1d( + threadpool, &libpthreadpool::detail::call_wrapper_5d_tile_1d, + const_cast(static_cast(&functor)), range_i, range_j, + range_k, range_l, range_m, tile_m, flags); } /** @@ -2320,31 +2792,17 @@ inline void pthreadpool_parallelize_5d_tile_1d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_5d_tile_2d( - pthreadpool_t threadpool, - const T& functor, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t tile_l, - size_t tile_m, - uint32_t flags = 0) -{ - pthreadpool_parallelize_5d_tile_2d( - threadpool, - &libpthreadpool::detail::call_wrapper_5d_tile_2d, - const_cast(static_cast(&functor)), - range_i, - range_j, - range_k, - range_l, - range_m, - tile_l, - tile_m, - flags); +template +inline void pthreadpool_parallelize_5d_tile_2d(pthreadpool_t threadpool, + const T& functor, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, size_t range_m, + size_t tile_l, size_t tile_m, + uint32_t flags = 0) { + pthreadpool_parallelize_5d_tile_2d( + threadpool, &libpthreadpool::detail::call_wrapper_5d_tile_2d, + const_cast(static_cast(&functor)), range_i, range_j, + range_k, range_l, range_m, tile_l, tile_m, flags); } /** @@ -2386,29 +2844,16 @@ inline void pthreadpool_parallelize_5d_tile_2d( * @param flags a bitwise combination of zero or 
more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_6d( - pthreadpool_t threadpool, - const T& functor, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t range_n, - uint32_t flags = 0) -{ - pthreadpool_parallelize_6d( - threadpool, - &libpthreadpool::detail::call_wrapper_6d, - const_cast(static_cast(&functor)), - range_i, - range_j, - range_k, - range_l, - range_m, - range_n, - flags); +template +inline void pthreadpool_parallelize_6d(pthreadpool_t threadpool, + const T& functor, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, size_t range_m, + size_t range_n, uint32_t flags = 0) { + pthreadpool_parallelize_6d( + threadpool, &libpthreadpool::detail::call_wrapper_6d, + const_cast(static_cast(&functor)), range_i, range_j, + range_k, range_l, range_m, range_n, flags); } /** @@ -2451,31 +2896,17 @@ inline void pthreadpool_parallelize_6d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template -inline void pthreadpool_parallelize_6d_tile_1d( - pthreadpool_t threadpool, - const T& functor, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t range_n, - size_t tile_n, - uint32_t flags = 0) -{ - pthreadpool_parallelize_6d_tile_1d( - threadpool, - &libpthreadpool::detail::call_wrapper_6d_tile_1d, - const_cast(static_cast(&functor)), - range_i, - range_j, - range_k, - range_l, - range_m, - range_n, - tile_n, - flags); +template +inline void pthreadpool_parallelize_6d_tile_1d(pthreadpool_t threadpool, + const T& functor, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, size_t range_m, + size_t range_n, size_t tile_n, + uint32_t flags = 0) { + pthreadpool_parallelize_6d_tile_1d( + threadpool, &libpthreadpool::detail::call_wrapper_6d_tile_1d, + 
const_cast(static_cast(&functor)), range_i, range_j, + range_k, range_l, range_m, range_n, tile_n, flags); } /** @@ -2521,35 +2952,17 @@ inline void pthreadpool_parallelize_6d_tile_1d( * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ -template +template inline void pthreadpool_parallelize_6d_tile_2d( - pthreadpool_t threadpool, - const T& functor, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t range_n, - size_t tile_m, - size_t tile_n, - uint32_t flags = 0) -{ - pthreadpool_parallelize_6d_tile_2d( - threadpool, - &libpthreadpool::detail::call_wrapper_6d_tile_2d, - const_cast(static_cast(&functor)), - range_i, - range_j, - range_k, - range_l, - range_m, - range_n, - tile_m, - tile_n, - flags); + pthreadpool_t threadpool, const T& functor, size_t range_i, size_t range_j, + size_t range_k, size_t range_l, size_t range_m, size_t range_n, + size_t tile_m, size_t tile_n, uint32_t flags = 0) { + pthreadpool_parallelize_6d_tile_2d( + threadpool, &libpthreadpool::detail::call_wrapper_6d_tile_2d, + const_cast(static_cast(&functor)), range_i, range_j, + range_k, range_l, range_m, range_n, tile_m, tile_n, flags); } -#endif /* __cplusplus */ +#endif /* __cplusplus */ -#endif /* PTHREADPOOL_H_ */ +#endif /* __PTHREADPOOL_INCLUDE_PTHREADPOOL_H_ */ diff --git a/scripts/build-android-arm64.sh b/scripts/build-android-arm64.sh new file mode 100755 index 0000000..81fbb6a --- /dev/null +++ b/scripts/build-android-arm64.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# Copyright 2019 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -e + +if [ -z "$ANDROID_NDK" ] +then + echo "ANDROID_NDK not set; please set it to the Android NDK directory" + exit 1 +fi + +if [ ! 
-d "$ANDROID_NDK" ] +then + echo "ANDROID_NDK not a directory; did you install it under ${ANDROID_NDK}?" + exit 1 +fi + +mkdir -p build + +CMAKE_ARGS=() + +# CMake-level configuration +CMAKE_ARGS+=("-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake") +CMAKE_ARGS+=("-DCMAKE_BUILD_TYPE=Release") +CMAKE_ARGS+=("-DCMAKE_POSITION_INDEPENDENT_CODE=ON") + +# If Ninja is installed, prefer it to Make +if [ -x "$(command -v ninja)" ] +then + CMAKE_ARGS+=("-GNinja") +fi + +CMAKE_ARGS+=("-DPTHREADPOOL_LIBRARY_TYPE=static") + +CMAKE_ARGS+=("-DPTHREADPOOL_BUILD_BENCHMARKS=ON") +CMAKE_ARGS+=("-DPTHREADPOOL_BUILD_TESTS=ON") + +# Cross-compilation options for Google Benchmark +CMAKE_ARGS+=("-DHAVE_POSIX_REGEX=0") +CMAKE_ARGS+=("-DHAVE_STEADY_CLOCK=0") +CMAKE_ARGS+=("-DHAVE_STD_REGEX=0") + +# Android-specific options +CMAKE_ARGS+=("-DANDROID_NDK=$ANDROID_NDK") +CMAKE_ARGS+=("-DANDROID_ABI=arm64-v8a") +CMAKE_ARGS+=("-DANDROID_PLATFORM=android-21") +CMAKE_ARGS+=("-DANDROID_PIE=ON") +CMAKE_ARGS+=("-DANDROID_STL=c++_static") +CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=exceptions") + +# Use-specified CMake arguments go last to allow overridding defaults +CMAKE_ARGS+=($@) + +cd build && cmake .. \ + "${CMAKE_ARGS[@]}" + +# Cross-platform parallel build +if [ "$(uname)" == "Darwin" ] +then + cmake --build . -- "-j$((2*$(sysctl -n hw.ncpu)))" +else + cmake --build . -- "-j$((2*$(nproc)))" +fi diff --git a/scripts/build-android-armv7.sh b/scripts/build-android-armv7.sh new file mode 100755 index 0000000..347f2a8 --- /dev/null +++ b/scripts/build-android-armv7.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +# +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# Copyright 2019 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -e + +if [ -z "$ANDROID_NDK" ] +then + echo "ANDROID_NDK not set; please set it to the Android NDK directory" + exit 1 +fi + +if [ ! -d "$ANDROID_NDK" ] +then + echo "ANDROID_NDK not a directory; did you install it under ${ANDROID_NDK}?" + exit 1 +fi + +mkdir -p build + +CMAKE_ARGS=() + +# CMake-level configuration +CMAKE_ARGS+=("-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake") +CMAKE_ARGS+=("-DCMAKE_BUILD_TYPE=Release") +CMAKE_ARGS+=("-DCMAKE_POSITION_INDEPENDENT_CODE=ON") + +# If Ninja is installed, prefer it to Make +if [ -x "$(command -v ninja)" ] +then + CMAKE_ARGS+=("-GNinja") +fi + +CMAKE_ARGS+=("-DPTHREADPOOL_LIBRARY_TYPE=static") + +CMAKE_ARGS+=("-DPTHREADPOOL_BUILD_BENCHMARKS=ON") +CMAKE_ARGS+=("-DPTHREADPOOL_BUILD_TESTS=ON") + +# Cross-compilation options for Google Benchmark +CMAKE_ARGS+=("-DHAVE_POSIX_REGEX=0") +CMAKE_ARGS+=("-DHAVE_STEADY_CLOCK=0") +CMAKE_ARGS+=("-DHAVE_STD_REGEX=0") + +# Android-specific options +CMAKE_ARGS+=("-DANDROID_NDK=$ANDROID_NDK") +CMAKE_ARGS+=("-DANDROID_ABI=armeabi-v7a") +CMAKE_ARGS+=("-DANDROID_PLATFORM=android-14") +CMAKE_ARGS+=("-DANDROID_PIE=ON") +CMAKE_ARGS+=("-DANDROID_STL=c++_static") +CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=exceptions") + +# BF16 instructions cause ICE in Android NDK compiler +CMAKE_ARGS+=("-DPTHREADPOOL_ENABLE_ARM_BF16=OFF") + +# Use-specified CMake arguments go last to allow overridding defaults +CMAKE_ARGS+=($@) + +cd build && cmake .. \ + "${CMAKE_ARGS[@]}" + +# Cross-platform parallel build +if [ "$(uname)" == "Darwin" ] +then + cmake --build . -- "-j$((2*$(sysctl -n hw.ncpu)))" +else + cmake --build . -- "-j$((2*$(nproc)))" +fi diff --git a/scripts/build-android-x86.sh b/scripts/build-android-x86.sh new file mode 100755 index 0000000..0483472 --- /dev/null +++ b/scripts/build-android-x86.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. 
+# +# Copyright 2019 Google LLC +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -e + +if [ -z "$ANDROID_NDK" ] +then + echo "ANDROID_NDK not set; please set it to the Android NDK directory" + exit 1 +fi + +if [ ! -d "$ANDROID_NDK" ] +then + echo "ANDROID_NDK not a directory; did you install it under ${ANDROID_NDK}?" + exit 1 +fi + +mkdir -p build + +CMAKE_ARGS=() + +# CMake-level configuration +CMAKE_ARGS+=("-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake") +CMAKE_ARGS+=("-DCMAKE_BUILD_TYPE=Release") +CMAKE_ARGS+=("-DCMAKE_POSITION_INDEPENDENT_CODE=ON") + +# If Ninja is installed, prefer it to Make +if [ -x "$(command -v ninja)" ] +then + CMAKE_ARGS+=("-GNinja") +fi + +CMAKE_ARGS+=("-DPTHREADPOOL_LIBRARY_TYPE=static") + +CMAKE_ARGS+=("-DPTHREADPOOL_BUILD_BENCHMARKS=ON") +CMAKE_ARGS+=("-DPTHREADPOOL_BUILD_TESTS=ON") + +# Cross-compilation options for Google Benchmark +CMAKE_ARGS+=("-DHAVE_POSIX_REGEX=0") +CMAKE_ARGS+=("-DHAVE_STEADY_CLOCK=0") +CMAKE_ARGS+=("-DHAVE_STD_REGEX=0") + +# Android-specific options +CMAKE_ARGS+=("-DANDROID_NDK=$ANDROID_NDK") +CMAKE_ARGS+=("-DANDROID_ABI=x86") +CMAKE_ARGS+=("-DANDROID_PLATFORM=android-14") +CMAKE_ARGS+=("-DANDROID_PIE=ON") +CMAKE_ARGS+=("-DANDROID_STL=c++_static") +CMAKE_ARGS+=("-DANDROID_CPP_FEATURES=exceptions") + +# Use-specified CMake arguments go last to allow overridding defaults +CMAKE_ARGS+=($@) + +cd build && cmake .. \ + "${CMAKE_ARGS[@]}" + +# Cross-platform parallel build +if [ "$(uname)" == "Darwin" ] +then + cmake --build . -- "-j$((2*$(sysctl -n hw.ncpu)))" +else + cmake --build . -- "-j$((2*$(nproc)))" +fi diff --git a/scripts/build-windows-arm64.cmd b/scripts/build-windows-arm64.cmd new file mode 100755 index 0000000..363727c --- /dev/null +++ b/scripts/build-windows-arm64.cmd @@ -0,0 +1,21 @@ +mkdir build + +rem Set up the Visual Studio environment for arm64 builds. 
+echo VCVARSALL: %VCVARSALL% +call "%VCVARSALL%" x64_arm64 + +rem Set up the CMake arguments. +set CMAKE_ARGS=-DPTHREADPOOL_LIBRARY_TYPE=static -G="Ninja" -DCMAKE_BUILD_TYPE=Release +set CMAKE_ARGS=%CMAKE_ARGS% -DCMAKE_TOOLCHAIN_FILE=%cd%\cmake\x64_arm64.toolchain + +rem Use-specified CMake arguments go last to allow overridding defaults. +set CMAKE_ARGS=%CMAKE_ARGS% %* + +echo CMAKE_ARGS: %CMAKE_ARGS% + +rem Configure the build. +cd build +cmake .. %CMAKE_ARGS% + +rem Run the build. +cmake --build . --config Release -- -j %NUMBER_OF_PROCESSORS% diff --git a/scripts/build-windows-x64.cmd b/scripts/build-windows-x64.cmd new file mode 100755 index 0000000..7889f20 --- /dev/null +++ b/scripts/build-windows-x64.cmd @@ -0,0 +1,21 @@ +mkdir build + +rem Set up the Visual Studio environment for x64 builds. +echo VCVARSALL: %VCVARSALL% +call "%VCVARSALL%" x64 + +rem Set up the CMake arguments. +set CMAKE_ARGS=-DPTHREADPOOL_LIBRARY_TYPE=static -G="Ninja" -DCMAKE_BUILD_TYPE=Release +rem set CMAKE_ARGS=%CMAKE_ARGS% -DCMAKE_VERBOSE_MAKEFILE=ON + +rem Use-specified CMake arguments go last to allow overridding defaults. +set CMAKE_ARGS=%CMAKE_ARGS% %* + +echo CMAKE_ARGS: %CMAKE_ARGS% + +rem Configure the build. +cd build +cmake .. %CMAKE_ARGS% + +rem Run the build. +cmake --build . --config Release -- -j %NUMBER_OF_PROCESSORS% diff --git a/scripts/build-windows-x86.cmd b/scripts/build-windows-x86.cmd new file mode 100755 index 0000000..75a886c --- /dev/null +++ b/scripts/build-windows-x86.cmd @@ -0,0 +1,21 @@ +mkdir build + +rem Set up the Visual Studio environment for x86 builds. +echo VCVARSALL: %VCVARSALL% +call "%VCVARSALL%" x86 + +rem Set up the CMake arguments. +set CMAKE_ARGS=-DPTHREADPOOL_LIBRARY_TYPE=static -G="Ninja" -DCMAKE_BUILD_TYPE=Release +rem set CMAKE_ARGS=%CMAKE_ARGS% -DCMAKE_VERBOSE_MAKEFILE=ON + +rem Use-specified CMake arguments go last to allow overridding defaults. +set CMAKE_ARGS=%CMAKE_ARGS% %* + +echo CMAKE_ARGS: %CMAKE_ARGS% + +rem Configure the build. 
+cd build +cmake .. %CMAKE_ARGS% + +rem Run the build. +cmake --build . --config Release -- -j %NUMBER_OF_PROCESSORS% diff --git a/src/fastpath.c b/src/fastpath.c index 64485e7..7c4196c 100644 --- a/src/fastpath.c +++ b/src/fastpath.c @@ -1,3 +1,12 @@ +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + /* Standard C headers */ #include #include @@ -6,7 +15,7 @@ #include #if PTHREADPOOL_USE_CPUINFO - #include +#include #endif /* Dependencies */ @@ -21,1698 +30,2009 @@ #include "threadpool-object.h" #include "threadpool-utils.h" - PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_1d_t task = (pthreadpool_task_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, range_start++); - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t index = 
pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - task(argument, index); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_1d_t task = + (pthreadpool_task_1d_t)pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, range_start++); + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + task(argument, index); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_with_thread_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_1d_with_thread_t task = (pthreadpool_task_1d_with_thread_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = 
threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t thread_number = thread->thread_number; - size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, thread_number, range_start++); - } - - /* There still may be other threads with work */ - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - task(argument, thread_number, index); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_1d_with_thread_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_1d_with_thread_t task = + (pthreadpool_task_1d_with_thread_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t thread_number = thread->thread_number; + size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, thread_number, range_start++); + } + + /* There still may be other threads with work */ + for (size_t tid = modulo_decrement(thread_number, threads_count); + 
tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + task(argument, thread_number, index); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_with_uarch_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_1d_with_id_t task = (pthreadpool_task_1d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const uint32_t default_uarch_index = threadpool->params.parallelize_1d_with_uarch.default_uarch_index; - uint32_t uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > threadpool->params.parallelize_1d_with_uarch.max_uarch_index) { - uarch_index = default_uarch_index; - } - #endif - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, uarch_index, range_start++); - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while 
(pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - task(argument, uarch_index, index); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_1d_with_id_t task = + (pthreadpool_task_1d_with_id_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const uint32_t default_uarch_index = + threadpool->params.parallelize_1d_with_uarch.default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > + threadpool->params.parallelize_1d_with_uarch.max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, uarch_index, range_start++); + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + task(argument, uarch_index, 
index); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_tile_1d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_1d_tile_1d_t task = (pthreadpool_task_1d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const size_t tile = threadpool->params.parallelize_1d_tile_1d.tile; - size_t tile_start = range_start * tile; - - const size_t range = threadpool->params.parallelize_1d_tile_1d.range; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, tile_start, min(range - tile_start, tile)); - tile_start += tile; - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t tile_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const size_t tile_start = tile_index * tile; - task(argument, tile_start, min(range - tile_start, tile)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const 
pthreadpool_task_1d_tile_1d_t task = + (pthreadpool_task_1d_tile_1d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const size_t tile = threadpool->params.parallelize_1d_tile_1d.tile; + size_t tile_start = range_start * tile; + + const size_t range = threadpool->params.parallelize_1d_tile_1d.range; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, tile_start, min(range - tile_start, tile)); + tile_start += tile; + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t tile_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const size_t tile_start = tile_index * tile; + task(argument, tile_start, min(range - tile_start, tile)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_2d_t task = (pthreadpool_task_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = 
threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_2d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(range_start, range_j); - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, i, j); - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(linear_index, range_j); - task(argument, index_i_j.quotient, index_i_j.remainder); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_2d_t task = + (pthreadpool_task_2d_t)pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct 
fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_2d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(range_start, range_j); + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, i, j); + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(linear_index, range_j); + task(argument, index_i_j.quotient, index_i_j.remainder); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_with_thread_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_2d_with_thread_t task = (pthreadpool_task_2d_with_thread_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_2d.range_j; - const struct fxdiv_result_size_t index_i_j = 
fxdiv_divide_size_t(range_start, range_j); - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - - const size_t thread_number = thread->thread_number; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, thread_number, i, j); - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - - /* There still may be other threads with work */ - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(linear_index, range_j); - task(argument, thread_number, index_i_j.quotient, index_i_j.remainder); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_2d_with_thread_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_2d_with_thread_t task = + (pthreadpool_task_2d_with_thread_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_2d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(range_start, range_j); + size_t i = index_i_j.quotient; + size_t j = 
index_i_j.remainder; + + const size_t thread_number = thread->thread_number; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, thread_number, i, j); + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + + /* There still may be other threads with work */ + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(linear_index, range_j); + task(argument, thread_number, index_i_j.quotient, index_i_j.remainder); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_1d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_2d_tile_1d_t task = (pthreadpool_task_2d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_1d.tile_range_j; - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j); - const size_t tile_j = threadpool->params.parallelize_2d_tile_1d.tile_j; - size_t i = tile_index_i_j.quotient; - size_t start_j = 
tile_index_i_j.remainder * tile_j; - - const size_t range_j = threadpool->params.parallelize_2d_tile_1d.range_j; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, i, start_j, min(range_j - start_j, tile_j)); - start_j += tile_j; - if (start_j >= range_j) { - start_j = 0; - i += 1; - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j); - const size_t start_j = tile_index_i_j.remainder * tile_j; - task(argument, tile_index_i_j.quotient, start_j, min(range_j - start_j, tile_j)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_2d_tile_1d_t task = + (pthreadpool_task_2d_tile_1d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_j = + threadpool->params.parallelize_2d_tile_1d.tile_range_j; + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(range_start, 
tile_range_j); + const size_t tile_j = threadpool->params.parallelize_2d_tile_1d.tile_j; + size_t i = tile_index_i_j.quotient; + size_t start_j = tile_index_i_j.remainder * tile_j; + + const size_t range_j = threadpool->params.parallelize_2d_tile_1d.range_j; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, i, start_j, min(range_j - start_j, tile_j)); + start_j += tile_j; + if (start_j >= range_j) { + start_j = 0; + i += 1; + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(linear_index, tile_range_j); + const size_t start_j = tile_index_i_j.remainder * tile_j; + task(argument, tile_index_i_j.quotient, start_j, + min(range_j - start_j, tile_j)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_1d_with_uarch_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_2d_tile_1d_with_id_t task = (pthreadpool_task_2d_tile_1d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const uint32_t default_uarch_index = threadpool->params.parallelize_2d_tile_1d_with_uarch.default_uarch_index; - uint32_t uarch_index = default_uarch_index; - #if 
PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > threadpool->params.parallelize_2d_tile_1d_with_uarch.max_uarch_index) { - uarch_index = default_uarch_index; - } - #endif - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_range_j; - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j); - const size_t tile_j = threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_j; - size_t i = tile_index_i_j.quotient; - size_t start_j = tile_index_i_j.remainder * tile_j; - - const size_t range_j = threadpool->params.parallelize_2d_tile_1d_with_uarch.range_j; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, uarch_index, i, start_j, min(range_j - start_j, tile_j)); - start_j += tile_j; - if (start_j >= range_j) { - start_j = 0; - i += 1; - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j); - const size_t start_j = tile_index_i_j.remainder * tile_j; - task(argument, uarch_index, tile_index_i_j.quotient, start_j, min(range_j - start_j, 
tile_j)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_2d_tile_1d_with_uarch_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_2d_tile_1d_with_id_t task = + (pthreadpool_task_2d_tile_1d_with_id_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const uint32_t default_uarch_index = + threadpool->params.parallelize_2d_tile_1d_with_uarch.default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > + threadpool->params.parallelize_2d_tile_1d_with_uarch.max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_j = + threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_range_j; + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(range_start, tile_range_j); + const size_t tile_j = + threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_j; + size_t i = tile_index_i_j.quotient; + size_t start_j = tile_index_i_j.remainder * tile_j; + + const size_t range_j = + threadpool->params.parallelize_2d_tile_1d_with_uarch.range_j; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, uarch_index, i, start_j, min(range_j - start_j, tile_j)); + start_j += tile_j; + if (start_j >= range_j) { + start_j = 0; + i += 1; + } + } + + /* There still may be 
other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(linear_index, tile_range_j); + const size_t start_j = tile_index_i_j.remainder * tile_j; + task(argument, uarch_index, tile_index_i_j.quotient, start_j, + min(range_j - start_j, tile_j)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_1d_with_uarch_with_thread_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_2d_tile_1d_with_id_with_thread_t task = - (pthreadpool_task_2d_tile_1d_with_id_with_thread_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const uint32_t default_uarch_index = threadpool->params.parallelize_2d_tile_1d_with_uarch.default_uarch_index; - uint32_t uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > threadpool->params.parallelize_2d_tile_1d_with_uarch.max_uarch_index) { - uarch_index = default_uarch_index; - } - #endif - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - 
const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_range_j; - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j); - const size_t tile_j = threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_j; - size_t i = tile_index_i_j.quotient; - size_t start_j = tile_index_i_j.remainder * tile_j; - - const size_t range_j = threadpool->params.parallelize_2d_tile_1d_with_uarch.range_j; - const size_t thread_number = thread->thread_number; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, uarch_index, thread_number, i, start_j, min(range_j - start_j, tile_j)); - start_j += tile_j; - if (start_j >= range_j) { - start_j = 0; - i += 1; - } - } - - /* There still may be other threads with work */ - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j); - const size_t start_j = tile_index_i_j.remainder * tile_j; - task(argument, uarch_index, thread_number, tile_index_i_j.quotient, start_j, min(range_j - start_j, tile_j)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_2d_tile_1d_with_uarch_with_thread_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_2d_tile_1d_with_id_with_thread_t task = + (pthreadpool_task_2d_tile_1d_with_id_with_thread_t) + 
pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const uint32_t default_uarch_index = + threadpool->params.parallelize_2d_tile_1d_with_uarch.default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > + threadpool->params.parallelize_2d_tile_1d_with_uarch.max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_j = + threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_range_j; + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(range_start, tile_range_j); + const size_t tile_j = + threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_j; + size_t i = tile_index_i_j.quotient; + size_t start_j = tile_index_i_j.remainder * tile_j; + + const size_t range_j = + threadpool->params.parallelize_2d_tile_1d_with_uarch.range_j; + const size_t thread_number = thread->thread_number; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, uarch_index, thread_number, i, start_j, + min(range_j - start_j, tile_j)); + start_j += tile_j; + if (start_j >= range_j) { + start_j = 0; + i += 1; + } + } + + /* There still may be other threads with work */ + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t 
linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(linear_index, tile_range_j); + const size_t start_j = tile_index_i_j.remainder * tile_j; + task(argument, uarch_index, thread_number, tile_index_i_j.quotient, + start_j, min(range_j - start_j, tile_j)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_2d_tile_2d_t task = (pthreadpool_task_2d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d.tile_range_j; - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j); - const size_t tile_i = threadpool->params.parallelize_2d_tile_2d.tile_i; - const size_t tile_j = threadpool->params.parallelize_2d_tile_2d.tile_j; - size_t start_i = tile_index_i_j.quotient * tile_i; - size_t start_j = tile_index_i_j.remainder * tile_j; - - const size_t range_i = threadpool->params.parallelize_2d_tile_2d.range_i; - const size_t range_j = threadpool->params.parallelize_2d_tile_2d.range_j; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j)); - start_j += tile_j; - if (start_j >= range_j) { - start_j = 
0; - start_i += tile_i; - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j); - const size_t start_i = tile_index_i_j.quotient * tile_i; - const size_t start_j = tile_index_i_j.remainder * tile_j; - task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_2d_tile_2d_t task = + (pthreadpool_task_2d_tile_2d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_j = + threadpool->params.parallelize_2d_tile_2d.tile_range_j; + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(range_start, tile_range_j); + const size_t tile_i = threadpool->params.parallelize_2d_tile_2d.tile_i; + const size_t tile_j = threadpool->params.parallelize_2d_tile_2d.tile_j; + size_t start_i = tile_index_i_j.quotient * tile_i; + size_t start_j = 
tile_index_i_j.remainder * tile_j; + + const size_t range_i = threadpool->params.parallelize_2d_tile_2d.range_i; + const size_t range_j = threadpool->params.parallelize_2d_tile_2d.range_j; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, start_i, start_j, min(range_i - start_i, tile_i), + min(range_j - start_j, tile_j)); + start_j += tile_j; + if (start_j >= range_j) { + start_j = 0; + start_i += tile_i; + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(linear_index, tile_range_j); + const size_t start_i = tile_index_i_j.quotient * tile_i; + const size_t start_j = tile_index_i_j.remainder * tile_j; + task(argument, start_i, start_j, min(range_i - start_i, tile_i), + min(range_j - start_j, tile_j)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_2d_tile_2d_with_id_t task = (pthreadpool_task_2d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const uint32_t default_uarch_index = threadpool->params.parallelize_2d_tile_2d_with_uarch.default_uarch_index; - uint32_t 
uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > threadpool->params.parallelize_2d_tile_2d_with_uarch.max_uarch_index) { - uarch_index = default_uarch_index; - } - #endif - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_range_j; - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_result_size_t index = fxdiv_divide_size_t(range_start, tile_range_j); - const size_t range_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_i; - const size_t tile_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_i; - const size_t range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_j; - const size_t tile_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_j; - size_t start_i = index.quotient * tile_i; - size_t start_j = index.remainder * tile_j; - - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j)); - start_j += tile_j; - if (start_j >= range_j) { - start_j = 0; - start_i += tile_i; - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - 
const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j); - const size_t start_i = tile_index_i_j.quotient * tile_i; - const size_t start_j = tile_index_i_j.remainder * tile_j; - task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_2d_tile_2d_with_id_t task = + (pthreadpool_task_2d_tile_2d_with_id_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const uint32_t default_uarch_index = + threadpool->params.parallelize_2d_tile_2d_with_uarch.default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > + threadpool->params.parallelize_2d_tile_2d_with_uarch.max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const struct fxdiv_divisor_size_t tile_range_j = + threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_range_j; + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_result_size_t index = + fxdiv_divide_size_t(range_start, tile_range_j); + const size_t range_i = + threadpool->params.parallelize_2d_tile_2d_with_uarch.range_i; + const size_t tile_i = + threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_i; + const size_t range_j = + 
threadpool->params.parallelize_2d_tile_2d_with_uarch.range_j; + const size_t tile_j = + threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_j; + size_t start_i = index.quotient * tile_i; + size_t start_j = index.remainder * tile_j; + + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, uarch_index, start_i, start_j, + min(range_i - start_i, tile_i), min(range_j - start_j, tile_j)); + start_j += tile_j; + if (start_j >= range_j) { + start_j = 0; + start_i += tile_i; + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(linear_index, tile_range_j); + const size_t start_i = tile_index_i_j.quotient * tile_i; + const size_t start_j = tile_index_i_j.remainder * tile_j; + task(argument, uarch_index, start_i, start_j, + min(range_i - start_i, tile_i), min(range_j - start_j, tile_j)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_3d_t task = (pthreadpool_task_3d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = 
-threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_3d.range_k; - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(range_start, range_k); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t k = index_ij_k.remainder; - - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, i, j, k); - if (++k == range_k.value) { - k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(linear_index, range_k); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_3d_t task = + (pthreadpool_task_3d_t)pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = 
pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t range_k = + threadpool->params.parallelize_3d.range_k; + const struct fxdiv_result_size_t index_ij_k = + fxdiv_divide_size_t(range_start, range_k); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_3d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_k.quotient, range_j); + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = index_ij_k.remainder; + + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, i, j, k); + if (++k == range_k.value) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t index_ij_k = + fxdiv_divide_size_t(linear_index, range_k); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_k.quotient, range_j); + task(argument, index_i_j.quotient, index_i_j.remainder, + index_ij_k.remainder); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_1d_fastpath( - struct 
pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_3d_tile_1d_t task = (pthreadpool_task_3d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d.tile_range_k; - const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); - const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t start_k = tile_index_ij_k.remainder * tile_k; - - const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, i, j, start_k, min(range_k - start_k, tile_k)); - start_k += tile_k; - if (start_k >= range_k) { - start_k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = 
pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); - const size_t start_k = tile_index_ij_k.remainder * tile_k; - task(argument, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_3d_tile_1d_t task = + (pthreadpool_task_3d_tile_1d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_k = + threadpool->params.parallelize_3d_tile_1d.tile_range_k; + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(range_start, tile_range_k); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_3d_tile_1d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t start_k = tile_index_ij_k.remainder * tile_k; + + const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, i, j, start_k, min(range_k - start_k, tile_k)); + start_k += tile_k; + if 
(start_k >= range_k) { + start_k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(linear_index, tile_range_k); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t start_k = tile_index_ij_k.remainder * tile_k; + task(argument, index_i_j.quotient, index_i_j.remainder, start_k, + min(range_k - start_k, tile_k)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_1d_with_thread_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_3d_tile_1d_with_thread_t task = (pthreadpool_task_3d_tile_1d_with_thread_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d.tile_range_k; - const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k); - 
const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); - const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t start_k = tile_index_ij_k.remainder * tile_k; - - const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k; - const size_t thread_number = thread->thread_number; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, thread_number, i, j, start_k, min(range_k - start_k, tile_k)); - start_k += tile_k; - if (start_k >= range_k) { - start_k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - - /* There still may be other threads with work */ - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); - const size_t start_k = tile_index_ij_k.remainder * tile_k; - task(argument, thread_number, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_3d_tile_1d_with_thread_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const 
pthreadpool_task_3d_tile_1d_with_thread_t task = + (pthreadpool_task_3d_tile_1d_with_thread_t) + pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_k = + threadpool->params.parallelize_3d_tile_1d.tile_range_k; + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(range_start, tile_range_k); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_3d_tile_1d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t start_k = tile_index_ij_k.remainder * tile_k; + + const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k; + const size_t thread_number = thread->thread_number; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, thread_number, i, j, start_k, + min(range_k - start_k, tile_k)); + start_k += tile_k; + if (start_k >= range_k) { + start_k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const 
struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(linear_index, tile_range_k); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t start_k = tile_index_ij_k.remainder * tile_k; + task(argument, thread_number, index_i_j.quotient, index_i_j.remainder, + start_k, min(range_k - start_k, tile_k)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_1d_with_uarch_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_3d_tile_1d_with_id_t task = (pthreadpool_task_3d_tile_1d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const uint32_t default_uarch_index = threadpool->params.parallelize_3d_tile_1d_with_uarch.default_uarch_index; - uint32_t uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > threadpool->params.parallelize_3d_tile_1d_with_uarch.max_uarch_index) { - uarch_index = default_uarch_index; - } - #endif - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_range_k; - const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d_with_uarch.range_j; - const struct fxdiv_result_size_t index_i_j = 
fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); - const size_t tile_k = threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_k; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t start_k = tile_index_ij_k.remainder * tile_k; - - const size_t range_k = threadpool->params.parallelize_3d_tile_1d_with_uarch.range_k; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, uarch_index, i, j, start_k, min(range_k - start_k, tile_k)); - start_k += tile_k; - if (start_k >= range_k) { - start_k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); - const size_t start_k = tile_index_ij_k.remainder * tile_k; - task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_3d_tile_1d_with_uarch_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_3d_tile_1d_with_id_t task = + (pthreadpool_task_3d_tile_1d_with_id_t)pthreadpool_load_relaxed_void_p( + 
&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const uint32_t default_uarch_index = + threadpool->params.parallelize_3d_tile_1d_with_uarch.default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > + threadpool->params.parallelize_3d_tile_1d_with_uarch.max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_k = + threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_range_k; + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(range_start, tile_range_k); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_3d_tile_1d_with_uarch.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t tile_k = + threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_k; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t start_k = tile_index_ij_k.remainder * tile_k; + + const size_t range_k = + threadpool->params.parallelize_3d_tile_1d_with_uarch.range_k; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, uarch_index, i, j, start_k, min(range_k - start_k, tile_k)); + start_k += tile_k; + if (start_k >= range_k) { + start_k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != 
thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(linear_index, tile_range_k); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t start_k = tile_index_ij_k.remainder * tile_k; + task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, + start_k, min(range_k - start_k, tile_k)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_1d_with_uarch_with_thread_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_3d_tile_1d_with_id_with_thread_t task = - (pthreadpool_task_3d_tile_1d_with_id_with_thread_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const uint32_t default_uarch_index = threadpool->params.parallelize_3d_tile_1d_with_uarch.default_uarch_index; - uint32_t uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > threadpool->params.parallelize_3d_tile_1d_with_uarch.max_uarch_index) { - uarch_index = default_uarch_index; - } - #endif - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t 
tile_range_k = threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_range_k; - const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d_with_uarch.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); - const size_t tile_k = threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_k; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t start_k = tile_index_ij_k.remainder * tile_k; - - const size_t range_k = threadpool->params.parallelize_3d_tile_1d_with_uarch.range_k; - const size_t thread_number = thread->thread_number; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, uarch_index, thread_number, i, j, start_k, min(range_k - start_k, tile_k)); - start_k += tile_k; - if (start_k >= range_k) { - start_k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - - /* There still may be other threads with work */ - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); - const size_t start_k = tile_index_ij_k.remainder * tile_k; - task(argument, uarch_index, thread_number, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k)); - } - } - - /* Make changes by this thread visible to other threads */ - 
pthreadpool_fence_release(); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_3d_tile_1d_with_uarch_with_thread_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_3d_tile_1d_with_id_with_thread_t task = + (pthreadpool_task_3d_tile_1d_with_id_with_thread_t) + pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const uint32_t default_uarch_index = + threadpool->params.parallelize_3d_tile_1d_with_uarch.default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > + threadpool->params.parallelize_3d_tile_1d_with_uarch.max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_k = + threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_range_k; + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(range_start, tile_range_k); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_3d_tile_1d_with_uarch.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t tile_k = + threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_k; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t start_k = tile_index_ij_k.remainder * tile_k; + + const size_t range_k = + threadpool->params.parallelize_3d_tile_1d_with_uarch.range_k; + const size_t thread_number = thread->thread_number; + while 
(pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, uarch_index, thread_number, i, j, start_k, + min(range_k - start_k, tile_k)); + start_k += tile_k; + if (start_k >= range_k) { + start_k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(linear_index, tile_range_k); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t start_k = tile_index_ij_k.remainder * tile_k; + task(argument, uarch_index, thread_number, index_i_j.quotient, + index_i_j.remainder, start_k, min(range_k - start_k, tile_k)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_3d_tile_2d_t task = (pthreadpool_task_3d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_k = 
threadpool->params.parallelize_3d_tile_2d.tile_range_k; - const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k); - const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d.tile_range_j; - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); - const size_t tile_j = threadpool->params.parallelize_3d_tile_2d.tile_j; - const size_t tile_k = threadpool->params.parallelize_3d_tile_2d.tile_k; - size_t i = tile_index_i_j.quotient; - size_t start_j = tile_index_i_j.remainder * tile_j; - size_t start_k = tile_index_ij_k.remainder * tile_k; - - const size_t range_k = threadpool->params.parallelize_3d_tile_2d.range_k; - const size_t range_j = threadpool->params.parallelize_3d_tile_2d.range_j; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k)); - start_k += tile_k; - if (start_k >= range_k) { - start_k = 0; - start_j += tile_j; - if (start_j >= range_j) { - start_j = 0; - i += 1; - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k); - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); - const size_t start_j = tile_index_i_j.remainder * tile_j; - const size_t start_k = 
tile_index_ij_k.remainder * tile_k; - task(argument, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_3d_tile_2d_t task = + (pthreadpool_task_3d_tile_2d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_k = + threadpool->params.parallelize_3d_tile_2d.tile_range_k; + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(range_start, tile_range_k); + const struct fxdiv_divisor_size_t tile_range_j = + threadpool->params.parallelize_3d_tile_2d.tile_range_j; + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); + const size_t tile_j = threadpool->params.parallelize_3d_tile_2d.tile_j; + const size_t tile_k = threadpool->params.parallelize_3d_tile_2d.tile_k; + size_t i = tile_index_i_j.quotient; + size_t start_j = tile_index_i_j.remainder * tile_j; + size_t start_k = tile_index_ij_k.remainder * tile_k; + + const size_t range_k = threadpool->params.parallelize_3d_tile_2d.range_k; + const size_t range_j = threadpool->params.parallelize_3d_tile_2d.range_j; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, i, start_j, start_k, min(range_j - start_j, tile_j), + min(range_k - start_k, tile_k)); + start_k += tile_k; + if (start_k >= range_k) { + start_k = 0; + 
start_j += tile_j; + if (start_j >= range_j) { + start_j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(linear_index, tile_range_k); + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); + const size_t start_j = tile_index_i_j.remainder * tile_j; + const size_t start_k = tile_index_ij_k.remainder * tile_k; + task(argument, tile_index_i_j.quotient, start_j, start_k, + min(range_j - start_j, tile_j), min(range_k - start_k, tile_k)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_3d_tile_2d_with_id_t task = (pthreadpool_task_3d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const uint32_t default_uarch_index = threadpool->params.parallelize_3d_tile_2d_with_uarch.default_uarch_index; - uint32_t uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > threadpool->params.parallelize_3d_tile_2d_with_uarch.max_uarch_index) { - uarch_index = 
default_uarch_index; - } - #endif - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_k; - const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k); - const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_j; - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); - const size_t tile_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_j; - const size_t tile_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_k; - size_t i = tile_index_i_j.quotient; - size_t start_j = tile_index_i_j.remainder * tile_j; - size_t start_k = tile_index_ij_k.remainder * tile_k; - - const size_t range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_k; - const size_t range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_j; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, uarch_index, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k)); - start_k += tile_k; - if (start_k >= range_k) { - start_k = 0; - start_j += tile_j; - if (start_j >= range_j) { - start_j = 0; - i += 1; - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { 
- const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k); - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); - const size_t start_j = tile_index_i_j.remainder * tile_j; - const size_t start_k = tile_index_ij_k.remainder * tile_k; - task(argument, uarch_index, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_3d_tile_2d_with_id_t task = + (pthreadpool_task_3d_tile_2d_with_id_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const uint32_t default_uarch_index = + threadpool->params.parallelize_3d_tile_2d_with_uarch.default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > + threadpool->params.parallelize_3d_tile_2d_with_uarch.max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_k = + threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_k; + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(range_start, 
tile_range_k); + const struct fxdiv_divisor_size_t tile_range_j = + threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_j; + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); + const size_t tile_j = + threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_j; + const size_t tile_k = + threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_k; + size_t i = tile_index_i_j.quotient; + size_t start_j = tile_index_i_j.remainder * tile_j; + size_t start_k = tile_index_ij_k.remainder * tile_k; + + const size_t range_k = + threadpool->params.parallelize_3d_tile_2d_with_uarch.range_k; + const size_t range_j = + threadpool->params.parallelize_3d_tile_2d_with_uarch.range_j; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, uarch_index, i, start_j, start_k, + min(range_j - start_j, tile_j), min(range_k - start_k, tile_k)); + start_k += tile_k; + if (start_k >= range_k) { + start_k = 0; + start_j += tile_j; + if (start_j >= range_j) { + start_j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(linear_index, tile_range_k); + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); + const size_t start_j = tile_index_i_j.remainder * tile_j; + const size_t start_k = tile_index_ij_k.remainder * tile_k; + task(argument, 
uarch_index, tile_index_i_j.quotient, start_j, start_k, + min(range_j - start_j, tile_j), min(range_k - start_k, tile_k)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_4d_t task = (pthreadpool_task_4d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_4d.range_kl; - const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(range_start, range_kl); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); - const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_4d.range_l; - const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t k = index_k_l.quotient; - size_t l = index_k_l.remainder; - - const size_t range_k = threadpool->params.parallelize_4d.range_k; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, i, j, k, l); - if (++l == range_l.value) { - l = 0; - if (++k == range_k) { - k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - 
for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(linear_index, range_kl); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); - const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); - task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_4d_t task = + (pthreadpool_task_4d_t)pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t range_kl = + threadpool->params.parallelize_4d.range_kl; + const struct fxdiv_result_size_t index_ij_kl = + fxdiv_divide_size_t(range_start, range_kl); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_4d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_kl.quotient, range_j); + const struct fxdiv_divisor_size_t range_l = + threadpool->params.parallelize_4d.range_l; + const struct fxdiv_result_size_t index_k_l = + 
fxdiv_divide_size_t(index_ij_kl.remainder, range_l); + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = index_k_l.quotient; + size_t l = index_k_l.remainder; + + const size_t range_k = threadpool->params.parallelize_4d.range_k; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, i, j, k, l); + if (++l == range_l.value) { + l = 0; + if (++k == range_k) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t index_ij_kl = + fxdiv_divide_size_t(linear_index, range_kl); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_kl.quotient, range_j); + const struct fxdiv_result_size_t index_k_l = + fxdiv_divide_size_t(index_ij_kl.remainder, range_l); + task(argument, index_i_j.quotient, index_i_j.remainder, + index_k_l.quotient, index_k_l.remainder); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_1d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_4d_tile_1d_t task = (pthreadpool_task_4d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = 
threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_1d.tile_range_kl; - const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_1d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); - const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_1d.tile_range_l; - const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); - const size_t tile_l = threadpool->params.parallelize_4d_tile_1d.tile_l; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t k = tile_index_k_l.quotient; - size_t start_l = tile_index_k_l.remainder * tile_l; - - const size_t range_l = threadpool->params.parallelize_4d_tile_1d.range_l; - const size_t range_k = threadpool->params.parallelize_4d_tile_1d.range_k; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, i, j, k, start_l, min(range_l - start_l, tile_l)); - start_l += tile_l; - if (start_l >= range_l) { - start_l = 0; - if (++k == range_k) { - k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t 
linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); - const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); - const size_t start_l = tile_index_k_l.remainder * tile_l; - task(argument, index_i_j.quotient, index_i_j.remainder, tile_index_k_l.quotient, start_l, min(range_l - start_l, tile_l)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_4d_tile_1d_t task = + (pthreadpool_task_4d_tile_1d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_kl = + threadpool->params.parallelize_4d_tile_1d.tile_range_kl; + const struct fxdiv_result_size_t tile_index_ij_kl = + fxdiv_divide_size_t(range_start, tile_range_kl); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_4d_tile_1d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); + const struct fxdiv_divisor_size_t tile_range_l = + threadpool->params.parallelize_4d_tile_1d.tile_range_l; + const struct fxdiv_result_size_t tile_index_k_l = + fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); + const size_t tile_l = threadpool->params.parallelize_4d_tile_1d.tile_l; 
+ size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = tile_index_k_l.quotient; + size_t start_l = tile_index_k_l.remainder * tile_l; + + const size_t range_l = threadpool->params.parallelize_4d_tile_1d.range_l; + const size_t range_k = threadpool->params.parallelize_4d_tile_1d.range_k; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, i, j, k, start_l, min(range_l - start_l, tile_l)); + start_l += tile_l; + if (start_l >= range_l) { + start_l = 0; + if (++k == range_k) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_kl = + fxdiv_divide_size_t(linear_index, tile_range_kl); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); + const struct fxdiv_result_size_t tile_index_k_l = + fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); + const size_t start_l = tile_index_k_l.remainder * tile_l; + task(argument, index_i_j.quotient, index_i_j.remainder, + tile_index_k_l.quotient, start_l, min(range_l - start_l, tile_l)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const 
pthreadpool_task_4d_tile_2d_t task = (pthreadpool_task_4d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d.tile_range_kl; - const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); - const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d.tile_range_l; - const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); - const size_t tile_k = threadpool->params.parallelize_4d_tile_2d.tile_k; - const size_t tile_l = threadpool->params.parallelize_4d_tile_2d.tile_l; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t start_k = tile_index_k_l.quotient * tile_k; - size_t start_l = tile_index_k_l.remainder * tile_l; - - const size_t range_l = threadpool->params.parallelize_4d_tile_2d.range_l; - const size_t range_k = threadpool->params.parallelize_4d_tile_2d.range_k; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l)); - start_l += tile_l; - if (start_l >= range_l) { - start_l = 0; - start_k += tile_k; - if (start_k >= range_k) { - start_k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - - /* There still may be other threads with 
work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); - const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); - const size_t start_k = tile_index_k_l.quotient * tile_k; - const size_t start_l = tile_index_k_l.remainder * tile_l; - task(argument, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_4d_tile_2d_t task = + (pthreadpool_task_4d_tile_2d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_kl = + threadpool->params.parallelize_4d_tile_2d.tile_range_kl; + const struct fxdiv_result_size_t tile_index_ij_kl = + fxdiv_divide_size_t(range_start, tile_range_kl); + const struct fxdiv_divisor_size_t range_j = 
+ threadpool->params.parallelize_4d_tile_2d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); + const struct fxdiv_divisor_size_t tile_range_l = + threadpool->params.parallelize_4d_tile_2d.tile_range_l; + const struct fxdiv_result_size_t tile_index_k_l = + fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); + const size_t tile_k = threadpool->params.parallelize_4d_tile_2d.tile_k; + const size_t tile_l = threadpool->params.parallelize_4d_tile_2d.tile_l; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t start_k = tile_index_k_l.quotient * tile_k; + size_t start_l = tile_index_k_l.remainder * tile_l; + + const size_t range_l = threadpool->params.parallelize_4d_tile_2d.range_l; + const size_t range_k = threadpool->params.parallelize_4d_tile_2d.range_k; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, i, j, start_k, start_l, min(range_k - start_k, tile_k), + min(range_l - start_l, tile_l)); + start_l += tile_l; + if (start_l >= range_l) { + start_l = 0; + start_k += tile_k; + if (start_k >= range_k) { + start_k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_kl = + fxdiv_divide_size_t(linear_index, tile_range_kl); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); + const 
struct fxdiv_result_size_t tile_index_k_l = + fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); + const size_t start_k = tile_index_k_l.quotient * tile_k; + const size_t start_l = tile_index_k_l.remainder * tile_l; + task(argument, index_i_j.quotient, index_i_j.remainder, start_k, start_l, + min(range_k - start_k, tile_k), min(range_l - start_l, tile_l)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_4d_tile_2d_with_id_t task = (pthreadpool_task_4d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const uint32_t default_uarch_index = threadpool->params.parallelize_4d_tile_2d_with_uarch.default_uarch_index; - uint32_t uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > threadpool->params.parallelize_4d_tile_2d_with_uarch.max_uarch_index) { - uarch_index = default_uarch_index; - } - #endif - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_kl; - const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_j; - const struct fxdiv_result_size_t index_i_j = 
fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); - const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_l; - const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); - const size_t tile_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_k; - const size_t tile_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_l; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t start_k = tile_index_k_l.quotient * tile_k; - size_t start_l = tile_index_k_l.remainder * tile_l; - - const size_t range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_l; - const size_t range_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_k; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, uarch_index, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l)); - start_l += tile_l; - if (start_l >= range_l) { - start_l = 0; - start_k += tile_k; - if (start_k >= range_k) { - start_k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); - const struct fxdiv_result_size_t tile_index_k_l = 
fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); - const size_t start_k = tile_index_k_l.quotient * tile_k; - const size_t start_l = tile_index_k_l.remainder * tile_l; - task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_4d_tile_2d_with_id_t task = + (pthreadpool_task_4d_tile_2d_with_id_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const uint32_t default_uarch_index = + threadpool->params.parallelize_4d_tile_2d_with_uarch.default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > + threadpool->params.parallelize_4d_tile_2d_with_uarch.max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_kl = + threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_kl; + const struct fxdiv_result_size_t tile_index_ij_kl = + fxdiv_divide_size_t(range_start, tile_range_kl); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_4d_tile_2d_with_uarch.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); + const 
struct fxdiv_divisor_size_t tile_range_l = + threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_l; + const struct fxdiv_result_size_t tile_index_k_l = + fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); + const size_t tile_k = + threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_k; + const size_t tile_l = + threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_l; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t start_k = tile_index_k_l.quotient * tile_k; + size_t start_l = tile_index_k_l.remainder * tile_l; + + const size_t range_l = + threadpool->params.parallelize_4d_tile_2d_with_uarch.range_l; + const size_t range_k = + threadpool->params.parallelize_4d_tile_2d_with_uarch.range_k; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, uarch_index, i, j, start_k, start_l, + min(range_k - start_k, tile_k), min(range_l - start_l, tile_l)); + start_l += tile_l; + if (start_l >= range_l) { + start_l = 0; + start_k += tile_k; + if (start_k >= range_k) { + start_k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_kl = + fxdiv_divide_size_t(linear_index, tile_range_kl); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); + const struct fxdiv_result_size_t tile_index_k_l = + fxdiv_divide_size_t(tile_index_ij_kl.remainder, 
tile_range_l); + const size_t start_k = tile_index_k_l.quotient * tile_k; + const size_t start_l = tile_index_k_l.remainder * tile_l; + task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, + start_k, start_l, min(range_k - start_k, tile_k), + min(range_l - start_l, tile_l)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_5d_t task = (pthreadpool_task_5d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t range_lm = threadpool->params.parallelize_5d.range_lm; - const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(range_start, range_lm); - const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d.range_k; - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k); - const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_5d.range_m; - const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t k = index_ij_k.remainder; - size_t l = index_l_m.quotient; - size_t m = index_l_m.remainder; - - const size_t range_l = 
threadpool->params.parallelize_5d.range_l; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, i, j, k, l, m); - if (++m == range_m.value) { - m = 0; - if (++l == range_l) { - l = 0; - if (++k == range_k.value) { - k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(linear_index, range_lm); - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k); - const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_5d_t task = + (pthreadpool_task_5d_t)pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + 
pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t range_lm = + threadpool->params.parallelize_5d.range_lm; + const struct fxdiv_result_size_t index_ijk_lm = + fxdiv_divide_size_t(range_start, range_lm); + const struct fxdiv_divisor_size_t range_k = + threadpool->params.parallelize_5d.range_k; + const struct fxdiv_result_size_t index_ij_k = + fxdiv_divide_size_t(index_ijk_lm.quotient, range_k); + const struct fxdiv_divisor_size_t range_m = + threadpool->params.parallelize_5d.range_m; + const struct fxdiv_result_size_t index_l_m = + fxdiv_divide_size_t(index_ijk_lm.remainder, range_m); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_5d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_k.quotient, range_j); + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = index_ij_k.remainder; + size_t l = index_l_m.quotient; + size_t m = index_l_m.remainder; + + const size_t range_l = threadpool->params.parallelize_5d.range_l; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, i, j, k, l, m); + if (++m == range_m.value) { + m = 0; + if (++l == range_l) { + l = 0; + if (++k == range_k.value) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t index_ijk_lm = + fxdiv_divide_size_t(linear_index, range_lm); + const 
struct fxdiv_result_size_t index_ij_k = + fxdiv_divide_size_t(index_ijk_lm.quotient, range_k); + const struct fxdiv_result_size_t index_l_m = + fxdiv_divide_size_t(index_ijk_lm.remainder, range_m); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_k.quotient, range_j); + task(argument, index_i_j.quotient, index_i_j.remainder, + index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_1d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_5d_tile_1d_t task = (pthreadpool_task_5d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_1d.tile_range_m; - const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(range_start, tile_range_m); - const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_5d_tile_1d.range_kl; - const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_1d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); - const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_5d_tile_1d.range_l; - const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); - 
const size_t tile_m = threadpool->params.parallelize_5d_tile_1d.tile_m; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t k = index_k_l.quotient; - size_t l = index_k_l.remainder; - size_t start_m = tile_index_ijkl_m.remainder * tile_m; - - const size_t range_m = threadpool->params.parallelize_5d_tile_1d.range_m; - const size_t range_k = threadpool->params.parallelize_5d_tile_1d.range_k; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, i, j, k, l, start_m, min(range_m - start_m, tile_m)); - start_m += tile_m; - if (start_m >= range_m) { - start_m = 0; - if (++l == range_l.value) { - l = 0; - if (++k == range_k) { - k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(linear_index, tile_range_m); - const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); - const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); - size_t start_m = tile_index_ijkl_m.remainder * tile_m; - task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder, start_m, - min(range_m - start_m, tile_m)); - } - } - - /* Make changes by this thread visible to other threads */ - 
pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_5d_tile_1d_t task = + (pthreadpool_task_5d_tile_1d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_m = + threadpool->params.parallelize_5d_tile_1d.tile_range_m; + const struct fxdiv_result_size_t tile_index_ijkl_m = + fxdiv_divide_size_t(range_start, tile_range_m); + const struct fxdiv_divisor_size_t range_kl = + threadpool->params.parallelize_5d_tile_1d.range_kl; + const struct fxdiv_result_size_t index_ij_kl = + fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_5d_tile_1d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_kl.quotient, range_j); + const struct fxdiv_divisor_size_t range_l = + threadpool->params.parallelize_5d_tile_1d.range_l; + const struct fxdiv_result_size_t index_k_l = + fxdiv_divide_size_t(index_ij_kl.remainder, range_l); + const size_t tile_m = threadpool->params.parallelize_5d_tile_1d.tile_m; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = index_k_l.quotient; + size_t l = index_k_l.remainder; + size_t start_m = tile_index_ijkl_m.remainder * tile_m; + + const size_t range_m = threadpool->params.parallelize_5d_tile_1d.range_m; + const size_t range_k = threadpool->params.parallelize_5d_tile_1d.range_k; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, i, j, k, l, start_m, 
min(range_m - start_m, tile_m)); + start_m += tile_m; + if (start_m >= range_m) { + start_m = 0; + if (++l == range_l.value) { + l = 0; + if (++k == range_k) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ijkl_m = + fxdiv_divide_size_t(linear_index, tile_range_m); + const struct fxdiv_result_size_t index_ij_kl = + fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_kl.quotient, range_j); + const struct fxdiv_result_size_t index_k_l = + fxdiv_divide_size_t(index_ij_kl.remainder, range_l); + size_t start_m = tile_index_ijkl_m.remainder * tile_m; + task(argument, index_i_j.quotient, index_i_j.remainder, + index_k_l.quotient, index_k_l.remainder, start_m, + min(range_m - start_m, tile_m)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_2d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_5d_tile_2d_t task = (pthreadpool_task_5d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = 
-threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_lm = threadpool->params.parallelize_5d_tile_2d.tile_range_lm; - const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(range_start, tile_range_lm); - const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d_tile_2d.range_k; - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k); - const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_2d.tile_range_m; - const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_2d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - const size_t tile_l = threadpool->params.parallelize_5d_tile_2d.tile_l; - const size_t tile_m = threadpool->params.parallelize_5d_tile_2d.tile_m; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t k = index_ij_k.remainder; - size_t start_l = tile_index_l_m.quotient * tile_l; - size_t start_m = tile_index_l_m.remainder * tile_m; - - const size_t range_m = threadpool->params.parallelize_5d_tile_2d.range_m; - const size_t range_l = threadpool->params.parallelize_5d_tile_2d.range_l; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, i, j, k, start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m)); - start_m += tile_m; - if (start_m >= range_m) { - start_m = 0; - start_l += tile_l; - if (start_l >= range_l) { - start_l = 0; - if (++k == range_k.value) { - k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - } - - /* There still may be other threads with work */ - const size_t 
thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(linear_index, tile_range_lm); - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k); - const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - const size_t start_l = tile_index_l_m.quotient * tile_l; - const size_t start_m = tile_index_l_m.remainder * tile_m; - task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, - start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_5d_tile_2d_t task = + (pthreadpool_task_5d_tile_2d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_lm = + threadpool->params.parallelize_5d_tile_2d.tile_range_lm; + const struct fxdiv_result_size_t 
tile_index_ijk_lm = + fxdiv_divide_size_t(range_start, tile_range_lm); + const struct fxdiv_divisor_size_t range_k = + threadpool->params.parallelize_5d_tile_2d.range_k; + const struct fxdiv_result_size_t index_ij_k = + fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k); + const struct fxdiv_divisor_size_t tile_range_m = + threadpool->params.parallelize_5d_tile_2d.tile_range_m; + const struct fxdiv_result_size_t tile_index_l_m = + fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_5d_tile_2d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_k.quotient, range_j); + const size_t tile_l = threadpool->params.parallelize_5d_tile_2d.tile_l; + const size_t tile_m = threadpool->params.parallelize_5d_tile_2d.tile_m; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = index_ij_k.remainder; + size_t start_l = tile_index_l_m.quotient * tile_l; + size_t start_m = tile_index_l_m.remainder * tile_m; + + const size_t range_m = threadpool->params.parallelize_5d_tile_2d.range_m; + const size_t range_l = threadpool->params.parallelize_5d_tile_2d.range_l; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, i, j, k, start_l, start_m, min(range_l - start_l, tile_l), + min(range_m - start_m, tile_m)); + start_m += tile_m; + if (start_m >= range_m) { + start_m = 0; + start_l += tile_l; + if (start_l >= range_l) { + start_l = 0; + if (++k == range_k.value) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while 
(pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ijk_lm = + fxdiv_divide_size_t(linear_index, tile_range_lm); + const struct fxdiv_result_size_t index_ij_k = + fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k); + const struct fxdiv_result_size_t tile_index_l_m = + fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_k.quotient, range_j); + const size_t start_l = tile_index_l_m.quotient * tile_l; + const size_t start_m = tile_index_l_m.remainder * tile_m; + task(argument, index_i_j.quotient, index_i_j.remainder, + index_ij_k.remainder, start_l, start_m, + min(range_l - start_l, tile_l), min(range_m - start_m, tile_m)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_6d_t task = (pthreadpool_task_6d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t range_lmn = threadpool->params.parallelize_6d.range_lmn; - const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(range_start, range_lmn); - const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d.range_k; - const struct fxdiv_result_size_t index_ij_k = 
fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k); - const struct fxdiv_divisor_size_t range_n = threadpool->params.parallelize_6d.range_n; - const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d.range_m; - const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m); - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t k = index_ij_k.remainder; - size_t l = index_l_m.quotient; - size_t m = index_l_m.remainder; - size_t n = index_lm_n.remainder; - - const size_t range_l = threadpool->params.parallelize_6d.range_l; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, i, j, k, l, m, n); - if (++n == range_n.value) { - n = 0; - if (++m == range_m.value) { - m = 0; - if (++l == range_l) { - l = 0; - if (++k == range_k.value) { - k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - } - } - - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(linear_index, range_lmn); - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k); - const struct fxdiv_result_size_t 
index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m); - task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, index_lm_n.remainder); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_6d_t task = + (pthreadpool_task_6d_t)pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t range_lmn = + threadpool->params.parallelize_6d.range_lmn; + const struct fxdiv_result_size_t index_ijk_lmn = + fxdiv_divide_size_t(range_start, range_lmn); + const struct fxdiv_divisor_size_t range_k = + threadpool->params.parallelize_6d.range_k; + const struct fxdiv_result_size_t index_ij_k = + fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k); + const struct fxdiv_divisor_size_t range_n = + threadpool->params.parallelize_6d.range_n; + const struct fxdiv_result_size_t index_lm_n = + fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_6d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_k.quotient, range_j); + const struct fxdiv_divisor_size_t range_m = + threadpool->params.parallelize_6d.range_m; + const struct fxdiv_result_size_t index_l_m = + 
fxdiv_divide_size_t(index_lm_n.quotient, range_m); + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = index_ij_k.remainder; + size_t l = index_l_m.quotient; + size_t m = index_l_m.remainder; + size_t n = index_lm_n.remainder; + + const size_t range_l = threadpool->params.parallelize_6d.range_l; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, i, j, k, l, m, n); + if (++n == range_n.value) { + n = 0; + if (++m == range_m.value) { + m = 0; + if (++l == range_l) { + l = 0; + if (++k == range_k.value) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t index_ijk_lmn = + fxdiv_divide_size_t(linear_index, range_lmn); + const struct fxdiv_result_size_t index_ij_k = + fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k); + const struct fxdiv_result_size_t index_lm_n = + fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_k.quotient, range_j); + const struct fxdiv_result_size_t index_l_m = + fxdiv_divide_size_t(index_lm_n.quotient, range_m); + task(argument, index_i_j.quotient, index_i_j.remainder, + index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, + index_lm_n.remainder); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void 
pthreadpool_thread_parallelize_6d_tile_1d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_6d_tile_1d_t task = (pthreadpool_task_6d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_lmn = threadpool->params.parallelize_6d_tile_1d.tile_range_lmn; - const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(range_start, tile_range_lmn); - const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d_tile_1d.range_k; - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k); - const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_1d.tile_range_n; - const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_1d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d_tile_1d.range_m; - const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m); - const size_t tile_n = threadpool->params.parallelize_6d_tile_1d.tile_n; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t k = index_ij_k.remainder; - size_t l = index_l_m.quotient; - size_t m = index_l_m.remainder; - size_t start_n = tile_index_lm_n.remainder * tile_n; - - const size_t range_n = 
threadpool->params.parallelize_6d_tile_1d.range_n; - const size_t range_l = threadpool->params.parallelize_6d_tile_1d.range_l; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, i, j, k, l, m, start_n, min(range_n - start_n, tile_n)); - start_n += tile_n; - if (start_n >= range_n) { - start_n = 0; - if (++m == range_m.value) { - m = 0; - if (++l == range_l) { - l = 0; - if (++k == range_k.value) { - k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - } - } - - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(linear_index, tile_range_lmn); - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k); - const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m); - const size_t start_n = tile_index_lm_n.remainder * tile_n; - task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, - start_n, min(range_n - start_n, tile_n)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + 
assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_6d_tile_1d_t task = + (pthreadpool_task_6d_tile_1d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_lmn = + threadpool->params.parallelize_6d_tile_1d.tile_range_lmn; + const struct fxdiv_result_size_t tile_index_ijk_lmn = + fxdiv_divide_size_t(range_start, tile_range_lmn); + const struct fxdiv_divisor_size_t range_k = + threadpool->params.parallelize_6d_tile_1d.range_k; + const struct fxdiv_result_size_t index_ij_k = + fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k); + const struct fxdiv_divisor_size_t tile_range_n = + threadpool->params.parallelize_6d_tile_1d.tile_range_n; + const struct fxdiv_result_size_t tile_index_lm_n = + fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_6d_tile_1d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_k.quotient, range_j); + const struct fxdiv_divisor_size_t range_m = + threadpool->params.parallelize_6d_tile_1d.range_m; + const struct fxdiv_result_size_t index_l_m = + fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m); + const size_t tile_n = threadpool->params.parallelize_6d_tile_1d.tile_n; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = index_ij_k.remainder; + size_t l = index_l_m.quotient; + size_t m = index_l_m.remainder; + size_t start_n = tile_index_lm_n.remainder * tile_n; + + const size_t range_n = threadpool->params.parallelize_6d_tile_1d.range_n; + const size_t range_l = 
threadpool->params.parallelize_6d_tile_1d.range_l; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, i, j, k, l, m, start_n, min(range_n - start_n, tile_n)); + start_n += tile_n; + if (start_n >= range_n) { + start_n = 0; + if (++m == range_m.value) { + m = 0; + if (++l == range_l) { + l = 0; + if (++k == range_k.value) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ijk_lmn = + fxdiv_divide_size_t(linear_index, tile_range_lmn); + const struct fxdiv_result_size_t index_ij_k = + fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k); + const struct fxdiv_result_size_t tile_index_lm_n = + fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_k.quotient, range_j); + const struct fxdiv_result_size_t index_l_m = + fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m); + const size_t start_n = tile_index_lm_n.remainder * tile_n; + task(argument, index_i_j.quotient, index_i_j.remainder, + index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, + start_n, min(range_n - start_n, tile_n)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_tile_2d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* 
thread) -{ - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_6d_tile_2d_t task = (pthreadpool_task_6d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t threads_count = threadpool->threads_count.value; - const size_t range_threshold = -threads_count; - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_mn = threadpool->params.parallelize_6d_tile_2d.tile_range_mn; - const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(range_start, tile_range_mn); - const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_6d_tile_2d.range_kl; - const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl); - const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_2d.tile_range_n; - const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_2d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); - const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_6d_tile_2d.range_l; - const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); - const size_t tile_m = threadpool->params.parallelize_6d_tile_2d.tile_m; - const size_t tile_n = threadpool->params.parallelize_6d_tile_2d.tile_n; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t k = index_k_l.quotient; - size_t l = index_k_l.remainder; - size_t start_m = tile_index_m_n.quotient * tile_m; - size_t start_n = tile_index_m_n.remainder * tile_n; - - const size_t range_n = 
threadpool->params.parallelize_6d_tile_2d.range_n; - const size_t range_m = threadpool->params.parallelize_6d_tile_2d.range_m; - const size_t range_k = threadpool->params.parallelize_6d_tile_2d.range_k; - while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) { - task(argument, i, j, k, l, start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n)); - start_n += tile_n; - if (start_n >= range_n) { - start_n = 0; - start_m += tile_m; - if (start_m >= range_m) { - start_m = 0; - if (++l == range_l.value) { - l = 0; - if (++k == range_k) { - k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(linear_index, tile_range_mn); - const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl); - const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); - const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); - const size_t start_m = tile_index_m_n.quotient * tile_m; - const size_t start_n = tile_index_m_n.remainder * tile_n; - task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder, - start_m, start_n, min(range_m - start_m, tile_m), 
min(range_n - start_n, tile_n)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_6d_tile_2d_t task = + (pthreadpool_task_6d_tile_2d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t threads_count = threadpool->threads_count.value; + const size_t range_threshold = -threads_count; + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_mn = + threadpool->params.parallelize_6d_tile_2d.tile_range_mn; + const struct fxdiv_result_size_t tile_index_ijkl_mn = + fxdiv_divide_size_t(range_start, tile_range_mn); + const struct fxdiv_divisor_size_t range_kl = + threadpool->params.parallelize_6d_tile_2d.range_kl; + const struct fxdiv_result_size_t index_ij_kl = + fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl); + const struct fxdiv_divisor_size_t tile_range_n = + threadpool->params.parallelize_6d_tile_2d.tile_range_n; + const struct fxdiv_result_size_t tile_index_m_n = + fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_6d_tile_2d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_kl.quotient, range_j); + const struct fxdiv_divisor_size_t range_l = + threadpool->params.parallelize_6d_tile_2d.range_l; + const struct fxdiv_result_size_t index_k_l = + fxdiv_divide_size_t(index_ij_kl.remainder, range_l); + const size_t tile_m = threadpool->params.parallelize_6d_tile_2d.tile_m; + const size_t tile_n = threadpool->params.parallelize_6d_tile_2d.tile_n; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + 
size_t k = index_k_l.quotient; + size_t l = index_k_l.remainder; + size_t start_m = tile_index_m_n.quotient * tile_m; + size_t start_n = tile_index_m_n.remainder * tile_n; + + const size_t range_n = threadpool->params.parallelize_6d_tile_2d.range_n; + const size_t range_m = threadpool->params.parallelize_6d_tile_2d.range_m; + const size_t range_k = threadpool->params.parallelize_6d_tile_2d.range_k; + while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < + range_threshold) { + task(argument, i, j, k, l, start_m, start_n, min(range_m - start_m, tile_m), + min(range_n - start_n, tile_n)); + start_n += tile_n; + if (start_n >= range_n) { + start_n = 0; + start_m += tile_m; + if (start_m >= range_m) { + start_m = 0; + if (++l == range_l.value) { + l = 0; + if (++k == range_k) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_decrement_fetch_relaxed_size_t( + &other_thread->range_length) < range_threshold) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ijkl_mn = + fxdiv_divide_size_t(linear_index, tile_range_mn); + const struct fxdiv_result_size_t index_ij_kl = + fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl); + const struct fxdiv_result_size_t tile_index_m_n = + fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_kl.quotient, range_j); + const struct fxdiv_result_size_t index_k_l = + fxdiv_divide_size_t(index_ij_kl.remainder, range_l); + const size_t start_m = tile_index_m_n.quotient * 
tile_m; + const size_t start_n = tile_index_m_n.remainder * tile_n; + task(argument, index_i_j.quotient, index_i_j.remainder, + index_k_l.quotient, index_k_l.remainder, start_m, start_n, + min(range_m - start_m, tile_m), min(range_n - start_n, tile_n)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } diff --git a/src/gcd.c b/src/gcd.c index ddd9af4..fddf499 100644 --- a/src/gcd.c +++ b/src/gcd.c @@ -1,3 +1,12 @@ +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + /* Standard C headers */ #include #include @@ -10,8 +19,8 @@ /* Mach headers */ #include -#include #include +#include /* Public library header */ #include @@ -22,115 +31,118 @@ #include "threadpool-utils.h" static void thread_main(void* arg, size_t thread_index) { - struct pthreadpool* threadpool = (struct pthreadpool*) arg; - struct thread_info* thread = &threadpool->threads[thread_index]; + struct pthreadpool* threadpool = (struct pthreadpool*)arg; + struct thread_info* thread = &threadpool->threads[thread_index]; - const uint32_t flags = pthreadpool_load_relaxed_uint32_t(&threadpool->flags); - const thread_function_t thread_function = - (thread_function_t) pthreadpool_load_relaxed_void_p(&threadpool->thread_function); + const uint32_t flags = pthreadpool_load_relaxed_uint32_t(&threadpool->flags); + const thread_function_t thread_function = + (thread_function_t)pthreadpool_load_relaxed_void_p( + &threadpool->thread_function); - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = 
get_fpu_state(); + disable_fpu_denormals(); + } - thread_function(threadpool, thread); + thread_function(threadpool, thread); - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } } struct pthreadpool* pthreadpool_create(size_t threads_count) { - if (threads_count == 0) { - int threads = 1; - size_t sizeof_threads = sizeof(threads); - if (sysctlbyname("hw.logicalcpu_max", &threads, &sizeof_threads, NULL, 0) != 0) { - return NULL; - } - - if (threads <= 0) { - return NULL; - } - - threads_count = (size_t) threads; - } - - struct pthreadpool* threadpool = pthreadpool_allocate(threads_count); - if (threadpool == NULL) { - return NULL; - } - threadpool->threads_count = fxdiv_init_size_t(threads_count); - for (size_t tid = 0; tid < threads_count; tid++) { - threadpool->threads[tid].thread_number = tid; - } - - /* Thread pool with a single thread computes everything on the caller thread. */ - if (threads_count > 1) { - threadpool->execution_semaphore = dispatch_semaphore_create(1); - } - return threadpool; + if (threads_count == 0) { + int threads = 1; + size_t sizeof_threads = sizeof(threads); + if (sysctlbyname("hw.logicalcpu_max", &threads, &sizeof_threads, NULL, 0) != + 0) { + return NULL; + } + + if (threads <= 0) { + return NULL; + } + + threads_count = (size_t)threads; + } + + struct pthreadpool* threadpool = pthreadpool_allocate(threads_count); + if (threadpool == NULL) { + return NULL; + } + threadpool->threads_count = fxdiv_init_size_t(threads_count); + for (size_t tid = 0; tid < threads_count; tid++) { + threadpool->threads[tid].thread_number = tid; + } + + /* Thread pool with a single thread computes everything on the caller thread. 
+ */ + if (threads_count > 1) { + threadpool->execution_semaphore = dispatch_semaphore_create(1); + } + return threadpool; } PTHREADPOOL_INTERNAL void pthreadpool_parallelize( - struct pthreadpool* threadpool, - thread_function_t thread_function, - const void* params, - size_t params_size, - void* task, - void* context, - size_t linear_range, - uint32_t flags) -{ - assert(threadpool != NULL); - assert(thread_function != NULL); - assert(task != NULL); - assert(linear_range > 1); - - /* Protect the global threadpool structures */ - dispatch_semaphore_wait(threadpool->execution_semaphore, DISPATCH_TIME_FOREVER); - - /* Setup global arguments */ - pthreadpool_store_relaxed_void_p(&threadpool->thread_function, (void*) thread_function); - pthreadpool_store_relaxed_void_p(&threadpool->task, task); - pthreadpool_store_relaxed_void_p(&threadpool->argument, context); - pthreadpool_store_relaxed_uint32_t(&threadpool->flags, flags); - - /* Locking of completion_mutex not needed: readers are sleeping on command_condvar */ - const struct fxdiv_divisor_size_t threads_count = threadpool->threads_count; - - if (params_size != 0) { - memcpy(&threadpool->params, params, params_size); - } - - /* Spread the work between threads */ - const struct fxdiv_result_size_t range_params = fxdiv_divide_size_t(linear_range, threads_count); - size_t range_start = 0; - for (size_t tid = 0; tid < threads_count.value; tid++) { - struct thread_info* thread = &threadpool->threads[tid]; - const size_t range_length = range_params.quotient + (size_t) (tid < range_params.remainder); - const size_t range_end = range_start + range_length; - pthreadpool_store_relaxed_size_t(&thread->range_start, range_start); - pthreadpool_store_relaxed_size_t(&thread->range_end, range_end); - pthreadpool_store_relaxed_size_t(&thread->range_length, range_length); - - /* The next subrange starts where the previous ended */ - range_start = range_end; - } - - dispatch_apply_f(threads_count.value, DISPATCH_APPLY_AUTO, threadpool, 
thread_main); - - /* Unprotect the global threadpool structures */ - dispatch_semaphore_signal(threadpool->execution_semaphore); + struct pthreadpool* threadpool, thread_function_t thread_function, + const void* params, size_t params_size, void* task, void* context, + size_t linear_range, uint32_t flags) { + assert(threadpool != NULL); + assert(thread_function != NULL); + assert(task != NULL); + assert(linear_range > 1); + + /* Protect the global threadpool structures */ + dispatch_semaphore_wait(threadpool->execution_semaphore, + DISPATCH_TIME_FOREVER); + + /* Setup global arguments */ + pthreadpool_store_relaxed_void_p(&threadpool->thread_function, + (void*)thread_function); + pthreadpool_store_relaxed_void_p(&threadpool->task, task); + pthreadpool_store_relaxed_void_p(&threadpool->argument, context); + pthreadpool_store_relaxed_uint32_t(&threadpool->flags, flags); + + /* Locking of completion_mutex not needed: readers are sleeping on + * command_condvar */ + const struct fxdiv_divisor_size_t threads_count = threadpool->threads_count; + + if (params_size != 0) { + memcpy(&threadpool->params, params, params_size); + } + + /* Spread the work between threads */ + const struct fxdiv_result_size_t range_params = + fxdiv_divide_size_t(linear_range, threads_count); + size_t range_start = 0; + for (size_t tid = 0; tid < threads_count.value; tid++) { + struct thread_info* thread = &threadpool->threads[tid]; + const size_t range_length = + range_params.quotient + (size_t)(tid < range_params.remainder); + const size_t range_end = range_start + range_length; + pthreadpool_store_relaxed_size_t(&thread->range_start, range_start); + pthreadpool_store_relaxed_size_t(&thread->range_end, range_end); + pthreadpool_store_relaxed_size_t(&thread->range_length, range_length); + + /* The next subrange starts where the previous ended */ + range_start = range_end; + } + + dispatch_apply_f(threads_count.value, DISPATCH_APPLY_AUTO, threadpool, + thread_main); + + /* Unprotect the global 
threadpool structures */ + dispatch_semaphore_signal(threadpool->execution_semaphore); } void pthreadpool_destroy(struct pthreadpool* threadpool) { - if (threadpool != NULL) { - if (threadpool->execution_semaphore != NULL) { - /* Release resources */ - dispatch_release(threadpool->execution_semaphore); - } - pthreadpool_deallocate(threadpool); - } + if (threadpool != NULL) { + if (threadpool->execution_semaphore != NULL) { + /* Release resources */ + dispatch_release(threadpool->execution_semaphore); + } + pthreadpool_deallocate(threadpool); + } } diff --git a/src/legacy-api.c b/src/legacy-api.c index 8d5a6fd..b573511 100644 --- a/src/legacy-api.c +++ b/src/legacy-api.c @@ -1,3 +1,12 @@ +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + /* Standard C headers */ #include @@ -8,219 +17,209 @@ #include /* Internal library headers */ +#include "threadpool-common.h" #include "threadpool-utils.h" - -void pthreadpool_compute_1d( - pthreadpool_t threadpool, - pthreadpool_function_1d_t function, - void* argument, - size_t range) -{ - pthreadpool_parallelize_1d(threadpool, - (pthreadpool_task_1d_t) function, argument, - range, 0 /* flags */); +PTHREADPOOL_WEAK void pthreadpool_compute_1d(pthreadpool_t threadpool, + pthreadpool_function_1d_t function, + void* argument, size_t range) { + pthreadpool_parallelize_1d(threadpool, (pthreadpool_task_1d_t)function, + argument, range, 0 /* flags */); } -void pthreadpool_compute_1d_tiled( - pthreadpool_t threadpool, - pthreadpool_function_1d_tiled_t function, - void* argument, - size_t range, - size_t tile) -{ - pthreadpool_parallelize_1d_tile_1d(threadpool, - (pthreadpool_task_1d_tile_1d_t) function, argument, - range, tile, 0 /* flags */); +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_compute_1d) + 
+PTHREADPOOL_WEAK void pthreadpool_compute_1d_tiled( + pthreadpool_t threadpool, pthreadpool_function_1d_tiled_t function, + void* argument, size_t range, size_t tile) { + pthreadpool_parallelize_1d_tile_1d(threadpool, + (pthreadpool_task_1d_tile_1d_t)function, + argument, range, tile, 0 /* flags */); } -void pthreadpool_compute_2d( - pthreadpool_t threadpool, - pthreadpool_function_2d_t function, - void* argument, - size_t range_i, - size_t range_j) -{ - pthreadpool_parallelize_2d(threadpool, - (pthreadpool_task_2d_t) function, argument, - range_i, range_j, 0 /* flags */); +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_compute_1d_tiled) + +PTHREADPOOL_WEAK void pthreadpool_compute_2d(pthreadpool_t threadpool, + pthreadpool_function_2d_t function, + void* argument, size_t range_i, + size_t range_j) { + pthreadpool_parallelize_2d(threadpool, (pthreadpool_task_2d_t)function, + argument, range_i, range_j, 0 /* flags */); } -void pthreadpool_compute_2d_tiled( - pthreadpool_t threadpool, - pthreadpool_function_2d_tiled_t function, - void* argument, - size_t range_i, - size_t range_j, - size_t tile_i, - size_t tile_j) -{ - pthreadpool_parallelize_2d_tile_2d(threadpool, - (pthreadpool_task_2d_tile_2d_t) function, argument, - range_i, range_j, tile_i, tile_j, 0 /* flags */); +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_compute_2d) + +PTHREADPOOL_WEAK void pthreadpool_compute_2d_tiled( + pthreadpool_t threadpool, pthreadpool_function_2d_tiled_t function, + void* argument, size_t range_i, size_t range_j, size_t tile_i, + size_t tile_j) { + pthreadpool_parallelize_2d_tile_2d( + threadpool, (pthreadpool_task_2d_tile_2d_t)function, argument, range_i, + range_j, tile_i, tile_j, 0 /* flags */); } +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_compute_2d_tiled) + struct compute_3d_tiled_context { - pthreadpool_function_3d_tiled_t function; - void* argument; - struct fxdiv_divisor_size_t tile_range_j; - struct fxdiv_divisor_size_t tile_range_k; - size_t range_i; - size_t range_j; - size_t range_k; - size_t 
tile_i; - size_t tile_j; - size_t tile_k; + pthreadpool_function_3d_tiled_t function; + void* argument; + struct fxdiv_divisor_size_t tile_range_j; + struct fxdiv_divisor_size_t tile_range_k; + size_t range_i; + size_t range_j; + size_t range_k; + size_t tile_i; + size_t tile_j; + size_t tile_k; }; -static void compute_3d_tiled(const struct compute_3d_tiled_context* context, size_t linear_index) { - const struct fxdiv_divisor_size_t tile_range_k = context->tile_range_k; - const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k); - const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); - const size_t max_tile_i = context->tile_i; - const size_t max_tile_j = context->tile_j; - const size_t max_tile_k = context->tile_k; - const size_t index_i = tile_index_i_j.quotient * max_tile_i; - const size_t index_j = tile_index_i_j.remainder * max_tile_j; - const size_t index_k = tile_index_ij_k.remainder * max_tile_k; - const size_t tile_i = min(max_tile_i, context->range_i - index_i); - const size_t tile_j = min(max_tile_j, context->range_j - index_j); - const size_t tile_k = min(max_tile_k, context->range_k - index_k); - context->function(context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k); +static void compute_3d_tiled(const struct compute_3d_tiled_context* context, + size_t linear_index) { + const struct fxdiv_divisor_size_t tile_range_k = context->tile_range_k; + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(linear_index, tile_range_k); + const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); + const size_t max_tile_i = context->tile_i; + const size_t max_tile_j = context->tile_j; + const size_t max_tile_k = context->tile_k; + 
const size_t index_i = tile_index_i_j.quotient * max_tile_i; + const size_t index_j = tile_index_i_j.remainder * max_tile_j; + const size_t index_k = tile_index_ij_k.remainder * max_tile_k; + const size_t tile_i = min(max_tile_i, context->range_i - index_i); + const size_t tile_j = min(max_tile_j, context->range_j - index_j); + const size_t tile_k = min(max_tile_k, context->range_k - index_k); + context->function(context->argument, index_i, index_j, index_k, tile_i, + tile_j, tile_k); } -void pthreadpool_compute_3d_tiled( - pthreadpool_t threadpool, - pthreadpool_function_3d_tiled_t function, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_i, - size_t tile_j, - size_t tile_k) -{ - if (pthreadpool_get_threads_count(threadpool) <= 1) { - /* No thread pool used: execute function sequentially on the calling thread */ - for (size_t i = 0; i < range_i; i += tile_i) { - for (size_t j = 0; j < range_j; j += tile_j) { - for (size_t k = 0; k < range_k; k += tile_k) { - function(argument, i, j, k, min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k)); - } - } - } - } else { - /* Execute in parallel on the thread pool using linearized index */ - const size_t tile_range_i = divide_round_up(range_i, tile_i); - const size_t tile_range_j = divide_round_up(range_j, tile_j); - const size_t tile_range_k = divide_round_up(range_k, tile_k); - struct compute_3d_tiled_context context = { - .function = function, - .argument = argument, - .tile_range_j = fxdiv_init_size_t(tile_range_j), - .tile_range_k = fxdiv_init_size_t(tile_range_k), - .range_i = range_i, - .range_j = range_j, - .range_k = range_k, - .tile_i = tile_i, - .tile_j = tile_j, - .tile_k = tile_k - }; - pthreadpool_parallelize_1d(threadpool, - (pthreadpool_task_1d_t) compute_3d_tiled, &context, - tile_range_i * tile_range_j * tile_range_k, - 0 /* flags */); - } +PTHREADPOOL_WEAK void pthreadpool_compute_3d_tiled( + pthreadpool_t threadpool, 
pthreadpool_function_3d_tiled_t function, + void* argument, size_t range_i, size_t range_j, size_t range_k, + size_t tile_i, size_t tile_j, size_t tile_k) { + if (pthreadpool_get_threads_count(threadpool) <= 1) { + /* No thread pool used: execute function sequentially on the calling thread + */ + for (size_t i = 0; i < range_i; i += tile_i) { + for (size_t j = 0; j < range_j; j += tile_j) { + for (size_t k = 0; k < range_k; k += tile_k) { + function(argument, i, j, k, min(range_i - i, tile_i), + min(range_j - j, tile_j), min(range_k - k, tile_k)); + } + } + } + } else { + /* Execute in parallel on the thread pool using linearized index */ + const size_t tile_range_i = divide_round_up(range_i, tile_i); + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range_k = divide_round_up(range_k, tile_k); + struct compute_3d_tiled_context context = { + .function = function, + .argument = argument, + .tile_range_j = fxdiv_init_size_t(tile_range_j), + .tile_range_k = fxdiv_init_size_t(tile_range_k), + .range_i = range_i, + .range_j = range_j, + .range_k = range_k, + .tile_i = tile_i, + .tile_j = tile_j, + .tile_k = tile_k}; + pthreadpool_parallelize_1d( + threadpool, (pthreadpool_task_1d_t)compute_3d_tiled, &context, + tile_range_i * tile_range_j * tile_range_k, 0 /* flags */); + } } +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_compute_3d_tiled) + struct compute_4d_tiled_context { - pthreadpool_function_4d_tiled_t function; - void* argument; - struct fxdiv_divisor_size_t tile_range_kl; - struct fxdiv_divisor_size_t tile_range_j; - struct fxdiv_divisor_size_t tile_range_l; - size_t range_i; - size_t range_j; - size_t range_k; - size_t range_l; - size_t tile_i; - size_t tile_j; - size_t tile_k; - size_t tile_l; + pthreadpool_function_4d_tiled_t function; + void* argument; + struct fxdiv_divisor_size_t tile_range_kl; + struct fxdiv_divisor_size_t tile_range_j; + struct fxdiv_divisor_size_t tile_range_l; + size_t range_i; + size_t range_j; + size_t 
range_k; + size_t range_l; + size_t tile_i; + size_t tile_j; + size_t tile_k; + size_t tile_l; }; -static void compute_4d_tiled(const struct compute_4d_tiled_context* context, size_t linear_index) { - const struct fxdiv_divisor_size_t tile_range_kl = context->tile_range_kl; - const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl); - const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, tile_range_j); - const struct fxdiv_divisor_size_t tile_range_l = context->tile_range_l; - const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); - const size_t max_tile_i = context->tile_i; - const size_t max_tile_j = context->tile_j; - const size_t max_tile_k = context->tile_k; - const size_t max_tile_l = context->tile_l; - const size_t index_i = tile_index_i_j.quotient * max_tile_i; - const size_t index_j = tile_index_i_j.remainder * max_tile_j; - const size_t index_k = tile_index_k_l.quotient * max_tile_k; - const size_t index_l = tile_index_k_l.remainder * max_tile_l; - const size_t tile_i = min(max_tile_i, context->range_i - index_i); - const size_t tile_j = min(max_tile_j, context->range_j - index_j); - const size_t tile_k = min(max_tile_k, context->range_k - index_k); - const size_t tile_l = min(max_tile_l, context->range_l - index_l); - context->function(context->argument, index_i, index_j, index_k, index_l, tile_i, tile_j, tile_k, tile_l); +static void compute_4d_tiled(const struct compute_4d_tiled_context* context, + size_t linear_index) { + const struct fxdiv_divisor_size_t tile_range_kl = context->tile_range_kl; + const struct fxdiv_result_size_t tile_index_ij_kl = + fxdiv_divide_size_t(linear_index, tile_range_kl); + const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; + const struct fxdiv_result_size_t tile_index_i_j = + 
fxdiv_divide_size_t(tile_index_ij_kl.quotient, tile_range_j); + const struct fxdiv_divisor_size_t tile_range_l = context->tile_range_l; + const struct fxdiv_result_size_t tile_index_k_l = + fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); + const size_t max_tile_i = context->tile_i; + const size_t max_tile_j = context->tile_j; + const size_t max_tile_k = context->tile_k; + const size_t max_tile_l = context->tile_l; + const size_t index_i = tile_index_i_j.quotient * max_tile_i; + const size_t index_j = tile_index_i_j.remainder * max_tile_j; + const size_t index_k = tile_index_k_l.quotient * max_tile_k; + const size_t index_l = tile_index_k_l.remainder * max_tile_l; + const size_t tile_i = min(max_tile_i, context->range_i - index_i); + const size_t tile_j = min(max_tile_j, context->range_j - index_j); + const size_t tile_k = min(max_tile_k, context->range_k - index_k); + const size_t tile_l = min(max_tile_l, context->range_l - index_l); + context->function(context->argument, index_i, index_j, index_k, index_l, + tile_i, tile_j, tile_k, tile_l); } -void pthreadpool_compute_4d_tiled( - pthreadpool_t threadpool, - pthreadpool_function_4d_tiled_t function, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t tile_i, - size_t tile_j, - size_t tile_k, - size_t tile_l) -{ - if (pthreadpool_get_threads_count(threadpool) <= 1) { - /* No thread pool used: execute function sequentially on the calling thread */ - for (size_t i = 0; i < range_i; i += tile_i) { - for (size_t j = 0; j < range_j; j += tile_j) { - for (size_t k = 0; k < range_k; k += tile_k) { - for (size_t l = 0; l < range_l; l += tile_l) { - function(argument, i, j, k, l, - min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k), min(range_l - l, tile_l)); - } - } - } - } - } else { - /* Execute in parallel on the thread pool using linearized index */ - const size_t tile_range_i = divide_round_up(range_i, tile_i); - const size_t 
tile_range_j = divide_round_up(range_j, tile_j); - const size_t tile_range_k = divide_round_up(range_k, tile_k); - const size_t tile_range_l = divide_round_up(range_l, tile_l); - struct compute_4d_tiled_context context = { - .function = function, - .argument = argument, - .tile_range_kl = fxdiv_init_size_t(tile_range_k * tile_range_l), - .tile_range_j = fxdiv_init_size_t(tile_range_j), - .tile_range_l = fxdiv_init_size_t(tile_range_l), - .range_i = range_i, - .range_j = range_j, - .range_k = range_k, - .range_l = range_l, - .tile_i = tile_i, - .tile_j = tile_j, - .tile_k = tile_k, - .tile_l = tile_l - }; - pthreadpool_parallelize_1d(threadpool, - (pthreadpool_task_1d_t) compute_4d_tiled, &context, - tile_range_i * tile_range_j * tile_range_k * tile_range_l, - 0 /* flags */); - } +PTHREADPOOL_WEAK void pthreadpool_compute_4d_tiled( + pthreadpool_t threadpool, pthreadpool_function_4d_tiled_t function, + void* argument, size_t range_i, size_t range_j, size_t range_k, + size_t range_l, size_t tile_i, size_t tile_j, size_t tile_k, + size_t tile_l) { + if (pthreadpool_get_threads_count(threadpool) <= 1) { + /* No thread pool used: execute function sequentially on the calling thread + */ + for (size_t i = 0; i < range_i; i += tile_i) { + for (size_t j = 0; j < range_j; j += tile_j) { + for (size_t k = 0; k < range_k; k += tile_k) { + for (size_t l = 0; l < range_l; l += tile_l) { + function(argument, i, j, k, l, min(range_i - i, tile_i), + min(range_j - j, tile_j), min(range_k - k, tile_k), + min(range_l - l, tile_l)); + } + } + } + } + } else { + /* Execute in parallel on the thread pool using linearized index */ + const size_t tile_range_i = divide_round_up(range_i, tile_i); + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range_k = divide_round_up(range_k, tile_k); + const size_t tile_range_l = divide_round_up(range_l, tile_l); + struct compute_4d_tiled_context context = { + .function = function, + .argument = argument, + 
.tile_range_kl = fxdiv_init_size_t(tile_range_k * tile_range_l), + .tile_range_j = fxdiv_init_size_t(tile_range_j), + .tile_range_l = fxdiv_init_size_t(tile_range_l), + .range_i = range_i, + .range_j = range_j, + .range_k = range_k, + .range_l = range_l, + .tile_i = tile_i, + .tile_j = tile_j, + .tile_k = tile_k, + .tile_l = tile_l}; + pthreadpool_parallelize_1d( + threadpool, (pthreadpool_task_1d_t)compute_4d_tiled, &context, + tile_range_i * tile_range_j * tile_range_k * tile_range_l, + 0 /* flags */); + } } + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_compute_4d_tiled) diff --git a/src/memory.c b/src/memory.c index fc0d83e..509851f 100644 --- a/src/memory.c +++ b/src/memory.c @@ -1,3 +1,12 @@ +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + /* Standard C headers */ #include #include @@ -6,61 +15,74 @@ /* POSIX headers */ #ifdef __ANDROID__ - #include +#include #endif /* Windows headers */ #ifdef _WIN32 - #include +#include #endif /* Internal library headers */ #include "threadpool-common.h" #include "threadpool-object.h" +#if defined(__has_builtin) +#if __has_builtin(__builtin_available) +#define PTHREADPOOL_BUILTIN_AVAILABLE +#endif +#endif + +extern int posix_memalign(void **memptr, size_t alignment, size_t size); PTHREADPOOL_INTERNAL struct pthreadpool* pthreadpool_allocate( - size_t threads_count) -{ - assert(threads_count >= 1); + size_t threads_count) { + assert(threads_count >= 1); - const size_t threadpool_size = sizeof(struct pthreadpool) + threads_count * sizeof(struct thread_info); - struct pthreadpool* threadpool = NULL; - #if defined(__ANDROID__) - /* - * Android didn't get posix_memalign until API level 17 (Android 4.2). - * Use (otherwise obsolete) memalign function on Android platform. 
- */ - threadpool = memalign(PTHREADPOOL_CACHELINE_SIZE, threadpool_size); - if (threadpool == NULL) { - return NULL; - } - #elif defined(_WIN32) - threadpool = _aligned_malloc(threadpool_size, PTHREADPOOL_CACHELINE_SIZE); - if (threadpool == NULL) { - return NULL; - } - #else - if (posix_memalign((void**) &threadpool, PTHREADPOOL_CACHELINE_SIZE, threadpool_size) != 0) { - return NULL; - } - #endif - memset(threadpool, 0, threadpool_size); - return threadpool; + const size_t threadpool_size = + sizeof(struct pthreadpool) + threads_count * sizeof(struct thread_info); + struct pthreadpool* threadpool = NULL; +#if defined(__ANDROID__) + /* + * Android didn't get posix_memalign until API level 17 (Android 4.2). + * Use (otherwise obsolete) memalign function on Android platform. + */ + threadpool = memalign(PTHREADPOOL_CACHELINE_SIZE, threadpool_size); +#elif defined(_WIN32) + threadpool = _aligned_malloc(threadpool_size, PTHREADPOOL_CACHELINE_SIZE); +#elif _POSIX_C_SOURCE >= 200112L || defined(__hexagon__) + if (posix_memalign((void**)&threadpool, PTHREADPOOL_CACHELINE_SIZE, + threadpool_size) != 0) { + return NULL; + } +#elif defined(PTHREADPOOL_BUILTIN_AVAILABLE) + if (__builtin_available(macOS 10.15, iOS 13, *)) { + threadpool = aligned_alloc(PTHREADPOOL_CACHELINE_SIZE, threadpool_size); + } else { + threadpool = malloc(threadpool_size); + } +#else + threadpool = aligned_alloc(PTHREADPOOL_CACHELINE_SIZE, threadpool_size); +#endif + if (threadpool == NULL) { + return NULL; + } + memset(threadpool, 0, threadpool_size); + return threadpool; } - PTHREADPOOL_INTERNAL void pthreadpool_deallocate( - struct pthreadpool* threadpool) -{ - assert(threadpool != NULL); + struct pthreadpool* threadpool) { + assert(threadpool != NULL); - const size_t threadpool_size = sizeof(struct pthreadpool) + threadpool->threads_count.value * sizeof(struct thread_info); - memset(threadpool, 0, threadpool_size); + const size_t threadpool_size = + sizeof(struct pthreadpool) + + 
threadpool->threads_count.value * sizeof(struct thread_info); + memset(threadpool, 0, threadpool_size); - #ifdef _WIN32 - _aligned_free(threadpool); - #else - free(threadpool); - #endif +#ifdef _WIN32 + _aligned_free(threadpool); +#else + free(threadpool); +#endif } diff --git a/src/portable-api.c b/src/portable-api.c index 7cd1970..fbb0622 100644 --- a/src/portable-api.c +++ b/src/portable-api.c @@ -1,12 +1,22 @@ +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + /* Standard C headers */ #include #include +#include #include #include #include #if PTHREADPOOL_USE_CPUINFO - #include +#include #endif /* Dependencies */ @@ -17,3095 +27,4625 @@ /* Internal library headers */ #include "threadpool-atomics.h" +#include "threadpool-common.h" #include "threadpool-object.h" #include "threadpool-utils.h" +#define PTHREADPOOL_DEFAULT_FASTEST_TO_SLOWEST_RATIO 2 +#define PTHREADPOOL_MAX_FASTEST_TO_SLOWEST_RATIO 4 + +static size_t get_fastest_to_slowest_ratio() { +#if PTHREADPOOL_USE_CPUINFO + // If we are not the fastest core, assume that we are at most 4x slower + // than the fastest core. + return cpuinfo_get_current_uarch_index_with_default(0) > 0 + ? PTHREADPOOL_MAX_FASTEST_TO_SLOWEST_RATIO + : PTHREADPOOL_DEFAULT_FASTEST_TO_SLOWEST_RATIO; +#else + return PTHREADPOOL_DEFAULT_FASTEST_TO_SLOWEST_RATIO; +#endif // PTHREADPOOL_USE_CPUINFO +} + +static size_t get_chunk(pthreadpool_atomic_size_t* num_tiles, + size_t fastest_to_slowest_ratio) { + /* Check whether there are any tiles left. */ + size_t curr_num_tiles = pthreadpool_load_relaxed_size_t(num_tiles); + if (*(ptrdiff_t*)&curr_num_tiles <= 0) { + return 0; + } + + /* Choose a chunk size based on the global remaining amount of work and the + * current number of threads. 
*/ + size_t chunk_size = max(curr_num_tiles / fastest_to_slowest_ratio, 1); + curr_num_tiles = + pthreadpool_fetch_decrement_n_relaxed_size_t(num_tiles, chunk_size); + if (*(ptrdiff_t*)&curr_num_tiles <= 0) { + return 0; + } + return min(chunk_size, curr_num_tiles); +} + +PTHREADPOOL_WEAK size_t +pthreadpool_get_threads_count(struct pthreadpool* threadpool) { + if (threadpool == NULL) { + return 1; + } + + return threadpool->threads_count.value; +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_get_threads_count) + +static void thread_parallelize_1d(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_1d_t task = + (pthreadpool_task_1d_t)pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, range_start++); + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + task(argument, index); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_1d_with_thread(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_1d_with_thread_t 
task = + (pthreadpool_task_1d_with_thread_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const size_t thread_number = thread->thread_number; + /* Process thread's own range of items */ + size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, thread_number, range_start++); + } + + /* There still may be other threads with work */ + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + task(argument, thread_number, index); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_1d_with_uarch(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_1d_with_id_t task = + (pthreadpool_task_1d_with_id_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const uint32_t default_uarch_index = + threadpool->params.parallelize_1d_with_uarch.default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > + threadpool->params.parallelize_1d_with_uarch.max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + /* Process thread's own range of items */ + size_t range_start = 
pthreadpool_load_relaxed_size_t(&thread->range_start); + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, uarch_index, range_start++); + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + task(argument, uarch_index, index); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_1d_tile_1d(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_1d_tile_1d_t task = + (pthreadpool_task_1d_tile_1d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const size_t tile = threadpool->params.parallelize_1d_tile_1d.tile; + size_t tile_start = range_start * tile; + + const size_t range = threadpool->params.parallelize_1d_tile_1d.range; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, tile_start, min(range - tile_start, tile)); + tile_start += tile; + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != 
thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t tile_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const size_t tile_start = tile_index * tile; + task(argument, tile_start, min(range - tile_start, tile)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_1d_tile_1d_dynamic( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + // Get a handle on the params. + struct pthreadpool_1d_tile_1d_dynamic_params* params = + &threadpool->params.parallelize_1d_tile_1d_dynamic; + const size_t num_threads = threadpool->threads_count.value; + const size_t range_i = params->range; + const size_t tile_i = params->tile; + const pthreadpool_task_1d_tile_1d_dynamic_t task = + (pthreadpool_task_1d_tile_1d_dynamic_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + const size_t thread_number = thread->thread_number; + const size_t fastest_to_slowest_ratio = get_fastest_to_slowest_ratio(); + + // Do tiles in our own range first (tid = 0), then the other ranges when we're + // done. + for (size_t tid = 0; tid < num_threads; tid++) { + struct thread_info* thread = + &threadpool->threads[(num_threads + thread_number - tid) % num_threads]; + + size_t offset = + (tid == 0) ? pthreadpool_load_relaxed_size_t(&thread->range_start) : 0; + + /* Loop as long as there is work to be done. */ + while (true) { + /* Choose a chunk size based on the remaining amount of work and the + * current number of threads. 
*/ + size_t chunk_size = + get_chunk(&thread->range_length, fastest_to_slowest_ratio); + if (!chunk_size) { + break; + } + + /* If this is "our" range, take chunks of tiles from the front, otherwise + * take them from the back. */ + if (tid != 0) { + offset = pthreadpool_decrement_n_fetch_relaxed_size_t( + &thread->range_end, chunk_size); + } + + /* Call the task function. */ + const size_t index_i = offset * tile_i; + const size_t step_i = min(tile_i * chunk_size, range_i - index_i); + task(argument, index_i, step_i); + offset += chunk_size; + } + } + + /* Make changes by this thread visible to other threads. */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_2d(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_2d_t task = + (pthreadpool_task_2d_t)pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_2d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(range_start, range_j); + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, i, j); + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + 
pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(linear_index, range_j); + task(argument, index_i_j.quotient, index_i_j.remainder); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_2d_with_thread(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_2d_with_thread_t task = + (pthreadpool_task_2d_with_thread_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_2d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(range_start, range_j); + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + + const size_t thread_number = thread->thread_number; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, thread_number, i, j); + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + + /* There still may be other threads with work */ + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t index_i_j = + 
fxdiv_divide_size_t(linear_index, range_j); + task(argument, thread_number, index_i_j.quotient, index_i_j.remainder); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} -size_t pthreadpool_get_threads_count(struct pthreadpool* threadpool) { - if (threadpool == NULL) { - return 1; - } - - return threadpool->threads_count.value; -} - -static void thread_parallelize_1d(struct pthreadpool* threadpool, struct thread_info* thread) { - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_1d_t task = (pthreadpool_task_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - /* Process thread's own range of items */ - size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { - task(argument, range_start++); - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - const size_t threads_count = threadpool->threads_count.value; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { - const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - task(argument, index); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); -} - -static void thread_parallelize_1d_with_thread(struct pthreadpool* threadpool, struct thread_info* thread) { - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_1d_with_thread_t task = (pthreadpool_task_1d_with_thread_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = 
pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const size_t thread_number = thread->thread_number; - /* Process thread's own range of items */ - size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { - task(argument, thread_number, range_start++); - } - - /* There still may be other threads with work */ - const size_t threads_count = threadpool->threads_count.value; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { - const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - task(argument, thread_number, index); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); -} - -static void thread_parallelize_1d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) { - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_1d_with_id_t task = (pthreadpool_task_1d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - const uint32_t default_uarch_index = threadpool->params.parallelize_1d_with_uarch.default_uarch_index; - uint32_t uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > threadpool->params.parallelize_1d_with_uarch.max_uarch_index) { - uarch_index = default_uarch_index; - } - #endif - - /* Process thread's own range of items */ - size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { - task(argument, 
uarch_index, range_start++); - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - const size_t threads_count = threadpool->threads_count.value; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { - const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - task(argument, uarch_index, index); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); -} - -static void thread_parallelize_1d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) { - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_1d_tile_1d_t task = (pthreadpool_task_1d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const size_t tile = threadpool->params.parallelize_1d_tile_1d.tile; - size_t tile_start = range_start * tile; - - const size_t range = threadpool->params.parallelize_1d_tile_1d.range; - while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { - task(argument, tile_start, min(range - tile_start, tile)); - tile_start += tile; - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - const size_t threads_count = threadpool->threads_count.value; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while 
(pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { - const size_t tile_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const size_t tile_start = tile_index * tile; - task(argument, tile_start, min(range - tile_start, tile)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); -} - -static void thread_parallelize_2d(struct pthreadpool* threadpool, struct thread_info* thread) { - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_2d_t task = (pthreadpool_task_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_2d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(range_start, range_j); - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - - while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { - task(argument, i, j); - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - const size_t threads_count = threadpool->threads_count.value; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(linear_index, range_j); - task(argument, index_i_j.quotient, index_i_j.remainder); - } - } - - /* Make 
changes by this thread visible to other threads */ - pthreadpool_fence_release(); -} - -static void thread_parallelize_2d_with_thread(struct pthreadpool* threadpool, struct thread_info* thread) { - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_2d_with_thread_t task = (pthreadpool_task_2d_with_thread_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_2d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(range_start, range_j); - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - - const size_t thread_number = thread->thread_number; - while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { - task(argument, thread_number, i, j); - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - - /* There still may be other threads with work */ - const size_t threads_count = threadpool->threads_count.value; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(linear_index, range_j); - task(argument, thread_number, index_i_j.quotient, index_i_j.remainder); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); -} - -static void thread_parallelize_2d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) { - assert(threadpool != NULL); - 
assert(thread != NULL); - - const pthreadpool_task_2d_tile_1d_t task = (pthreadpool_task_2d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_1d.tile_range_j; - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j); - const size_t tile_j = threadpool->params.parallelize_2d_tile_1d.tile_j; - size_t i = tile_index_i_j.quotient; - size_t start_j = tile_index_i_j.remainder * tile_j; - - const size_t range_j = threadpool->params.parallelize_2d_tile_1d.range_j; - while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { - task(argument, i, start_j, min(range_j - start_j, tile_j)); - start_j += tile_j; - if (start_j >= range_j) { - start_j = 0; - i += 1; - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - const size_t threads_count = threadpool->threads_count.value; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j); - const size_t start_j = tile_index_i_j.remainder * tile_j; - task(argument, tile_index_i_j.quotient, start_j, min(range_j - start_j, tile_j)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); -} - -static void 
thread_parallelize_2d_tile_1d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
  /*
   * Worker body for the 2D / 1-D-tiled / uarch-aware parallelization.
   * Each unit of work is one tile of the j dimension; a linear tile index is
   * decoded into (i, start_j) with fxdiv.  The thread first drains its own
   * pre-assigned slice, then steals leftover tiles from the other threads'
   * slices, and finally publishes its writes with a release fence.  Every
   * thread_parallelize_* variant below follows this same three-phase shape.
   */
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_2d_tile_1d_with_id_t task = (pthreadpool_task_2d_tile_1d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
  void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  /* Resolve the microarchitecture index passed to the task; fall back to the
   * default when the reported index exceeds the task's max_uarch_index. */
  const uint32_t default_uarch_index = threadpool->params.parallelize_2d_tile_1d_with_uarch.default_uarch_index;
  uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
  uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
  if (uarch_index > threadpool->params.parallelize_2d_tile_1d_with_uarch.max_uarch_index) {
    uarch_index = default_uarch_index;
  }
#endif

  /* Process thread's own range of items */
  const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_range_j;
  const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
  const size_t tile_j = threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_j;
  size_t i = tile_index_i_j.quotient;
  size_t start_j = tile_index_i_j.remainder * tile_j;

  const size_t range_j = threadpool->params.parallelize_2d_tile_1d_with_uarch.range_j;
  /* Each successful decrement of range_length claims exactly one tile. */
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    /* The last tile in a row may be partial: clamp its extent to range_j. */
    task(argument, uarch_index, i, start_j, min(range_j - start_j, tile_j));
    start_j += tile_j;
    if (start_j >= range_j) {
      start_j = 0;
      i += 1;
    }
  }

  /* There still may be other threads with work */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number;
       tid = modulo_decrement(tid, threads_count))
  {
    struct thread_info* other_thread = &threadpool->threads[tid];
    /* Steal from the victim's tail: claim via range_length, then take the
     * index by decrementing range_end (owner consumes from the front). */
    while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
      const size_t start_j = tile_index_i_j.remainder * tile_j;
      task(argument, uarch_index, tile_index_i_j.quotient, start_j, min(range_j - start_j, tile_j));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/* Same as thread_parallelize_2d_tile_1d_with_uarch, but the task additionally
 * receives the number of the thread that executes it. */
static void thread_parallelize_2d_tile_1d_with_uarch_with_thread(struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_2d_tile_1d_with_id_with_thread_t task =
      (pthreadpool_task_2d_tile_1d_with_id_with_thread_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
  void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  /* Note: this variant shares the parallelize_2d_tile_1d_with_uarch params. */
  const uint32_t default_uarch_index = threadpool->params.parallelize_2d_tile_1d_with_uarch.default_uarch_index;
  uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
  uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
  if (uarch_index > threadpool->params.parallelize_2d_tile_1d_with_uarch.max_uarch_index) {
    uarch_index = default_uarch_index;
  }
#endif

  /* Process thread's own range of items */
  const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_range_j;
  const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
  const size_t tile_j = threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_j;
  size_t i = tile_index_i_j.quotient;
  size_t start_j = tile_index_i_j.remainder * tile_j;

  const size_t thread_number = thread->thread_number;
  const size_t range_j = threadpool->params.parallelize_2d_tile_1d_with_uarch.range_j;
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, uarch_index, thread_number, i, start_j, min(range_j - start_j, tile_j));
    start_j += tile_j;
    if (start_j >= range_j) {
      start_j = 0;
      i += 1;
    }
  }

  /* There still may be other threads with work */
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number;
       tid = modulo_decrement(tid, threads_count))
  {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
      const size_t start_j = tile_index_i_j.remainder * tile_j;
      task(argument, uarch_index, thread_number, tile_index_i_j.quotient, start_j, min(range_j - start_j, tile_j));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/* 2D variant tiled in BOTH dimensions: each unit of work is a
 * (tile_i x tile_j) tile decoded from a linear tile index. */
static void thread_parallelize_2d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_2d_tile_2d_t task = (pthreadpool_task_2d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
  void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  /* Process thread's own range of items */
  const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d.tile_range_j;
  const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
  const size_t tile_i = threadpool->params.parallelize_2d_tile_2d.tile_i;
  const size_t tile_j = threadpool->params.parallelize_2d_tile_2d.tile_j;
  size_t start_i = tile_index_i_j.quotient * tile_i;
  size_t start_j = tile_index_i_j.remainder * tile_j;

  const size_t range_i = threadpool->params.parallelize_2d_tile_2d.range_i;
  const size_t range_j = threadpool->params.parallelize_2d_tile_2d.range_j;
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    /* Edge tiles may be partial in either dimension; clamp both extents. */
    task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
    start_j += tile_j;
    if (start_j >= range_j) {
      start_j = 0;
      start_i += tile_i;
    }
  }

  /* There still may be other threads with work */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number;
       tid = modulo_decrement(tid, threads_count))
  {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
      const size_t start_i = tile_index_i_j.quotient * tile_i;
      const size_t start_j = tile_index_i_j.remainder * tile_j;
      task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/* 2D / 2-D-tiled variant that also passes a uarch index to the task. */
static void thread_parallelize_2d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_2d_tile_2d_with_id_t task = (pthreadpool_task_2d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
  void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  const uint32_t default_uarch_index = threadpool->params.parallelize_2d_tile_2d_with_uarch.default_uarch_index;
  uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
  uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
  if (uarch_index > threadpool->params.parallelize_2d_tile_2d_with_uarch.max_uarch_index) {
    uarch_index = default_uarch_index;
  }
#endif

  /* Process thread's own range of items */
  const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_range_j;
  const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_result_size_t index = fxdiv_divide_size_t(range_start, tile_range_j);
  const size_t range_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_i;
  const size_t tile_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_i;
  const size_t range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_j;
  const size_t tile_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_j;
  size_t start_i = index.quotient * tile_i;
  size_t start_j = index.remainder * tile_j;

  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
    start_j += tile_j;
    if (start_j >= range_j) {
      start_j = 0;
      start_i += tile_i;
    }
  }

  /* There still may be other threads with work */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number;
       tid = modulo_decrement(tid, threads_count))
  {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
      const size_t start_i = tile_index_i_j.quotient * tile_i;
      const size_t start_j = tile_index_i_j.remainder * tile_j;
      task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/* Plain 3D variant: linear index (i*range_j + j)*range_k + k is decoded with
 * two fxdiv divisions, and the (i, j, k) counters are advanced in-place. */
static void thread_parallelize_3d(struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_3d_t task = (pthreadpool_task_3d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
  void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  /* Process thread's own range of items */
  const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_3d.range_k;
  const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(range_start, range_k);
  const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d.range_j;
  const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
  size_t i = index_i_j.quotient;
  size_t j = index_i_j.remainder;
  size_t k = index_ij_k.remainder;

  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, i, j, k);
    /* Odometer-style increment: k fastest, then j, then i. */
    if (++k == range_k.value) {
      k = 0;
      if (++j == range_j.value) {
        j = 0;
        i += 1;
      }
    }
  }

  /* There still may be other threads with work */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number;
       tid = modulo_decrement(tid, threads_count))
  {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(linear_index, range_k);
      const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
      task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder);
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/* 3D variant tiled along the innermost (k) dimension only. */
static void thread_parallelize_3d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_3d_tile_1d_t task = (pthreadpool_task_3d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
  void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  /* Process thread's own range of items */
  const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d.tile_range_k;
  const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
  const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d.range_j;
  const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
  const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k;
  size_t i = index_i_j.quotient;
  size_t j = index_i_j.remainder;
  size_t start_k = tile_index_ij_k.remainder * tile_k;

  const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k;
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, i, j, start_k, min(range_k - start_k, tile_k));
    start_k += tile_k;
    if (start_k >= range_k) {
      start_k = 0;
      if (++j == range_j.value) {
        j = 0;
        i += 1;
      }
    }
  }

  /* There still may be other threads with work */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number;
       tid = modulo_decrement(tid, threads_count))
  {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
      const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
      const size_t start_k = tile_index_ij_k.remainder * tile_k;
      task(argument, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/* Same as thread_parallelize_3d_tile_1d, but the task also receives the
 * executing thread's number (shares the parallelize_3d_tile_1d params). */
static void thread_parallelize_3d_tile_1d_with_thread(struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_3d_tile_1d_with_thread_t task = (pthreadpool_task_3d_tile_1d_with_thread_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
  void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  /* Process thread's own range of items */
  const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d.tile_range_k;
  const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
  const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d.range_j;
  const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
  const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k;
  size_t i = index_i_j.quotient;
  size_t j = index_i_j.remainder;
  size_t start_k = tile_index_ij_k.remainder * tile_k;

  const size_t thread_number = thread->thread_number;
  const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k;
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, thread_number, i, j, start_k, min(range_k - start_k, tile_k));
    start_k += tile_k;
    if (start_k >= range_k) {
      start_k = 0;
      if (++j == range_j.value) {
        j = 0;
        i += 1;
      }
    }
  }

  /* There still may be other threads with work */
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number;
       tid = modulo_decrement(tid, threads_count))
  {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
      const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
      const size_t start_k = tile_index_ij_k.remainder * tile_k;
      task(argument, thread_number, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/* 3D / 1-D-tiled variant that also passes a uarch index to the task. */
static void thread_parallelize_3d_tile_1d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_3d_tile_1d_with_id_t task = (pthreadpool_task_3d_tile_1d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
  void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  const uint32_t default_uarch_index = threadpool->params.parallelize_3d_tile_1d_with_uarch.default_uarch_index;
  uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
  uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
  if (uarch_index > threadpool->params.parallelize_3d_tile_1d_with_uarch.max_uarch_index) {
    uarch_index = default_uarch_index;
  }
#endif

  /* Process thread's own range of items */
  const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_range_k;
  const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
  const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d_with_uarch.range_j;
  const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
  const size_t tile_k = threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_k;
  size_t i = index_i_j.quotient;
  size_t j = index_i_j.remainder;
  size_t start_k = tile_index_ij_k.remainder * tile_k;

  const size_t range_k = threadpool->params.parallelize_3d_tile_1d_with_uarch.range_k;
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, uarch_index, i, j, start_k, min(range_k - start_k, tile_k));
    start_k += tile_k;
    if (start_k >= range_k) {
      start_k = 0;
      if (++j == range_j.value) {
        j = 0;
        i += 1;
      }
    }
  }

  /* There still may be other threads with work */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number;
       tid = modulo_decrement(tid, threads_count))
  {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
      const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
      const size_t start_k = tile_index_ij_k.remainder * tile_k;
      task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/* 3D / 1-D-tiled variant passing both a uarch index and the thread number
 * (shares the parallelize_3d_tile_1d_with_uarch params). */
static void thread_parallelize_3d_tile_1d_with_uarch_with_thread(struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_3d_tile_1d_with_id_with_thread_t task =
      (pthreadpool_task_3d_tile_1d_with_id_with_thread_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
  void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  const uint32_t default_uarch_index = threadpool->params.parallelize_3d_tile_1d_with_uarch.default_uarch_index;
  uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
  uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
  if (uarch_index > threadpool->params.parallelize_3d_tile_1d_with_uarch.max_uarch_index) {
    uarch_index = default_uarch_index;
  }
#endif

  /* Process thread's own range of items */
  const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_range_k;
  const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
  const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d_with_uarch.range_j;
  const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
  const size_t tile_k = threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_k;
  size_t i = index_i_j.quotient;
  size_t j = index_i_j.remainder;
  size_t start_k = tile_index_ij_k.remainder * tile_k;

  const size_t thread_number = thread->thread_number;
  const size_t range_k = threadpool->params.parallelize_3d_tile_1d_with_uarch.range_k;
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, uarch_index, thread_number, i, j, start_k, min(range_k - start_k, tile_k));
    start_k += tile_k;
    if (start_k >= range_k) {
      start_k = 0;
      if (++j == range_j.value) {
        j = 0;
        i += 1;
      }
    }
  }

  /* There still may be other threads with work */
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number;
       tid = modulo_decrement(tid, threads_count))
  {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
      const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
      const size_t start_k = tile_index_ij_k.remainder * tile_k;
      task(argument, uarch_index, thread_number, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/* 3D variant tiled in the j and k dimensions. */
static void thread_parallelize_3d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_3d_tile_2d_t task = (pthreadpool_task_3d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
  void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  /* Process thread's own range of items */
  const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d.tile_range_k;
  const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
  const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d.tile_range_j;
  const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
  const size_t tile_j = threadpool->params.parallelize_3d_tile_2d.tile_j;
  const size_t tile_k = threadpool->params.parallelize_3d_tile_2d.tile_k;
  size_t i = tile_index_i_j.quotient;
  size_t start_j = tile_index_i_j.remainder * tile_j;
  size_t start_k = tile_index_ij_k.remainder * tile_k;

  const size_t range_k = threadpool->params.parallelize_3d_tile_2d.range_k;
  const size_t range_j = threadpool->params.parallelize_3d_tile_2d.range_j;
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
    start_k += tile_k;
    if (start_k >= range_k) {
      start_k = 0;
      start_j += tile_j;
      if (start_j >= range_j) {
        start_j = 0;
        i += 1;
      }
    }
  }

  /* There still may be other threads with work */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number;
       tid = modulo_decrement(tid, threads_count))
  {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
      const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
      const size_t start_j = tile_index_i_j.remainder * tile_j;
      const size_t start_k = tile_index_ij_k.remainder * tile_k;
      task(argument, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/* 3D / 2-D-tiled variant that also passes a uarch index to the task. */
static void thread_parallelize_3d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_3d_tile_2d_with_id_t task = (pthreadpool_task_3d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
  void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  const uint32_t default_uarch_index = threadpool->params.parallelize_3d_tile_2d_with_uarch.default_uarch_index;
  uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
  uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
  if (uarch_index > threadpool->params.parallelize_3d_tile_2d_with_uarch.max_uarch_index) {
    uarch_index = default_uarch_index;
  }
#endif

  /* Process thread's own range of items */
  const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_k;
  const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
  const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_j;
  const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
  const size_t tile_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_j;
  const size_t tile_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_k;
  size_t i = tile_index_i_j.quotient;
  size_t start_j = tile_index_i_j.remainder * tile_j;
  size_t start_k = tile_index_ij_k.remainder * tile_k;

  const size_t range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_k;
  const size_t range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_j;
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, uarch_index, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
    start_k += tile_k;
    if (start_k >= range_k) {
      start_k = 0;
      start_j += tile_j;
      if (start_j >= range_j) {
        start_j = 0;
        i += 1;
      }
    }
  }

  /* There still may be other threads with work */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number;
       tid = modulo_decrement(tid, threads_count))
  {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
      const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
      const size_t start_j = tile_index_i_j.remainder * tile_j;
      const size_t start_k = tile_index_ij_k.remainder * tile_k;
      task(argument, uarch_index, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/* Plain 4D variant: the linear index is decoded as ((i*range_j + j) * range_kl
 * + (k*range_l + l)) with three fxdiv divisions. */
static void thread_parallelize_4d(struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_4d_t task = (pthreadpool_task_4d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
  void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  /* Process thread's own range of items */
  const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_4d.range_kl;
  const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(range_start, range_kl);
  const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d.range_j;
  const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
  const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_4d.range_l;
  const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
  size_t i = index_i_j.quotient;
  size_t j = index_i_j.remainder;
  size_t k = index_k_l.quotient;
  size_t l = index_k_l.remainder;

  const size_t range_k = threadpool->params.parallelize_4d.range_k;
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, i, j, k, l);
    /* Odometer-style increment: l fastest, then k, j, i. */
    if (++l == range_l.value) {
      l = 0;
      if (++k == range_k) {
        k = 0;
        if (++j == range_j.value) {
          j = 0;
          i += 1;
        }
      }
    }
  }

  /* There still may be other threads with work */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number;
       tid = modulo_decrement(tid, threads_count))
  {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(linear_index, range_kl);
      const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
      const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
      task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder);
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/* 4D variant tiled along the innermost (l) dimension only. */
static void thread_parallelize_4d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_4d_tile_1d_t task = (pthreadpool_task_4d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
  void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  /* Process thread's own range of items */
  const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_1d.tile_range_kl;
  const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
  const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_1d.range_j;
  const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
  const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_1d.tile_range_l;
  const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
  const size_t tile_l = threadpool->params.parallelize_4d_tile_1d.tile_l;
  size_t i = index_i_j.quotient;
  size_t j = index_i_j.remainder;
  size_t k = tile_index_k_l.quotient;
  size_t start_l = tile_index_k_l.remainder * tile_l;

  const size_t range_k = threadpool->params.parallelize_4d_tile_1d.range_k;
  const size_t range_l = threadpool->params.parallelize_4d_tile_1d.range_l;
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, i, j, k, start_l, min(range_l - start_l, tile_l));
    start_l += tile_l;
    if (start_l >= range_l) {
      start_l = 0;
      if (++k == range_k) {
        k = 0;
        if (++j == range_j.value) {
          j = 0;
          i += 1;
        }
      }
    }
  }

  /* There still may be other threads with work */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number;
       tid = modulo_decrement(tid, threads_count))
  {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
      const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
      const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
      const size_t start_l = tile_index_k_l.remainder * tile_l;
      task(argument, index_i_j.quotient, index_i_j.remainder, tile_index_k_l.quotient, start_l, min(range_l - start_l, tile_l));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/* 4D variant tiled in the k and l dimensions. */
static void thread_parallelize_4d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_4d_tile_2d_t task = (pthreadpool_task_4d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
  void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  /* Process thread's own range of items */
  const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d.tile_range_kl;
  const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
  const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d.range_j;
  const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
  const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d.tile_range_l;
  const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
  const size_t tile_k = threadpool->params.parallelize_4d_tile_2d.tile_k;
  const size_t tile_l = threadpool->params.parallelize_4d_tile_2d.tile_l;
  size_t i = index_i_j.quotient;
  size_t j = index_i_j.remainder;
  size_t start_k = tile_index_k_l.quotient * tile_k;
  size_t start_l = tile_index_k_l.remainder * tile_l;

  const size_t range_l = threadpool->params.parallelize_4d_tile_2d.range_l;
  const size_t range_k = threadpool->params.parallelize_4d_tile_2d.range_k;
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
    start_l += tile_l;
    if (start_l >= range_l) {
      start_l = 0;
      start_k += tile_k;
      if (start_k >= range_k) {
        start_k = 0;
        if (++j == range_j.value) {
          j = 0;
          i += 1;
        }
      }
    }
  }

  /* There still may be other threads with work */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number;
       tid = modulo_decrement(tid, threads_count))
  {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
      const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
      const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
      const size_t start_k = tile_index_k_l.quotient * tile_k;
      const size_t start_l = tile_index_k_l.remainder * tile_l;
      task(argument, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/* 4D / 2-D-tiled variant with a uarch index. */
static void thread_parallelize_4d_tile_2d_with_uarch(struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_4d_tile_2d_with_id_t task = (pthreadpool_task_4d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
  void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  const uint32_t default_uarch_index = threadpool->params.parallelize_4d_tile_2d_with_uarch.default_uarch_index;
  uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
  uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
  if (uarch_index > threadpool->params.parallelize_4d_tile_2d_with_uarch.max_uarch_index) {
    uarch_index = default_uarch_index;
  }
#endif

  /* Process thread's own range of items */
  const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_kl;
  const struct fxdiv_result_size_t tile_index_ij_kl =
fxdiv_divide_size_t(range_start, tile_range_kl); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); - const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_l; - const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); - const size_t tile_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_k; - const size_t tile_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_l; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t start_k = tile_index_k_l.quotient * tile_k; - size_t start_l = tile_index_k_l.remainder * tile_l; - - const size_t range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_l; - const size_t range_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_k; - while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { - task(argument, uarch_index, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l)); - start_l += tile_l; - if (start_l >= range_l) { - start_l = 0; - start_k += tile_k; - if (start_k >= range_k) { - start_k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - const size_t threads_count = threadpool->threads_count.value; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t 
tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); - const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); - const size_t start_k = tile_index_k_l.quotient * tile_k; - const size_t start_l = tile_index_k_l.remainder * tile_l; - task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); -} - -static void thread_parallelize_5d(struct pthreadpool* threadpool, struct thread_info* thread) { - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_5d_t task = (pthreadpool_task_5d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t range_lm = threadpool->params.parallelize_5d.range_lm; - const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(range_start, range_lm); - const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d.range_k; - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k); - const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_5d.range_m; - const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t k = 
index_ij_k.remainder; - size_t l = index_l_m.quotient; - size_t m = index_l_m.remainder; - - const size_t range_l = threadpool->params.parallelize_5d.range_l; - while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { - task(argument, i, j, k, l, m); - if (++m == range_m.value) { - m = 0; - if (++l == range_l) { - l = 0; - if (++k == range_k.value) { - k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - const size_t threads_count = threadpool->threads_count.value; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(linear_index, range_lm); - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k); - const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); -} - -static void thread_parallelize_5d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) { - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_5d_tile_1d_t task = (pthreadpool_task_5d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = 
pthreadpool_load_relaxed_void_p(&threadpool->argument); - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_1d.tile_range_m; - const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(range_start, tile_range_m); - const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_5d_tile_1d.range_kl; - const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_1d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); - const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_5d_tile_1d.range_l; - const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); - const size_t tile_m = threadpool->params.parallelize_5d_tile_1d.tile_m; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t k = index_k_l.quotient; - size_t l = index_k_l.remainder; - size_t start_m = tile_index_ijkl_m.remainder * tile_m; - - const size_t range_m = threadpool->params.parallelize_5d_tile_1d.range_m; - const size_t range_k = threadpool->params.parallelize_5d_tile_1d.range_k; - while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { - task(argument, i, j, k, l, start_m, min(range_m - start_m, tile_m)); - start_m += tile_m; - if (start_m >= range_m) { - start_m = 0; - if (++l == range_l.value) { - l = 0; - if (++k == range_k) { - k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - const size_t threads_count = threadpool->threads_count.value; - for (size_t tid = modulo_decrement(thread_number, 
threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(linear_index, tile_range_m); - const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); - const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); - size_t start_m = tile_index_ijkl_m.remainder * tile_m; - task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder, start_m, - min(range_m - start_m, tile_m)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); -} - -static void thread_parallelize_5d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) { - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_5d_tile_2d_t task = (pthreadpool_task_5d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_lm = threadpool->params.parallelize_5d_tile_2d.tile_range_lm; - const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(range_start, tile_range_lm); - const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d_tile_2d.range_k; - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k); - const struct 
fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_2d.tile_range_m; - const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_2d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - const size_t tile_l = threadpool->params.parallelize_5d_tile_2d.tile_l; - const size_t tile_m = threadpool->params.parallelize_5d_tile_2d.tile_m; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t k = index_ij_k.remainder; - size_t start_l = tile_index_l_m.quotient * tile_l; - size_t start_m = tile_index_l_m.remainder * tile_m; - - const size_t range_m = threadpool->params.parallelize_5d_tile_2d.range_m; - const size_t range_l = threadpool->params.parallelize_5d_tile_2d.range_l; - while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { - task(argument, i, j, k, start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m)); - start_m += tile_m; - if (start_m >= range_m) { - start_m = 0; - start_l += tile_l; - if (start_l >= range_l) { - start_l = 0; - if (++k == range_k.value) { - k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - const size_t threads_count = threadpool->threads_count.value; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(linear_index, tile_range_lm); 
- const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k); - const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - const size_t start_l = tile_index_l_m.quotient * tile_l; - const size_t start_m = tile_index_l_m.remainder * tile_m; - task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, - start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); -} - -static void thread_parallelize_6d(struct pthreadpool* threadpool, struct thread_info* thread) { - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_6d_t task = (pthreadpool_task_6d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t range_lmn = threadpool->params.parallelize_6d.range_lmn; - const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(range_start, range_lmn); - const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d.range_k; - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k); - const struct fxdiv_divisor_size_t range_n = threadpool->params.parallelize_6d.range_n; - const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - const struct fxdiv_divisor_size_t range_m = 
threadpool->params.parallelize_6d.range_m; - const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m); - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t k = index_ij_k.remainder; - size_t l = index_l_m.quotient; - size_t m = index_l_m.remainder; - size_t n = index_lm_n.remainder; - - const size_t range_l = threadpool->params.parallelize_6d.range_l; - while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { - task(argument, i, j, k, l, m, n); - if (++n == range_n.value) { - n = 0; - if (++m == range_m.value) { - m = 0; - if (++l == range_l) { - l = 0; - if (++k == range_k.value) { - k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - } - } - - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - const size_t threads_count = threadpool->threads_count.value; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(linear_index, range_lmn); - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k); - const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m); - task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, index_lm_n.remainder); - } - } - - /* Make changes by this thread 
visible to other threads */ - pthreadpool_fence_release(); -} - -static void thread_parallelize_6d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) { - assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_6d_tile_1d_t task = (pthreadpool_task_6d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_lmn = threadpool->params.parallelize_6d_tile_1d.tile_range_lmn; - const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(range_start, tile_range_lmn); - const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d_tile_1d.range_k; - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k); - const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_1d.tile_range_n; - const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_1d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d_tile_1d.range_m; - const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m); - const size_t tile_n = threadpool->params.parallelize_6d_tile_1d.tile_n; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t k = index_ij_k.remainder; - size_t l = index_l_m.quotient; - size_t m = index_l_m.remainder; - size_t start_n = tile_index_lm_n.remainder * tile_n; - - const size_t range_n = threadpool->params.parallelize_6d_tile_1d.range_n; - const size_t 
range_l = threadpool->params.parallelize_6d_tile_1d.range_l; - while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { - task(argument, i, j, k, l, m, start_n, min(range_n - start_n, tile_n)); - start_n += tile_n; - if (start_n >= range_n) { - start_n = 0; - if (++m == range_m.value) { - m = 0; - if (++l == range_l) { - l = 0; - if (++k == range_k.value) { - k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - } - } - - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - const size_t threads_count = threadpool->threads_count.value; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(linear_index, tile_range_lmn); - const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k); - const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); - const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m); - const size_t start_n = tile_index_lm_n.remainder * tile_n; - task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, - start_n, min(range_n - start_n, tile_n)); - } - } - - /* Make changes by this thread visible to other threads */ - pthreadpool_fence_release(); -} - -static void thread_parallelize_6d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) { - 
assert(threadpool != NULL); - assert(thread != NULL); - - const pthreadpool_task_6d_tile_2d_t task = (pthreadpool_task_6d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); - void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); - - /* Process thread's own range of items */ - const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); - const struct fxdiv_divisor_size_t tile_range_mn = threadpool->params.parallelize_6d_tile_2d.tile_range_mn; - const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(range_start, tile_range_mn); - const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_6d_tile_2d.range_kl; - const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl); - const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_2d.tile_range_n; - const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n); - const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_2d.range_j; - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); - const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_6d_tile_2d.range_l; - const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); - const size_t tile_m = threadpool->params.parallelize_6d_tile_2d.tile_m; - const size_t tile_n = threadpool->params.parallelize_6d_tile_2d.tile_n; - size_t i = index_i_j.quotient; - size_t j = index_i_j.remainder; - size_t k = index_k_l.quotient; - size_t l = index_k_l.remainder; - size_t start_m = tile_index_m_n.quotient * tile_m; - size_t start_n = tile_index_m_n.remainder * tile_n; - - const size_t range_n = threadpool->params.parallelize_6d_tile_2d.range_n; - const size_t range_m = threadpool->params.parallelize_6d_tile_2d.range_m; - const size_t 
range_k = threadpool->params.parallelize_6d_tile_2d.range_k; - while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { - task(argument, i, j, k, l, start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n)); - start_n += tile_n; - if (start_n >= range_n) { - start_n = 0; - start_m += tile_m; - if (start_m >= range_m) { - start_m = 0; - if (++l == range_l.value) { - l = 0; - if (++k == range_k) { - k = 0; - if (++j == range_j.value) { - j = 0; - i += 1; - } - } - } - } - } - } - - /* There still may be other threads with work */ - const size_t thread_number = thread->thread_number; - const size_t threads_count = threadpool->threads_count.value; - for (size_t tid = modulo_decrement(thread_number, threads_count); - tid != thread_number; - tid = modulo_decrement(tid, threads_count)) - { - struct thread_info* other_thread = &threadpool->threads[tid]; - while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { - const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); - const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(linear_index, tile_range_mn); - const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl); - const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n); - const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); - const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); - const size_t start_m = tile_index_m_n.quotient * tile_m; - const size_t start_n = tile_index_m_n.remainder * tile_n; - task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder, - start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n)); - } - } - - /* Make changes by this thread visible to other threads */ - 
pthreadpool_fence_release(); -} - -void pthreadpool_parallelize_1d( - struct pthreadpool* threadpool, - pthreadpool_task_1d_t task, - void* argument, - size_t range, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= 1) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range; i++) { - task(argument, i); - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - thread_function_t parallelize_1d = &thread_parallelize_1d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (range < range_threshold) { - parallelize_1d = &pthreadpool_thread_parallelize_1d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_1d, NULL, 0, - (void*) task, argument, range, flags); - } -} - -void pthreadpool_parallelize_1d_with_thread( - struct pthreadpool* threadpool, - pthreadpool_task_1d_with_thread_t task, - void* argument, - size_t range, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= 1) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range; i++) { - task(argument, 0, i); - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - thread_function_t parallelize_1d_with_thread = &thread_parallelize_1d_with_thread; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (range < range_threshold) { - 
parallelize_1d_with_thread = &pthreadpool_thread_parallelize_1d_with_thread_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_1d_with_thread, NULL, 0, - (void*) task, argument, range, flags); - } -} - -void pthreadpool_parallelize_1d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_1d_with_id_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= 1) { - /* No thread pool used: execute task sequentially on the calling thread */ - - uint32_t uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > max_uarch_index) { - uarch_index = default_uarch_index; - } - #endif - - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range; i++) { - task(argument, uarch_index, i); - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const struct pthreadpool_1d_with_uarch_params params = { - .default_uarch_index = default_uarch_index, - .max_uarch_index = max_uarch_index, - }; - thread_function_t parallelize_1d_with_uarch = &thread_parallelize_1d_with_uarch; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (range < range_threshold) { - parallelize_1d_with_uarch = &pthreadpool_thread_parallelize_1d_with_uarch_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_1d_with_uarch, ¶ms, sizeof(params), - task, argument, range, flags); - } -} - -void pthreadpool_parallelize_1d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_1d_tile_1d_t task, - void* argument, - size_t range, - size_t tile, - uint32_t flags) -{ 
- size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || range <= tile) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range; i += tile) { - task(argument, i, min(range - i, tile)); - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range = divide_round_up(range, tile); - const struct pthreadpool_1d_tile_1d_params params = { - .range = range, - .tile = tile, - }; - thread_function_t parallelize_1d_tile_1d = &thread_parallelize_1d_tile_1d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (range < range_threshold) { - parallelize_1d_tile_1d = &pthreadpool_thread_parallelize_1d_tile_1d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_1d_tile_1d, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_2d( - pthreadpool_t threadpool, - pthreadpool_task_2d_t task, - void* argument, - size_t range_i, - size_t range_j, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j) <= 1) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - task(argument, i, j); - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t range = range_i * range_j; - const struct pthreadpool_2d_params params = { - .range_j = 
fxdiv_init_size_t(range_j), - }; - thread_function_t parallelize_2d = &thread_parallelize_2d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (range < range_threshold) { - parallelize_2d = &pthreadpool_thread_parallelize_2d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_2d, ¶ms, sizeof(params), - task, argument, range, flags); - } -} - -void pthreadpool_parallelize_2d_with_thread( - pthreadpool_t threadpool, - pthreadpool_task_2d_with_thread_t task, - void* argument, - size_t range_i, - size_t range_j, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j) <= 1) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - task(argument, 0, i, j); - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t range = range_i * range_j; - const struct pthreadpool_2d_params params = { - .range_j = fxdiv_init_size_t(range_j), - }; - thread_function_t parallelize_2d_with_thread = &thread_parallelize_2d_with_thread; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (range < range_threshold) { - parallelize_2d_with_thread = &pthreadpool_thread_parallelize_2d_with_thread_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_2d_with_thread, ¶ms, sizeof(params), - task, argument, range, flags); - } -} - -void pthreadpool_parallelize_2d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_2d_tile_1d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t tile_j, - uint32_t flags) -{ - size_t threads_count; - if (threadpool 
== NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j)) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j += tile_j) { - task(argument, i, j, min(range_j - j, tile_j)); - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_j = divide_round_up(range_j, tile_j); - const size_t tile_range = range_i * tile_range_j; - const struct pthreadpool_2d_tile_1d_params params = { - .range_j = range_j, - .tile_j = tile_j, - .tile_range_j = fxdiv_init_size_t(tile_range_j), - }; - thread_function_t parallelize_2d_tile_1d = &thread_parallelize_2d_tile_1d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_2d_tile_1d = &pthreadpool_thread_parallelize_2d_tile_1d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_2d_tile_1d, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_2d_tile_1d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_2d_tile_1d_with_id_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t tile_j, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j)) { - /* No thread pool used: execute task sequentially on the calling thread */ - - uint32_t uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > max_uarch_index) { - 
uarch_index = default_uarch_index; - } - #endif - - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j += tile_j) { - task(argument, uarch_index, i, j, min(range_j - j, tile_j)); - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_j = divide_round_up(range_j, tile_j); - const size_t tile_range = range_i * tile_range_j; - const struct pthreadpool_2d_tile_1d_with_uarch_params params = { - .default_uarch_index = default_uarch_index, - .max_uarch_index = max_uarch_index, - .range_j = range_j, - .tile_j = tile_j, - .tile_range_j = fxdiv_init_size_t(tile_range_j), - }; - thread_function_t parallelize_2d_tile_1d_with_uarch = &thread_parallelize_2d_tile_1d_with_uarch; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_2d_tile_1d_with_uarch = &pthreadpool_thread_parallelize_2d_tile_1d_with_uarch_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_2d_tile_1d_with_uarch, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - pthreadpool_t threadpool, - pthreadpool_task_2d_tile_1d_with_id_with_thread_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t tile_j, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j)) { - /* No thread pool used: execute task sequentially on the calling thread */ - - uint32_t uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = 
cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > max_uarch_index) { - uarch_index = default_uarch_index; - } - #endif - - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j += tile_j) { - task(argument, uarch_index, 0, i, j, min(range_j - j, tile_j)); - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_j = divide_round_up(range_j, tile_j); - const size_t tile_range = range_i * tile_range_j; - const struct pthreadpool_2d_tile_1d_with_uarch_params params = { - .default_uarch_index = default_uarch_index, - .max_uarch_index = max_uarch_index, - .range_j = range_j, - .tile_j = tile_j, - .tile_range_j = fxdiv_init_size_t(tile_range_j), - }; - thread_function_t parallelize_2d_tile_1d_with_uarch_with_thread = &thread_parallelize_2d_tile_1d_with_uarch_with_thread; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_2d_tile_1d_with_uarch_with_thread = &pthreadpool_thread_parallelize_2d_tile_1d_with_uarch_with_thread_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_2d_tile_1d_with_uarch_with_thread, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_2d_tile_2d( - pthreadpool_t threadpool, - pthreadpool_task_2d_tile_2d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t tile_i, - size_t tile_j, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= tile_i && range_j <= tile_j)) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & 
PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i += tile_i) { - for (size_t j = 0; j < range_j; j += tile_j) { - task(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)); - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_i = divide_round_up(range_i, tile_i); - const size_t tile_range_j = divide_round_up(range_j, tile_j); - const size_t tile_range = tile_range_i * tile_range_j; - const struct pthreadpool_2d_tile_2d_params params = { - .range_i = range_i, - .tile_i = tile_i, - .range_j = range_j, - .tile_j = tile_j, - .tile_range_j = fxdiv_init_size_t(tile_range_j), - }; - thread_function_t parallelize_2d_tile_2d = &thread_parallelize_2d_tile_2d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_2d_tile_2d = &pthreadpool_thread_parallelize_2d_tile_2d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_2d_tile_2d, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_2d_tile_2d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_2d_tile_2d_with_id_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t tile_i, - size_t tile_j, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= tile_i && range_j <= tile_j)) { - /* No thread pool used: execute task sequentially on the calling thread */ - - uint32_t uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > max_uarch_index) { - uarch_index = default_uarch_index; - } - #endif - - struct fpu_state 
saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i += tile_i) { - for (size_t j = 0; j < range_j; j += tile_j) { - task(argument, uarch_index, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)); - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_i = divide_round_up(range_i, tile_i); - const size_t tile_range_j = divide_round_up(range_j, tile_j); - const size_t tile_range = tile_range_i * tile_range_j; - const struct pthreadpool_2d_tile_2d_with_uarch_params params = { - .default_uarch_index = default_uarch_index, - .max_uarch_index = max_uarch_index, - .range_i = range_i, - .tile_i = tile_i, - .range_j = range_j, - .tile_j = tile_j, - .tile_range_j = fxdiv_init_size_t(tile_range_j), - }; - thread_function_t parallelize_2d_tile_2d_with_uarch = &thread_parallelize_2d_tile_2d_with_uarch; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_2d_tile_2d_with_uarch = &pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_2d_tile_2d_with_uarch, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_3d( - pthreadpool_t threadpool, - pthreadpool_task_3d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k) <= 1) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < 
range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - task(argument, i, j, k); - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t range = range_i * range_j * range_k; - const struct pthreadpool_3d_params params = { - .range_j = fxdiv_init_size_t(range_j), - .range_k = fxdiv_init_size_t(range_k), - }; - thread_function_t parallelize_3d = &thread_parallelize_3d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (range < range_threshold) { - parallelize_3d = &pthreadpool_thread_parallelize_3d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_3d, ¶ms, sizeof(params), - task, argument, range, flags); - } -} - -void pthreadpool_parallelize_3d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_1d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_k, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k)) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k += tile_k) { - task(argument, i, j, k, min(range_k - k, tile_k)); - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_k = divide_round_up(range_k, tile_k); - const size_t tile_range = range_i * range_j * tile_range_k; - const struct pthreadpool_3d_tile_1d_params params = { - .range_k = range_k, - .tile_k = tile_k, - .range_j = fxdiv_init_size_t(range_j), - .tile_range_k = 
fxdiv_init_size_t(tile_range_k), - }; - thread_function_t parallelize_3d_tile_1d = &thread_parallelize_3d_tile_1d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_3d_tile_1d = &pthreadpool_thread_parallelize_3d_tile_1d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_3d_tile_1d, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_3d_tile_1d_with_thread( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_1d_with_thread_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_k, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k)) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k += tile_k) { - task(argument, 0, i, j, k, min(range_k - k, tile_k)); - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_k = divide_round_up(range_k, tile_k); - const size_t tile_range = range_i * range_j * tile_range_k; - const struct pthreadpool_3d_tile_1d_params params = { - .range_k = range_k, - .tile_k = tile_k, - .range_j = fxdiv_init_size_t(range_j), - .tile_range_k = fxdiv_init_size_t(tile_range_k), - }; - thread_function_t parallelize_3d_tile_1d_with_thread = &thread_parallelize_3d_tile_1d_with_thread; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_3d_tile_1d_with_thread = 
&pthreadpool_thread_parallelize_3d_tile_1d_with_thread_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_3d_tile_1d_with_thread, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_3d_tile_1d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_1d_with_id_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_k, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k)) { - /* No thread pool used: execute task sequentially on the calling thread */ - - uint32_t uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > max_uarch_index) { - uarch_index = default_uarch_index; - } - #endif - - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k += tile_k) { - task(argument, uarch_index, i, j, k, min(range_k - k, tile_k)); - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_k = divide_round_up(range_k, tile_k); - const size_t tile_range = range_i * range_j * tile_range_k; - const struct pthreadpool_3d_tile_1d_with_uarch_params params = { - .default_uarch_index = default_uarch_index, - .max_uarch_index = max_uarch_index, - .range_k = range_k, - .tile_k = tile_k, - .range_j = fxdiv_init_size_t(range_j), - .tile_range_k = fxdiv_init_size_t(tile_range_k), - }; - thread_function_t parallelize_3d_tile_1d_with_uarch = &thread_parallelize_3d_tile_1d_with_uarch; 
- #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_3d_tile_1d_with_uarch = &pthreadpool_thread_parallelize_3d_tile_1d_with_uarch_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_3d_tile_1d_with_uarch, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_1d_with_id_with_thread_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_k, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k)) { - /* No thread pool used: execute task sequentially on the calling thread */ - - uint32_t uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > max_uarch_index) { - uarch_index = default_uarch_index; - } - #endif - - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k += tile_k) { - task(argument, uarch_index, 0, i, j, k, min(range_k - k, tile_k)); - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_k = divide_round_up(range_k, tile_k); - const size_t tile_range = range_i * range_j * tile_range_k; - const struct pthreadpool_3d_tile_1d_with_uarch_params params = { - .default_uarch_index = default_uarch_index, - .max_uarch_index = max_uarch_index, - .range_k = range_k, - .tile_k = tile_k, - .range_j = 
fxdiv_init_size_t(range_j), - .tile_range_k = fxdiv_init_size_t(tile_range_k), - }; - thread_function_t parallelize_3d_tile_1d_with_uarch_with_thread = &thread_parallelize_3d_tile_1d_with_uarch_with_thread; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_3d_tile_1d_with_uarch_with_thread = &pthreadpool_thread_parallelize_3d_tile_1d_with_uarch_with_thread_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_3d_tile_1d_with_uarch_with_thread, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_3d_tile_2d( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_2d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_j, - size_t tile_k, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j += tile_j) { - for (size_t k = 0; k < range_k; k += tile_k) { - task(argument, i, j, k, min(range_j - j, tile_j), min(range_k - k, tile_k)); - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_j = divide_round_up(range_j, tile_j); - const size_t tile_range_k = divide_round_up(range_k, tile_k); - const size_t tile_range = range_i * tile_range_j * tile_range_k; - const struct pthreadpool_3d_tile_2d_params params = { - .range_j = range_j, - .tile_j = tile_j, - .range_k = range_k, - .tile_k = tile_k, - .tile_range_j = fxdiv_init_size_t(tile_range_j), - .tile_range_k 
= fxdiv_init_size_t(tile_range_k), - }; - thread_function_t parallelize_3d_tile_2d = &thread_parallelize_3d_tile_2d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_3d_tile_2d = &pthreadpool_thread_parallelize_3d_tile_2d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_3d_tile_2d, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_3d_tile_2d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_2d_with_id_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_j, - size_t tile_k, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) { - /* No thread pool used: execute task sequentially on the calling thread */ - - uint32_t uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > max_uarch_index) { - uarch_index = default_uarch_index; - } - #endif - - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j += tile_j) { - for (size_t k = 0; k < range_k; k += tile_k) { - task(argument, uarch_index, i, j, k, min(range_j - j, tile_j), min(range_k - k, tile_k)); - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_j = divide_round_up(range_j, tile_j); - const size_t tile_range_k = divide_round_up(range_k, tile_k); - const size_t tile_range = range_i * tile_range_j * tile_range_k; - const struct 
pthreadpool_3d_tile_2d_with_uarch_params params = { - .default_uarch_index = default_uarch_index, - .max_uarch_index = max_uarch_index, - .range_j = range_j, - .tile_j = tile_j, - .range_k = range_k, - .tile_k = tile_k, - .tile_range_j = fxdiv_init_size_t(tile_range_j), - .tile_range_k = fxdiv_init_size_t(tile_range_k), - }; - thread_function_t parallelize_3d_tile_2d_with_uarch = &thread_parallelize_3d_tile_2d_with_uarch; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_3d_tile_2d_with_uarch = &pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_3d_tile_2d_with_uarch, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_4d( - pthreadpool_t threadpool, - pthreadpool_task_4d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l) <= 1) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l++) { - task(argument, i, j, k, l); - } - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t range_kl = range_k * range_l; - const size_t range = range_i * range_j * range_kl; - const struct pthreadpool_4d_params params = { - .range_k = range_k, - .range_j = fxdiv_init_size_t(range_j), - .range_kl = fxdiv_init_size_t(range_kl), - .range_l = 
fxdiv_init_size_t(range_l), - }; - thread_function_t parallelize_4d = &thread_parallelize_4d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (range < range_threshold) { - parallelize_4d = &pthreadpool_thread_parallelize_4d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_4d, ¶ms, sizeof(params), - task, argument, range, flags); - } -} - -void pthreadpool_parallelize_4d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_4d_tile_1d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t tile_l, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k) <= 1 && range_l <= tile_l)) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l += tile_l) { - task(argument, i, j, k, l, min(range_l - l, tile_l)); - } - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_l = divide_round_up(range_l, tile_l); - const size_t tile_range_kl = range_k * tile_range_l; - const size_t tile_range = range_i * range_j * tile_range_kl; - const struct pthreadpool_4d_tile_1d_params params = { - .range_k = range_k, - .range_l = range_l, - .tile_l = tile_l, - .range_j = fxdiv_init_size_t(range_j), - .tile_range_kl = fxdiv_init_size_t(tile_range_kl), - .tile_range_l = fxdiv_init_size_t(tile_range_l), - }; - thread_function_t parallelize_4d_tile_1d = &thread_parallelize_4d_tile_1d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = 
-threads_count; - if (tile_range < range_threshold) { - parallelize_4d_tile_1d = &pthreadpool_thread_parallelize_4d_tile_1d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_4d_tile_1d, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_4d_tile_2d( - pthreadpool_t threadpool, - pthreadpool_task_4d_tile_2d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t tile_k, - size_t tile_l, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k && range_l <= tile_l)) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k += tile_k) { - for (size_t l = 0; l < range_l; l += tile_l) { - task(argument, i, j, k, l, - min(range_k - k, tile_k), min(range_l - l, tile_l)); - } - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_l = divide_round_up(range_l, tile_l); - const size_t tile_range_kl = divide_round_up(range_k, tile_k) * tile_range_l; - const size_t tile_range = range_i * range_j * tile_range_kl; - const struct pthreadpool_4d_tile_2d_params params = { - .range_k = range_k, - .tile_k = tile_k, - .range_l = range_l, - .tile_l = tile_l, - .range_j = fxdiv_init_size_t(range_j), - .tile_range_kl = fxdiv_init_size_t(tile_range_kl), - .tile_range_l = fxdiv_init_size_t(tile_range_l), - }; - thread_function_t parallelize_4d_tile_2d = &thread_parallelize_4d_tile_2d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if 
(tile_range < range_threshold) { - parallelize_4d_tile_2d = &pthreadpool_thread_parallelize_4d_tile_2d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_4d_tile_2d, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_4d_tile_2d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_4d_tile_2d_with_id_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t tile_k, - size_t tile_l, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k && range_l <= tile_l)) { - /* No thread pool used: execute task sequentially on the calling thread */ - - uint32_t uarch_index = default_uarch_index; - #if PTHREADPOOL_USE_CPUINFO - uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index); - if (uarch_index > max_uarch_index) { - uarch_index = default_uarch_index; - } - #endif - - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k += tile_k) { - for (size_t l = 0; l < range_l; l += tile_l) { - task(argument, uarch_index, i, j, k, l, - min(range_k - k, tile_k), min(range_l - l, tile_l)); - } - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_l = divide_round_up(range_l, tile_l); - const size_t tile_range_kl = divide_round_up(range_k, tile_k) * tile_range_l; - const size_t tile_range = range_i * range_j * tile_range_kl; - const struct pthreadpool_4d_tile_2d_with_uarch_params params = { - .default_uarch_index = default_uarch_index, - .max_uarch_index 
= max_uarch_index, - .range_k = range_k, - .tile_k = tile_k, - .range_l = range_l, - .tile_l = tile_l, - .range_j = fxdiv_init_size_t(range_j), - .tile_range_kl = fxdiv_init_size_t(tile_range_kl), - .tile_range_l = fxdiv_init_size_t(tile_range_l), - }; - thread_function_t parallelize_4d_tile_2d_with_uarch = &thread_parallelize_4d_tile_2d_with_uarch; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_4d_tile_2d_with_uarch = &pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_4d_tile_2d_with_uarch, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_5d( - pthreadpool_t threadpool, - pthreadpool_task_5d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l | range_m) <= 1) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l++) { - for (size_t m = 0; m < range_m; m++) { - task(argument, i, j, k, l, m); - } - } - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t range_lm = range_l * range_m; - const size_t range = range_i * range_j * range_k * range_lm; - const struct pthreadpool_5d_params params = { - .range_l = range_l, - .range_j = fxdiv_init_size_t(range_j), - .range_k = fxdiv_init_size_t(range_k), - .range_lm = 
fxdiv_init_size_t(range_lm), - .range_m = fxdiv_init_size_t(range_m), - }; - thread_function_t parallelize_5d = &thread_parallelize_5d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (range < range_threshold) { - parallelize_5d = &pthreadpool_thread_parallelize_5d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_5d, ¶ms, sizeof(params), - task, argument, range, flags); - } -} - -void pthreadpool_parallelize_5d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_5d_tile_1d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t tile_m, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l) <= 1 && range_m <= tile_m)) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l++) { - for (size_t m = 0; m < range_m; m += tile_m) { - task(argument, i, j, k, l, m, min(range_m - m, tile_m)); - } - } - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_m = divide_round_up(range_m, tile_m); - const size_t range_kl = range_k * range_l; - const size_t tile_range = range_i * range_j * range_kl * tile_range_m; - const struct pthreadpool_5d_tile_1d_params params = { - .range_k = range_k, - .range_m = range_m, - .tile_m = tile_m, - .range_j = fxdiv_init_size_t(range_j), - .range_kl = fxdiv_init_size_t(range_kl), - .range_l = fxdiv_init_size_t(range_l), - .tile_range_m = fxdiv_init_size_t(tile_range_m), - }; - 
thread_function_t parallelize_5d_tile_1d = &thread_parallelize_5d_tile_1d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_5d_tile_1d = &pthreadpool_thread_parallelize_5d_tile_1d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_5d_tile_1d, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_5d_tile_2d( - pthreadpool_t threadpool, - pthreadpool_task_5d_tile_2d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t tile_l, - size_t tile_m, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k) <= 1 && range_l <= tile_l && range_m <= tile_m)) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l += tile_l) { - for (size_t m = 0; m < range_m; m += tile_m) { - task(argument, i, j, k, l, m, - min(range_l - l, tile_l), min(range_m - m, tile_m)); - } - } - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_m = divide_round_up(range_m, tile_m); - const size_t tile_range_lm = divide_round_up(range_l, tile_l) * tile_range_m; - const size_t tile_range = range_i * range_j * range_k * tile_range_lm; - const struct pthreadpool_5d_tile_2d_params params = { - .range_l = range_l, - .tile_l = tile_l, - .range_m = range_m, - .tile_m = tile_m, - .range_j = fxdiv_init_size_t(range_j), - .range_k = fxdiv_init_size_t(range_k), - 
.tile_range_lm = fxdiv_init_size_t(tile_range_lm), - .tile_range_m = fxdiv_init_size_t(tile_range_m), - }; - thread_function_t parallelize_5d_tile_2d = &thread_parallelize_5d_tile_2d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_5d_tile_2d = &pthreadpool_thread_parallelize_5d_tile_2d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_5d_tile_2d, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_6d( - pthreadpool_t threadpool, - pthreadpool_task_6d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t range_n, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l | range_m | range_n) <= 1) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l++) { - for (size_t m = 0; m < range_m; m++) { - for (size_t n = 0; n < range_n; n++) { - task(argument, i, j, k, l, m, n); - } - } - } - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t range_lmn = range_l * range_m * range_n; - const size_t range = range_i * range_j * range_k * range_lmn; - const struct pthreadpool_6d_params params = { - .range_l = range_l, - .range_j = fxdiv_init_size_t(range_j), - .range_k = fxdiv_init_size_t(range_k), - .range_lmn = fxdiv_init_size_t(range_lmn), - .range_m = fxdiv_init_size_t(range_m), - .range_n = fxdiv_init_size_t(range_n), - }; - 
thread_function_t parallelize_6d = &thread_parallelize_6d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (range < range_threshold) { - parallelize_6d = &pthreadpool_thread_parallelize_6d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_6d, ¶ms, sizeof(params), - task, argument, range, flags); - } -} - -void pthreadpool_parallelize_6d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_6d_tile_1d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t range_n, - size_t tile_n, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l | range_m) <= 1 && range_n <= tile_n)) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l++) { - for (size_t m = 0; m < range_m; m++) { - for (size_t n = 0; n < range_n; n += tile_n) { - task(argument, i, j, k, l, m, n, min(range_n - n, tile_n)); - } - } - } - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t tile_range_n = divide_round_up(range_n, tile_n); - const size_t tile_range_lmn = range_l * range_m * tile_range_n; - const size_t tile_range = range_i * range_j * range_k * tile_range_lmn; - const struct pthreadpool_6d_tile_1d_params params = { - .range_l = range_l, - .range_n = range_n, - .tile_n = tile_n, - .range_j = fxdiv_init_size_t(range_j), - .range_k = fxdiv_init_size_t(range_k), - .tile_range_lmn = fxdiv_init_size_t(tile_range_lmn), - .range_m = 
fxdiv_init_size_t(range_m), - .tile_range_n = fxdiv_init_size_t(tile_range_n), - }; - thread_function_t parallelize_6d_tile_1d = &thread_parallelize_6d_tile_1d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_6d_tile_1d = &pthreadpool_thread_parallelize_6d_tile_1d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_6d_tile_1d, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } -} - -void pthreadpool_parallelize_6d_tile_2d( - pthreadpool_t threadpool, - pthreadpool_task_6d_tile_2d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t range_n, - size_t tile_m, - size_t tile_n, - uint32_t flags) -{ - size_t threads_count; - if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l) <= 1 && range_m <= tile_m && range_n <= tile_n)) { - /* No thread pool used: execute task sequentially on the calling thread */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l++) { - for (size_t m = 0; m < range_m; m += tile_m) { - for (size_t n = 0; n < range_n; n += tile_n) { - task(argument, i, j, k, l, m, n, - min(range_m - m, tile_m), min(range_n - n, tile_n)); - } - } - } - } - } - } - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - } else { - const size_t range_kl = range_k * range_l; - const size_t tile_range_n = divide_round_up(range_n, tile_n); - const size_t tile_range_mn = divide_round_up(range_m, tile_m) * tile_range_n; - const size_t tile_range = range_i * range_j * range_kl * tile_range_mn; - const struct 
pthreadpool_6d_tile_2d_params params = { - .range_k = range_k, - .range_m = range_m, - .tile_m = tile_m, - .range_n = range_n, - .tile_n = tile_n, - .range_j = fxdiv_init_size_t(range_j), - .range_kl = fxdiv_init_size_t(range_kl), - .range_l = fxdiv_init_size_t(range_l), - .tile_range_mn = fxdiv_init_size_t(tile_range_mn), - .tile_range_n = fxdiv_init_size_t(tile_range_n), - }; - thread_function_t parallelize_6d_tile_2d = &thread_parallelize_6d_tile_2d; - #if PTHREADPOOL_USE_FASTPATH - const size_t range_threshold = -threads_count; - if (tile_range < range_threshold) { - parallelize_6d_tile_2d = &pthreadpool_thread_parallelize_6d_tile_2d_fastpath; - } - #endif - pthreadpool_parallelize( - threadpool, parallelize_6d_tile_2d, ¶ms, sizeof(params), - task, argument, tile_range, flags); - } +static void thread_parallelize_2d_tile_1d(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_2d_tile_1d_t task = + (pthreadpool_task_2d_tile_1d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_j = + threadpool->params.parallelize_2d_tile_1d.tile_range_j; + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(range_start, tile_range_j); + const size_t tile_j = threadpool->params.parallelize_2d_tile_1d.tile_j; + size_t i = tile_index_i_j.quotient; + size_t start_j = tile_index_i_j.remainder * tile_j; + + const size_t range_j = threadpool->params.parallelize_2d_tile_1d.range_j; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, i, start_j, min(range_j - start_j, tile_j)); + start_j += tile_j; + if (start_j >= range_j) { + start_j = 0; + i += 1; + } + } + + /* There 
still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(linear_index, tile_range_j); + const size_t start_j = tile_index_i_j.remainder * tile_j; + task(argument, tile_index_i_j.quotient, start_j, + min(range_j - start_j, tile_j)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); } + +static void thread_parallelize_2d_tile_1d_with_uarch( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_2d_tile_1d_with_id_t task = + (pthreadpool_task_2d_tile_1d_with_id_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const uint32_t default_uarch_index = + threadpool->params.parallelize_2d_tile_1d_with_uarch.default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > + threadpool->params.parallelize_2d_tile_1d_with_uarch.max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_j = + threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_range_j; + const struct 
fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(range_start, tile_range_j); + const size_t tile_j = + threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_j; + size_t i = tile_index_i_j.quotient; + size_t start_j = tile_index_i_j.remainder * tile_j; + + const size_t range_j = + threadpool->params.parallelize_2d_tile_1d_with_uarch.range_j; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, uarch_index, i, start_j, min(range_j - start_j, tile_j)); + start_j += tile_j; + if (start_j >= range_j) { + start_j = 0; + i += 1; + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(linear_index, tile_range_j); + const size_t start_j = tile_index_i_j.remainder * tile_j; + task(argument, uarch_index, tile_index_i_j.quotient, start_j, + min(range_j - start_j, tile_j)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_2d_tile_1d_with_uarch_with_thread( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_2d_tile_1d_with_id_with_thread_t task = + (pthreadpool_task_2d_tile_1d_with_id_with_thread_t) + pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const uint32_t 
default_uarch_index = + threadpool->params.parallelize_2d_tile_1d_with_uarch.default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > + threadpool->params.parallelize_2d_tile_1d_with_uarch.max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_j = + threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_range_j; + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(range_start, tile_range_j); + const size_t tile_j = + threadpool->params.parallelize_2d_tile_1d_with_uarch.tile_j; + size_t i = tile_index_i_j.quotient; + size_t start_j = tile_index_i_j.remainder * tile_j; + + const size_t thread_number = thread->thread_number; + const size_t range_j = + threadpool->params.parallelize_2d_tile_1d_with_uarch.range_j; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, uarch_index, thread_number, i, start_j, + min(range_j - start_j, tile_j)); + start_j += tile_j; + if (start_j >= range_j) { + start_j = 0; + i += 1; + } + } + + /* There still may be other threads with work */ + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(linear_index, tile_range_j); + const size_t start_j = tile_index_i_j.remainder * tile_j; + 
task(argument, uarch_index, thread_number, tile_index_i_j.quotient, + start_j, min(range_j - start_j, tile_j)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_2d_tile_1d_dynamic( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + // Get a handle on the params. + struct pthreadpool_2d_tile_1d_dynamic_params* params = + &threadpool->params.parallelize_2d_tile_1d_dynamic; + const size_t num_threads = threadpool->threads_count.value; + const size_t range_j = params->range_j; + const size_t tile_j = params->tile_j; + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const pthreadpool_task_2d_tile_1d_dynamic_t task = + (pthreadpool_task_2d_tile_1d_dynamic_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + const size_t thread_number = thread->thread_number; + const size_t fastest_to_slowest_ratio = get_fastest_to_slowest_ratio(); + + // Do tiles in our own range first (tid = 0), then the other ranges when we're + // done. + for (size_t tid = 0; tid < num_threads; tid++) { + struct thread_info* thread = + &threadpool->threads[(num_threads + thread_number - tid) % num_threads]; + + size_t offset = + (tid == 0) ? pthreadpool_load_relaxed_size_t(&thread->range_start) : 0; + + /* Loop as long as there is work to be done. */ + while (true) { + /* Choose a chunk size based on the remaining amount of work and the + * current number of threads. */ + size_t chunk_size = + get_chunk(&thread->range_length, fastest_to_slowest_ratio); + if (!chunk_size) { + break; + } + + /* If this is "our" range, take chunks of tiles from the front, otherwise + * take them from the back. */ + if (tid != 0) { + offset = pthreadpool_decrement_n_fetch_relaxed_size_t( + &thread->range_end, chunk_size); + } + + // Call the task function. 
+ size_t index_i = offset / tile_range_j; + size_t tile_index_j = offset % tile_range_j; + while (chunk_size > 0) { + const size_t index_j = tile_index_j * tile_j; + const size_t tile_step_j = min(chunk_size, tile_range_j - tile_index_j); + const size_t step_j = min(tile_step_j * tile_j, range_j - index_j); + + task(argument, index_i, index_j, step_j); + + tile_index_j += tile_step_j; + if (tile_range_j <= tile_index_j) { + tile_index_j -= tile_range_j; + index_i += 1; + } + chunk_size -= tile_step_j; + offset += tile_step_j; + } + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_2d_tile_2d(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_2d_tile_2d_t task = + (pthreadpool_task_2d_tile_2d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_j = + threadpool->params.parallelize_2d_tile_2d.tile_range_j; + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(range_start, tile_range_j); + const size_t tile_i = threadpool->params.parallelize_2d_tile_2d.tile_i; + const size_t tile_j = threadpool->params.parallelize_2d_tile_2d.tile_j; + size_t start_i = tile_index_i_j.quotient * tile_i; + size_t start_j = tile_index_i_j.remainder * tile_j; + + const size_t range_i = threadpool->params.parallelize_2d_tile_2d.range_i; + const size_t range_j = threadpool->params.parallelize_2d_tile_2d.range_j; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, start_i, start_j, min(range_i - start_i, tile_i), + min(range_j - start_j, tile_j)); + start_j += tile_j; + if (start_j >= range_j) 
{ + start_j = 0; + start_i += tile_i; + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(linear_index, tile_range_j); + const size_t start_i = tile_index_i_j.quotient * tile_i; + const size_t start_j = tile_index_i_j.remainder * tile_j; + task(argument, start_i, start_j, min(range_i - start_i, tile_i), + min(range_j - start_j, tile_j)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_2d_tile_2d_with_uarch( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_2d_tile_2d_with_id_t task = + (pthreadpool_task_2d_tile_2d_with_id_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const uint32_t default_uarch_index = + threadpool->params.parallelize_2d_tile_2d_with_uarch.default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > + threadpool->params.parallelize_2d_tile_2d_with_uarch.max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + /* Process thread's own range of items */ + const struct fxdiv_divisor_size_t tile_range_j = + 
threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_range_j; + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_result_size_t index = + fxdiv_divide_size_t(range_start, tile_range_j); + const size_t range_i = + threadpool->params.parallelize_2d_tile_2d_with_uarch.range_i; + const size_t tile_i = + threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_i; + const size_t range_j = + threadpool->params.parallelize_2d_tile_2d_with_uarch.range_j; + const size_t tile_j = + threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_j; + size_t start_i = index.quotient * tile_i; + size_t start_j = index.remainder * tile_j; + + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, uarch_index, start_i, start_j, + min(range_i - start_i, tile_i), min(range_j - start_j, tile_j)); + start_j += tile_j; + if (start_j >= range_j) { + start_j = 0; + start_i += tile_i; + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(linear_index, tile_range_j); + const size_t start_i = tile_index_i_j.quotient * tile_i; + const size_t start_j = tile_index_i_j.remainder * tile_j; + task(argument, uarch_index, start_i, start_j, + min(range_i - start_i, tile_i), min(range_j - start_j, tile_j)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void 
thread_parallelize_2d_tile_2d_dynamic( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + // Get a handle on the params. + struct pthreadpool_2d_tile_2d_dynamic_params* params = + &threadpool->params.parallelize_2d_tile_2d_dynamic; + const size_t num_threads = threadpool->threads_count.value; + const size_t range_i = params->range_i; + const size_t range_j = params->range_j; + const size_t tile_i = params->tile_i; + const size_t tile_j = params->tile_j; + const size_t tile_range_i = divide_round_up(range_i, tile_i); + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const pthreadpool_task_2d_tile_2d_dynamic_t task = + (pthreadpool_task_2d_tile_2d_dynamic_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + const size_t thread_number = thread->thread_number; + const size_t fastest_to_slowest_ratio = get_fastest_to_slowest_ratio(); + + // Do tiles in our own range first (tid = 0), then the other ranges when we're + // done. + for (size_t tid = 0; tid < num_threads; tid++) { + struct thread_info* thread = + &threadpool->threads[(num_threads + thread_number - tid) % num_threads]; + + size_t offset = + (tid == 0) ? pthreadpool_load_relaxed_size_t(&thread->range_start) : 0; + + /* Loop as long as there is work to be done. */ + while (true) { + /* Choose a chunk size based on the remaining amount of work and the + * current number of threads. */ + size_t chunk_size = + get_chunk(&thread->range_length, fastest_to_slowest_ratio); + if (!chunk_size) { + break; + } + + /* If this is "our" range, take chunks of tiles from the front, otherwise + * take them from the back. */ + if (tid != 0) { + offset = pthreadpool_decrement_n_fetch_relaxed_size_t( + &thread->range_end, chunk_size); + } + + /* Iterate over the chunk and call the task function. 
*/ + size_t tile_index_i = offset / tile_range_j; + if (tile_range_j == 1) { + /* If there is only a single tile in the `j`th (last) dimension, then we + * group by the `j`th (second-last) dimeension. */ + const size_t index_i = tile_index_i * tile_i; + const size_t tile_step_i = min(tile_range_i - tile_index_i, chunk_size); + const size_t step_i = min(tile_step_i * tile_i, range_i - index_i); + + task(argument, index_i, /*index_j=*/0, step_i, range_j); + + offset += tile_step_i; + } else { + size_t tile_index_j = offset % tile_range_j; + while (chunk_size > 0) { + const size_t index_i = tile_index_i * tile_i; + const size_t index_j = tile_index_j * tile_j; + const size_t step_i = min(tile_i, range_i - index_i); + const size_t tile_step_j = + min(tile_range_j - tile_index_j, chunk_size); + const size_t step_j = min(tile_step_j * tile_j, range_j - index_j); + + task(argument, index_i, index_j, step_i, step_j); + + tile_index_j += tile_step_j; + if (tile_range_j <= tile_index_j) { + tile_index_j -= tile_range_j; + tile_index_i += 1; + } + chunk_size -= tile_step_j; + offset += tile_step_j; + } + } + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_2d_tile_2d_dynamic_with_uarch( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + // Identify the uarch for the current core. + const uint32_t default_uarch_index = + threadpool->params.parallelize_2d_tile_2d_dynamic_with_uarch + .default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > threadpool->params.parallelize_2d_tile_2d_dynamic_with_uarch + .max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + // Get a handle on the params. 
+ struct pthreadpool_2d_tile_2d_dynamic_with_uarch_params* params = + &threadpool->params.parallelize_2d_tile_2d_dynamic_with_uarch; + const size_t num_threads = threadpool->threads_count.value; + const size_t range_i = params->range_i; + const size_t range_j = params->range_j; + const size_t tile_i = params->tile_i; + const size_t tile_j = params->tile_j; + const size_t tile_range_i = divide_round_up(range_i, tile_i); + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const pthreadpool_task_2d_tile_2d_dynamic_with_id_t task = + (pthreadpool_task_2d_tile_2d_dynamic_with_id_t) + pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + const size_t thread_number = thread->thread_number; + const size_t fastest_to_slowest_ratio = get_fastest_to_slowest_ratio(); + + // Do tiles in our own range first (tid = 0), then the other ranges when we're + // done. + for (size_t tid = 0; tid < num_threads; tid++) { + struct thread_info* thread = + &threadpool->threads[(num_threads + thread_number - tid) % num_threads]; + + size_t offset = + (tid == 0) ? pthreadpool_load_relaxed_size_t(&thread->range_start) : 0; + + /* Loop as long as there is work to be done. */ + while (true) { + /* Choose a chunk size based on the remaining amount of work and the + * current number of threads. */ + size_t chunk_size = + get_chunk(&thread->range_length, fastest_to_slowest_ratio); + if (!chunk_size) { + break; + } + + /* If this is "our" range, take chunks of tiles from the front, otherwise + * take them from the back. */ + if (tid != 0) { + offset = pthreadpool_decrement_n_fetch_relaxed_size_t( + &thread->range_end, chunk_size); + } + + /* Iterate over the chunk and call the task function. */ + size_t tile_index_i = offset / tile_range_j; + if (tile_range_j == 1) { + /* If there is only a single tile in the `j`th (last) dimension, then we + * group by the `j`th (second-last) dimeension. 
*/ + const size_t index_i = tile_index_i * tile_i; + const size_t tile_step_i = min(tile_range_i - tile_index_i, chunk_size); + const size_t step_i = min(tile_step_i * tile_i, range_i - index_i); + + task(argument, uarch_index, index_i, /*index_j=*/0, step_i, range_j); + + offset += tile_step_i; + } else { + size_t tile_index_j = offset % tile_range_j; + while (chunk_size > 0) { + const size_t index_i = tile_index_i * tile_i; + const size_t index_j = tile_index_j * tile_j; + const size_t step_i = min(tile_i, range_i - index_i); + const size_t tile_step_j = + min(tile_range_j - tile_index_j, chunk_size); + const size_t step_j = min(tile_step_j * tile_j, range_j - index_j); + + task(argument, uarch_index, index_i, index_j, step_i, step_j); + + tile_index_j += tile_step_j; + if (tile_range_j <= tile_index_j) { + tile_index_j -= tile_range_j; + tile_index_i += 1; + } + chunk_size -= tile_step_j; + offset += tile_step_j; + } + } + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_3d(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_3d_t task = + (pthreadpool_task_3d_t)pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t range_k = + threadpool->params.parallelize_3d.range_k; + const struct fxdiv_result_size_t index_ij_k = + fxdiv_divide_size_t(range_start, range_k); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_3d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_k.quotient, range_j); + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = 
index_ij_k.remainder; + + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, i, j, k); + if (++k == range_k.value) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t index_ij_k = + fxdiv_divide_size_t(linear_index, range_k); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_k.quotient, range_j); + task(argument, index_i_j.quotient, index_i_j.remainder, + index_ij_k.remainder); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_3d_tile_1d(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_3d_tile_1d_t task = + (pthreadpool_task_3d_tile_1d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_k = + threadpool->params.parallelize_3d_tile_1d.tile_range_k; + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(range_start, tile_range_k); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_3d_tile_1d.range_j; + 
const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t start_k = tile_index_ij_k.remainder * tile_k; + + const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, i, j, start_k, min(range_k - start_k, tile_k)); + start_k += tile_k; + if (start_k >= range_k) { + start_k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(linear_index, tile_range_k); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t start_k = tile_index_ij_k.remainder * tile_k; + task(argument, index_i_j.quotient, index_i_j.remainder, start_k, + min(range_k - start_k, tile_k)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_3d_tile_1d_with_thread( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_3d_tile_1d_with_thread_t task = + (pthreadpool_task_3d_tile_1d_with_thread_t) + 
pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_k = + threadpool->params.parallelize_3d_tile_1d.tile_range_k; + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(range_start, tile_range_k); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_3d_tile_1d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t start_k = tile_index_ij_k.remainder * tile_k; + + const size_t thread_number = thread->thread_number; + const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, thread_number, i, j, start_k, + min(range_k - start_k, tile_k)); + start_k += tile_k; + if (start_k >= range_k) { + start_k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(linear_index, tile_range_k); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, 
range_j); + const size_t start_k = tile_index_ij_k.remainder * tile_k; + task(argument, thread_number, index_i_j.quotient, index_i_j.remainder, + start_k, min(range_k - start_k, tile_k)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_3d_tile_1d_with_uarch( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_3d_tile_1d_with_id_t task = + (pthreadpool_task_3d_tile_1d_with_id_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const uint32_t default_uarch_index = + threadpool->params.parallelize_3d_tile_1d_with_uarch.default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > + threadpool->params.parallelize_3d_tile_1d_with_uarch.max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_k = + threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_range_k; + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(range_start, tile_range_k); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_3d_tile_1d_with_uarch.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t tile_k = + threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_k; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t start_k = tile_index_ij_k.remainder * tile_k; + + const size_t range_k = + threadpool->params.parallelize_3d_tile_1d_with_uarch.range_k; + while 
(pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, uarch_index, i, j, start_k, min(range_k - start_k, tile_k)); + start_k += tile_k; + if (start_k >= range_k) { + start_k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(linear_index, tile_range_k); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t start_k = tile_index_ij_k.remainder * tile_k; + task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, + start_k, min(range_k - start_k, tile_k)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_3d_tile_1d_with_uarch_with_thread( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_3d_tile_1d_with_id_with_thread_t task = + (pthreadpool_task_3d_tile_1d_with_id_with_thread_t) + pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const uint32_t default_uarch_index = + threadpool->params.parallelize_3d_tile_1d_with_uarch.default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + 
cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > + threadpool->params.parallelize_3d_tile_1d_with_uarch.max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_k = + threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_range_k; + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(range_start, tile_range_k); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_3d_tile_1d_with_uarch.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t tile_k = + threadpool->params.parallelize_3d_tile_1d_with_uarch.tile_k; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t start_k = tile_index_ij_k.remainder * tile_k; + + const size_t thread_number = thread->thread_number; + const size_t range_k = + threadpool->params.parallelize_3d_tile_1d_with_uarch.range_k; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, uarch_index, thread_number, i, j, start_k, + min(range_k - start_k, tile_k)); + start_k += tile_k; + if (start_k >= range_k) { + start_k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_k = + 
fxdiv_divide_size_t(linear_index, tile_range_k); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t start_k = tile_index_ij_k.remainder * tile_k; + task(argument, uarch_index, thread_number, index_i_j.quotient, + index_i_j.remainder, start_k, min(range_k - start_k, tile_k)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_3d_tile_2d(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_3d_tile_2d_t task = + (pthreadpool_task_3d_tile_2d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_k = + threadpool->params.parallelize_3d_tile_2d.tile_range_k; + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(range_start, tile_range_k); + const struct fxdiv_divisor_size_t tile_range_j = + threadpool->params.parallelize_3d_tile_2d.tile_range_j; + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); + const size_t tile_j = threadpool->params.parallelize_3d_tile_2d.tile_j; + const size_t tile_k = threadpool->params.parallelize_3d_tile_2d.tile_k; + size_t i = tile_index_i_j.quotient; + size_t start_j = tile_index_i_j.remainder * tile_j; + size_t start_k = tile_index_ij_k.remainder * tile_k; + + const size_t range_k = threadpool->params.parallelize_3d_tile_2d.range_k; + const size_t range_j = threadpool->params.parallelize_3d_tile_2d.range_j; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, i, start_j, start_k, min(range_j - start_j, tile_j), + 
min(range_k - start_k, tile_k)); + start_k += tile_k; + if (start_k >= range_k) { + start_k = 0; + start_j += tile_j; + if (start_j >= range_j) { + start_j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(linear_index, tile_range_k); + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); + const size_t start_j = tile_index_i_j.remainder * tile_j; + const size_t start_k = tile_index_ij_k.remainder * tile_k; + task(argument, tile_index_i_j.quotient, start_j, start_k, + min(range_j - start_j, tile_j), min(range_k - start_k, tile_k)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_3d_tile_2d_with_uarch( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_3d_tile_2d_with_id_t task = + (pthreadpool_task_3d_tile_2d_with_id_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + const uint32_t default_uarch_index = + threadpool->params.parallelize_3d_tile_2d_with_uarch.default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + 
cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > + threadpool->params.parallelize_3d_tile_2d_with_uarch.max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_k = + threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_k; + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(range_start, tile_range_k); + const struct fxdiv_divisor_size_t tile_range_j = + threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_j; + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); + const size_t tile_j = + threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_j; + const size_t tile_k = + threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_k; + size_t i = tile_index_i_j.quotient; + size_t start_j = tile_index_i_j.remainder * tile_j; + size_t start_k = tile_index_ij_k.remainder * tile_k; + + const size_t range_k = + threadpool->params.parallelize_3d_tile_2d_with_uarch.range_k; + const size_t range_j = + threadpool->params.parallelize_3d_tile_2d_with_uarch.range_j; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, uarch_index, i, start_j, start_k, + min(range_j - start_j, tile_j), min(range_k - start_k, tile_k)); + start_k += tile_k; + if (start_k >= range_k) { + start_k = 0; + start_j += tile_j; + if (start_j >= range_j) { + start_j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = 
&threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_k = + fxdiv_divide_size_t(linear_index, tile_range_k); + const struct fxdiv_result_size_t tile_index_i_j = + fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); + const size_t start_j = tile_index_i_j.remainder * tile_j; + const size_t start_k = tile_index_ij_k.remainder * tile_k; + task(argument, uarch_index, tile_index_i_j.quotient, start_j, start_k, + min(range_j - start_j, tile_j), min(range_k - start_k, tile_k)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_3d_tile_2d_dynamic( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + // Get a handle on the params. + struct pthreadpool_3d_tile_2d_dynamic_params* params = + &threadpool->params.parallelize_3d_tile_2d_dynamic; + const size_t num_threads = threadpool->threads_count.value; + const size_t range_j = params->range_j; + const size_t range_k = params->range_k; + const size_t tile_j = params->tile_j; + const size_t tile_k = params->tile_k; + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range_k = divide_round_up(range_k, tile_k); + const pthreadpool_task_3d_tile_2d_dynamic_t task = + (pthreadpool_task_3d_tile_2d_dynamic_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + const size_t thread_number = thread->thread_number; + const size_t fastest_to_slowest_ratio = get_fastest_to_slowest_ratio(); + + // Do tiles in our own range first (tid = 0), then the other ranges when we're + // done. 
+ for (size_t tid = 0; tid < num_threads; tid++) {
+ struct thread_info* thread =
+ &threadpool->threads[(num_threads + thread_number - tid) % num_threads];
+
+ size_t offset =
+ (tid == 0) ? pthreadpool_load_relaxed_size_t(&thread->range_start) : 0;
+
+ /* Loop as long as there is work to be done. */
+ while (true) {
+ /* Choose a chunk size based on the remaining amount of work and the
+ * current number of threads. */
+ size_t chunk_size =
+ get_chunk(&thread->range_length, fastest_to_slowest_ratio);
+ if (!chunk_size) {
+ break;
+ }
+
+ /* If this is "our" range, take chunks of tiles from the front, otherwise
+ * take them from the back. */
+ if (tid != 0) {
+ offset = pthreadpool_decrement_n_fetch_relaxed_size_t(
+ &thread->range_end, chunk_size);
+ }
+
+ /* Iterate over the chunk and call the task function. */
+ size_t index_i = offset / (tile_range_j * tile_range_k);
+ size_t tile_index_j = (offset / tile_range_k) % tile_range_j;
+ if (tile_range_k == 1) {
+ /* If there is only a single tile in the `k`th (last) dimension, then we
+ * group by the `j`th (second-last) dimension. 
*/ + while (chunk_size > 0) { + const size_t index_j = tile_index_j * tile_j; + const size_t tile_step_j = + min(tile_range_j - tile_index_j, chunk_size); + const size_t step_j = min(tile_step_j * tile_j, range_j - index_j); + + task(argument, index_i, index_j, /*index_k=*/0, step_j, range_k); + + tile_index_j += tile_step_j; + if (tile_range_j <= tile_index_j) { + tile_index_j -= tile_range_j; + index_i += 1; + } + chunk_size -= tile_step_j; + offset += tile_step_j; + } + } else { + size_t tile_index_k = offset % tile_range_k; + while (chunk_size > 0) { + const size_t index_j = tile_index_j * tile_j; + const size_t index_k = tile_index_k * tile_k; + const size_t step_j = min(tile_j, range_j - index_j); + const size_t tile_step_k = + min(tile_range_k - tile_index_k, chunk_size); + const size_t step_k = min(tile_step_k * tile_k, range_k - index_k); + + task(argument, index_i, index_j, index_k, step_j, step_k); + + tile_index_k += tile_step_k; + if (tile_range_k <= tile_index_k) { + tile_index_k -= tile_range_k; + if (tile_range_j <= ++tile_index_j) { + tile_index_j = 0; + index_i += 1; + } + } + chunk_size -= tile_step_k; + offset += tile_step_k; + } + } + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_3d_tile_2d_dynamic_with_uarch( + struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + // Get the uarch. + const uint32_t default_uarch_index = + threadpool->params.parallelize_3d_tile_2d_dynamic_with_uarch + .default_uarch_index; + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > threadpool->params.parallelize_3d_tile_2d_dynamic_with_uarch + .max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + // Get a handle on the params. 
+ struct pthreadpool_3d_tile_2d_dynamic_with_uarch_params* params = + &threadpool->params.parallelize_3d_tile_2d_dynamic_with_uarch; + const size_t num_threads = threadpool->threads_count.value; + const size_t range_j = params->range_j; + const size_t range_k = params->range_k; + const size_t tile_j = params->tile_j; + const size_t tile_k = params->tile_k; + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range_k = divide_round_up(range_k, tile_k); + const pthreadpool_task_3d_tile_2d_dynamic_with_id_t task = + (pthreadpool_task_3d_tile_2d_dynamic_with_id_t) + pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + const size_t thread_number = thread->thread_number; + const size_t fastest_to_slowest_ratio = get_fastest_to_slowest_ratio(); + + // Do tiles in our own range first (tid = 0), then the other ranges when we're + // done. + for (size_t tid = 0; tid < num_threads; tid++) { + struct thread_info* thread = + &threadpool->threads[(num_threads + thread_number - tid) % num_threads]; + + size_t offset = + (tid == 0) ? pthreadpool_load_relaxed_size_t(&thread->range_start) : 0; + + /* Loop as long as there is work to be done. */ + while (true) { + /* Choose a chunk size based on the remaining amount of work and the + * current number of threads. */ + size_t chunk_size = + get_chunk(&thread->range_length, fastest_to_slowest_ratio); + if (!chunk_size) { + break; + } + + /* If this is "our" range, take chunks of tiles from the front, otherwise + * take them from the back. */ + if (tid != 0) { + offset = pthreadpool_decrement_n_fetch_relaxed_size_t( + &thread->range_end, chunk_size); + } + + /* Iterate over the chunk and call the task function. 
*/ + size_t index_i = offset / (tile_range_j * tile_range_k); + size_t tile_index_j = (offset / tile_range_k) % tile_range_j; + if (tile_range_k == 1) { + /* If there is only a single tile in the `k`th (last) dimension, then we + * group by the `j`th (second-last) dimension. */ + while (chunk_size > 0) { + const size_t index_j = tile_index_j * tile_j; + const size_t tile_step_j = + min(tile_range_j - tile_index_j, chunk_size); + const size_t step_j = min(tile_step_j * tile_j, range_j - index_j); + + task(argument, uarch_index, index_i, index_j, /*index_k=*/0, step_j, + range_k); + + tile_index_j += tile_step_j; + if (tile_range_j <= tile_index_j) { + tile_index_j -= tile_range_j; + index_i += 1; + } + chunk_size -= tile_step_j; + offset += tile_step_j; + } + } else { + size_t tile_index_k = offset % tile_range_k; + while (chunk_size > 0) { + const size_t index_j = tile_index_j * tile_j; + const size_t index_k = tile_index_k * tile_k; + const size_t step_j = min(tile_j, range_j - index_j); + const size_t tile_step_k = + min(tile_range_k - tile_index_k, chunk_size); + const size_t step_k = min(tile_step_k * tile_k, range_k - index_k); + + task(argument, uarch_index, index_i, index_j, index_k, step_j, + step_k); + + tile_index_k += tile_step_k; + if (tile_range_k <= tile_index_k) { + tile_index_k -= tile_range_k; + if (tile_range_j <= ++tile_index_j) { + tile_index_j = 0; + index_i += 1; + } + } + chunk_size -= tile_step_k; + offset += tile_step_k; + } + } + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_4d(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_4d_t task = + (pthreadpool_task_4d_t)pthreadpool_load_relaxed_void_p(&threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t 
range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t range_kl = + threadpool->params.parallelize_4d.range_kl; + const struct fxdiv_result_size_t index_ij_kl = + fxdiv_divide_size_t(range_start, range_kl); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_4d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_kl.quotient, range_j); + const struct fxdiv_divisor_size_t range_l = + threadpool->params.parallelize_4d.range_l; + const struct fxdiv_result_size_t index_k_l = + fxdiv_divide_size_t(index_ij_kl.remainder, range_l); + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = index_k_l.quotient; + size_t l = index_k_l.remainder; + + const size_t range_k = threadpool->params.parallelize_4d.range_k; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, i, j, k, l); + if (++l == range_l.value) { + l = 0; + if (++k == range_k) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t index_ij_kl = + fxdiv_divide_size_t(linear_index, range_kl); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_kl.quotient, range_j); + const struct fxdiv_result_size_t index_k_l = + fxdiv_divide_size_t(index_ij_kl.remainder, range_l); + task(argument, index_i_j.quotient, 
index_i_j.remainder, + index_k_l.quotient, index_k_l.remainder); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_4d_tile_1d(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_4d_tile_1d_t task = + (pthreadpool_task_4d_tile_1d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_kl = + threadpool->params.parallelize_4d_tile_1d.tile_range_kl; + const struct fxdiv_result_size_t tile_index_ij_kl = + fxdiv_divide_size_t(range_start, tile_range_kl); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_4d_tile_1d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); + const struct fxdiv_divisor_size_t tile_range_l = + threadpool->params.parallelize_4d_tile_1d.tile_range_l; + const struct fxdiv_result_size_t tile_index_k_l = + fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); + const size_t tile_l = threadpool->params.parallelize_4d_tile_1d.tile_l; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = tile_index_k_l.quotient; + size_t start_l = tile_index_k_l.remainder * tile_l; + + const size_t range_k = threadpool->params.parallelize_4d_tile_1d.range_k; + const size_t range_l = threadpool->params.parallelize_4d_tile_1d.range_l; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, i, j, k, start_l, min(range_l - start_l, tile_l)); + start_l += tile_l; + if (start_l >= range_l) { + start_l = 0; + if (++k == range_k) { + k = 0; + if (++j == range_j.value) { + j = 0; + 
i += 1; + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_kl = + fxdiv_divide_size_t(linear_index, tile_range_kl); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); + const struct fxdiv_result_size_t tile_index_k_l = + fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); + const size_t start_l = tile_index_k_l.remainder * tile_l; + task(argument, index_i_j.quotient, index_i_j.remainder, + tile_index_k_l.quotient, start_l, min(range_l - start_l, tile_l)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_4d_tile_2d(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_4d_tile_2d_t task = + (pthreadpool_task_4d_tile_2d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_kl = + threadpool->params.parallelize_4d_tile_2d.tile_range_kl; + const struct fxdiv_result_size_t tile_index_ij_kl = + fxdiv_divide_size_t(range_start, tile_range_kl); + const struct fxdiv_divisor_size_t range_j = + 
      threadpool->params.parallelize_4d_tile_2d.range_j;
  const struct fxdiv_result_size_t index_i_j =
      fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
  const struct fxdiv_divisor_size_t tile_range_l =
      threadpool->params.parallelize_4d_tile_2d.tile_range_l;
  const struct fxdiv_result_size_t tile_index_k_l =
      fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
  const size_t tile_k = threadpool->params.parallelize_4d_tile_2d.tile_k;
  const size_t tile_l = threadpool->params.parallelize_4d_tile_2d.tile_l;
  /* Decompose the starting linear index into the (i, j, k-tile, l-tile)
   * coordinates at which this thread begins. */
  size_t i = index_i_j.quotient;
  size_t j = index_i_j.remainder;
  size_t start_k = tile_index_k_l.quotient * tile_k;
  size_t start_l = tile_index_k_l.remainder * tile_l;

  const size_t range_l = threadpool->params.parallelize_4d_tile_2d.range_l;
  const size_t range_k = threadpool->params.parallelize_4d_tile_2d.range_k;
  /* Consume items from this thread's own range, advancing the coordinates in
   * row-major order (l tiles fastest, then k tiles, then j, then i). Tile
   * extents passed to the task are clamped at the ends of the k/l ranges. */
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, i, j, start_k, start_l, min(range_k - start_k, tile_k),
         min(range_l - start_l, tile_l));
    start_l += tile_l;
    if (start_l >= range_l) {
      start_l = 0;
      start_k += tile_k;
      if (start_k >= range_k) {
        start_k = 0;
        if (++j == range_j.value) {
          j = 0;
          i += 1;
        }
      }
    }
  }

  /* There still may be other threads with work: walk the other threads'
   * ranges (thread ids descending from our own, modulo threads_count) and
   * steal one item at a time from the tail (range_end) of each. */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number; tid = modulo_decrement(tid, threads_count)) {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (
        pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index =
          pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      /* Recover the 4D tile coordinates from the stolen linear index. */
      const struct fxdiv_result_size_t tile_index_ij_kl =
          fxdiv_divide_size_t(linear_index, tile_range_kl);
      const struct fxdiv_result_size_t index_i_j =
          fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
      const struct fxdiv_result_size_t tile_index_k_l =
          fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
      const size_t start_k = tile_index_k_l.quotient * tile_k;
      const size_t start_l = tile_index_k_l.remainder * tile_l;
      task(argument, index_i_j.quotient, index_i_j.remainder, start_k, start_l,
           min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/*
 * Worker body for a 4D iteration space (i, j, k, l) tiled along the last two
 * dimensions, where the task additionally receives a microarchitecture index.
 * Same work-distribution scheme as thread_parallelize_4d_tile_2d: consume own
 * range first, then steal from other threads' ranges.
 */
static void thread_parallelize_4d_tile_2d_with_uarch(
    struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_4d_tile_2d_with_id_t task =
      (pthreadpool_task_4d_tile_2d_with_id_t)pthreadpool_load_relaxed_void_p(
          &threadpool->task);
  void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  const uint32_t default_uarch_index =
      threadpool->params.parallelize_4d_tile_2d_with_uarch.default_uarch_index;
  uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
  /* Query the uarch index of the core we are currently running on; fall back
   * to the default if it exceeds the caller-provided maximum. */
  uarch_index =
      cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
  if (uarch_index >
      threadpool->params.parallelize_4d_tile_2d_with_uarch.max_uarch_index) {
    uarch_index = default_uarch_index;
  }
#endif

  /* Process thread's own range of items */
  const size_t range_start =
      pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_kl =
      threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_kl;
  const struct fxdiv_result_size_t tile_index_ij_kl =
      fxdiv_divide_size_t(range_start, tile_range_kl);
  const struct fxdiv_divisor_size_t range_j =
      threadpool->params.parallelize_4d_tile_2d_with_uarch.range_j;
  const struct fxdiv_result_size_t index_i_j =
      fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
  const struct fxdiv_divisor_size_t tile_range_l =
      threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_l;
  const struct fxdiv_result_size_t tile_index_k_l =
      fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
  const size_t tile_k =
      threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_k;
  const size_t tile_l =
      threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_l;
  /* Starting (i, j, k-tile, l-tile) coordinates for this thread's range. */
  size_t i = index_i_j.quotient;
  size_t j = index_i_j.remainder;
  size_t start_k = tile_index_k_l.quotient * tile_k;
  size_t start_l = tile_index_k_l.remainder * tile_l;

  const size_t range_l =
      threadpool->params.parallelize_4d_tile_2d_with_uarch.range_l;
  const size_t range_k =
      threadpool->params.parallelize_4d_tile_2d_with_uarch.range_k;
  /* Consume items from this thread's own range in row-major order, clamping
   * the tile extents at the ends of the k and l ranges. */
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, uarch_index, i, j, start_k, start_l,
         min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
    start_l += tile_l;
    if (start_l >= range_l) {
      start_l = 0;
      start_k += tile_k;
      if (start_k >= range_k) {
        start_k = 0;
        if (++j == range_j.value) {
          j = 0;
          i += 1;
        }
      }
    }
  }

  /* There still may be other threads with work: steal items one at a time
   * from the tail (range_end) of the other threads' ranges. */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number; tid = modulo_decrement(tid, threads_count)) {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (
        pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index =
          pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      /* Recover the 4D tile coordinates from the stolen linear index. */
      const struct fxdiv_result_size_t tile_index_ij_kl =
          fxdiv_divide_size_t(linear_index, tile_range_kl);
      const struct fxdiv_result_size_t index_i_j =
          fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
      const struct fxdiv_result_size_t tile_index_k_l =
          fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
      const size_t start_k = tile_index_k_l.quotient * tile_k;
      const size_t start_l = tile_index_k_l.remainder * tile_l;
      task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder,
           start_k, start_l, min(range_k - start_k, tile_k),
           min(range_l - start_l, tile_l));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/*
 * Worker body for the dynamically load-balanced 4D loop tiled along the last
 * two dimensions. Unlike the static variant above, work is taken in variable
 * sized chunks (see get_chunk): from the front of this thread's own range
 * first, then from the back of the other threads' ranges.
 */
static void thread_parallelize_4d_tile_2d_dynamic(
    struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  // Get a handle on the params.
  struct pthreadpool_4d_tile_2d_dynamic_params* params =
      &threadpool->params.parallelize_4d_tile_2d_dynamic;
  const size_t num_threads = threadpool->threads_count.value;
  const size_t range_j = params->range_j;
  const size_t range_k = params->range_k;
  const size_t range_l = params->range_l;
  const size_t tile_k = params->tile_k;
  const size_t tile_l = params->tile_l;
  // Number of tiles along the k and l dimensions (last tile may be partial).
  const size_t tile_range_k = divide_round_up(range_k, tile_k);
  const size_t tile_range_l = divide_round_up(range_l, tile_l);
  const pthreadpool_task_4d_tile_2d_dynamic_t task =
      (pthreadpool_task_4d_tile_2d_dynamic_t)pthreadpool_load_relaxed_void_p(
          &threadpool->task);
  void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
  const size_t thread_number = thread->thread_number;
  const size_t fastest_to_slowest_ratio = get_fastest_to_slowest_ratio();

  // Do tiles in our own range first (tid = 0), then the other ranges when we're
  // done.
  for (size_t tid = 0; tid < num_threads; tid++) {
    struct thread_info* thread =
        &threadpool->threads[(num_threads + thread_number - tid) % num_threads];

    size_t offset =
        (tid == 0) ? pthreadpool_load_relaxed_size_t(&thread->range_start) : 0;

    /* Loop as long as there is work to be done. */
    while (true) {
      /* Choose a chunk size based on the remaining amount of work and the
       * current number of threads.
 */
      size_t chunk_size =
          get_chunk(&thread->range_length, fastest_to_slowest_ratio);
      if (!chunk_size) {
        break;
      }

      /* If this is "our" range, take chunks of tiles from the front, otherwise
       * take them from the back. */
      if (tid != 0) {
        offset = pthreadpool_decrement_n_fetch_relaxed_size_t(
            &thread->range_end, chunk_size);
      }

      /* Iterate over the chunk and call the task function. */
      size_t index_i = offset / (range_j * tile_range_k * tile_range_l);
      size_t index_j = (offset / (tile_range_k * tile_range_l)) % range_j;
      size_t tile_index_k = (offset / tile_range_l) % tile_range_k;
      if (tile_range_l == 1) {
        /* If there is only a single tile in the `l`th (last) dimension, then
         * we group consecutive tiles along the `k`th (second-last) dimension
         * into a single task call covering `step_k` elements of `k`. */
        while (chunk_size > 0) {
          const size_t index_k = tile_index_k * tile_k;
          const size_t tile_step_k =
              min(tile_range_k - tile_index_k, chunk_size);
          const size_t step_k = min(tile_step_k * tile_k, range_k - index_k);

          task(argument, index_i, index_j, index_k, /*index_l=*/0, step_k,
               range_l);

          /* Advance the (i, j, k-tile) coordinates past the processed tiles,
           * wrapping k into j and j into i. */
          tile_index_k += tile_step_k;
          if (tile_range_k <= tile_index_k) {
            tile_index_k -= tile_range_k;
            index_j += 1;
            if (range_j <= index_j) {
              index_j = 0;
              index_i += 1;
            }
          }
          chunk_size -= tile_step_k;
          offset += tile_step_k;
        }
      } else {
        size_t tile_index_l = offset % tile_range_l;
        /* General case: group consecutive tiles along the `l`th (last)
         * dimension into a single task call covering `step_l` elements. */
        while (chunk_size > 0) {
          const size_t index_k = tile_index_k * tile_k;
          const size_t index_l = tile_index_l * tile_l;
          const size_t step_k = min(tile_k, range_k - index_k);
          const size_t tile_step_l =
              min(tile_range_l - tile_index_l, chunk_size);
          const size_t step_l = min(tile_step_l * tile_l, range_l - index_l);

          task(argument, index_i, index_j, index_k, index_l, step_k, step_l);

          /* Advance the coordinates, wrapping l into k, k into j, j into i. */
          tile_index_l += tile_step_l;
          if (tile_range_l <= tile_index_l) {
            tile_index_l -= tile_range_l;
            if (tile_range_k <= ++tile_index_k) {
              tile_index_k = 0;
              index_j += 1;
              if (range_j <= index_j) {
                index_j = 0;
                index_i += 1;
              }
            }
          }
          chunk_size -= tile_step_l;
          offset += tile_step_l;
        }
      }
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/*
 * Worker body for the dynamically load-balanced 4D loop tiled along the last
 * two dimensions, where the task additionally receives a microarchitecture
 * index. Same chunked work-distribution scheme as
 * thread_parallelize_4d_tile_2d_dynamic.
 */
static void thread_parallelize_4d_tile_2d_dynamic_with_uarch(
    struct pthreadpool* threadpool, struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  // Get the uarch.
  const uint32_t default_uarch_index =
      threadpool->params.parallelize_4d_tile_2d_dynamic_with_uarch
          .default_uarch_index;
  uint32_t uarch_index = default_uarch_index;
#if PTHREADPOOL_USE_CPUINFO
  // Query the uarch index of the current core; fall back to the default if it
  // exceeds the caller-provided maximum.
  uarch_index =
      cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
  if (uarch_index > threadpool->params.parallelize_4d_tile_2d_dynamic_with_uarch
                        .max_uarch_index) {
    uarch_index = default_uarch_index;
  }
#endif

  // Get a handle on the params.
  struct pthreadpool_4d_tile_2d_dynamic_with_uarch_params* params =
      &threadpool->params.parallelize_4d_tile_2d_dynamic_with_uarch;
  const size_t num_threads = threadpool->threads_count.value;
  const size_t range_j = params->range_j;
  const size_t range_k = params->range_k;
  const size_t range_l = params->range_l;
  const size_t tile_k = params->tile_k;
  const size_t tile_l = params->tile_l;
  // Number of tiles along the k and l dimensions (last tile may be partial).
  const size_t tile_range_k = divide_round_up(range_k, tile_k);
  const size_t tile_range_l = divide_round_up(range_l, tile_l);
  const pthreadpool_task_4d_tile_2d_dynamic_with_id_t task =
      (pthreadpool_task_4d_tile_2d_dynamic_with_id_t)
          pthreadpool_load_relaxed_void_p(&threadpool->task);
  void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);
  const size_t thread_number = thread->thread_number;
  const size_t fastest_to_slowest_ratio = get_fastest_to_slowest_ratio();

  // Do tiles in our own range first (tid = 0), then the other ranges when we're
  // done.
  for (size_t tid = 0; tid < num_threads; tid++) {
    struct thread_info* thread =
        &threadpool->threads[(num_threads + thread_number - tid) % num_threads];

    size_t offset =
        (tid == 0) ? pthreadpool_load_relaxed_size_t(&thread->range_start) : 0;

    /* Loop as long as there is work to be done. */
    while (true) {
      /* Choose a chunk size based on the remaining amount of work and the
       * current number of threads. */
      size_t chunk_size =
          get_chunk(&thread->range_length, fastest_to_slowest_ratio);
      if (!chunk_size) {
        break;
      }

      /* If this is "our" range, take chunks of tiles from the front, otherwise
       * take them from the back. */
      if (tid != 0) {
        offset = pthreadpool_decrement_n_fetch_relaxed_size_t(
            &thread->range_end, chunk_size);
      }

      /* Iterate over the chunk and call the task function. */
      size_t index_i = offset / (range_j * tile_range_k * tile_range_l);
      size_t index_j = (offset / (tile_range_k * tile_range_l)) % range_j;
      size_t tile_index_k = (offset / tile_range_l) % tile_range_k;
      if (tile_range_l == 1) {
        /* If there is only a single tile in the `l`th (last) dimension, then
         * we group consecutive tiles along the `k`th (second-last) dimension
         * into a single task call covering `step_k` elements of `k`.
 */
        while (chunk_size > 0) {
          const size_t index_k = tile_index_k * tile_k;
          const size_t tile_step_k =
              min(tile_range_k - tile_index_k, chunk_size);
          const size_t step_k = min(tile_step_k * tile_k, range_k - index_k);

          task(argument, uarch_index, index_i, index_j, index_k, /*index_l=*/0,
               step_k, range_l);

          /* Advance the (i, j, k-tile) coordinates past the processed tiles,
           * wrapping k into j and j into i. */
          tile_index_k += tile_step_k;
          if (tile_range_k <= tile_index_k) {
            tile_index_k -= tile_range_k;
            index_j += 1;
            if (range_j <= index_j) {
              index_j = 0;
              index_i += 1;
            }
          }
          chunk_size -= tile_step_k;
          offset += tile_step_k;
        }
      } else {
        size_t tile_index_l = offset % tile_range_l;
        /* General case: group consecutive tiles along the `l`th (last)
         * dimension into a single task call covering `step_l` elements. */
        while (chunk_size > 0) {
          const size_t index_k = tile_index_k * tile_k;
          const size_t index_l = tile_index_l * tile_l;
          const size_t step_k = min(tile_k, range_k - index_k);
          const size_t tile_step_l =
              min(tile_range_l - tile_index_l, chunk_size);
          const size_t step_l = min(tile_step_l * tile_l, range_l - index_l);

          task(argument, uarch_index, index_i, index_j, index_k, index_l,
               step_k, step_l);

          /* Advance the coordinates, wrapping l into k, k into j, j into i. */
          tile_index_l += tile_step_l;
          if (tile_range_l <= tile_index_l) {
            tile_index_l -= tile_range_l;
            if (tile_range_k <= ++tile_index_k) {
              tile_index_k = 0;
              index_j += 1;
              if (range_j <= index_j) {
                index_j = 0;
                index_i += 1;
              }
            }
          }
          chunk_size -= tile_step_l;
          offset += tile_step_l;
        }
      }
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/*
 * Worker body for a plain (untiled) 5D iteration space (i, j, k, l, m).
 * Consumes this thread's own range first, then steals leftover items from the
 * other threads' ranges.
 */
static void thread_parallelize_5d(struct pthreadpool* threadpool,
                                  struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_5d_t task =
      (pthreadpool_task_5d_t)pthreadpool_load_relaxed_void_p(&threadpool->task);
  void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  /* Process thread's own range of items */
  const size_t range_start =
      pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t range_lm =
      threadpool->params.parallelize_5d.range_lm;
  const struct fxdiv_result_size_t index_ijk_lm =
      fxdiv_divide_size_t(range_start, range_lm);
  const struct fxdiv_divisor_size_t range_k =
      threadpool->params.parallelize_5d.range_k;
  const struct fxdiv_result_size_t index_ij_k =
      fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
  const struct fxdiv_divisor_size_t range_m =
      threadpool->params.parallelize_5d.range_m;
  const struct fxdiv_result_size_t index_l_m =
      fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
  const struct fxdiv_divisor_size_t range_j =
      threadpool->params.parallelize_5d.range_j;
  const struct fxdiv_result_size_t index_i_j =
      fxdiv_divide_size_t(index_ij_k.quotient, range_j);
  /* Starting (i, j, k, l, m) coordinates for this thread's range. */
  size_t i = index_i_j.quotient;
  size_t j = index_i_j.remainder;
  size_t k = index_ij_k.remainder;
  size_t l = index_l_m.quotient;
  size_t m = index_l_m.remainder;

  const size_t range_l = threadpool->params.parallelize_5d.range_l;
  /* Consume items from this thread's own range in row-major order
   * (m fastest, then l, k, j, i). */
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, i, j, k, l, m);
    if (++m == range_m.value) {
      m = 0;
      if (++l == range_l) {
        l = 0;
        if (++k == range_k.value) {
          k = 0;
          if (++j == range_j.value) {
            j = 0;
            i += 1;
          }
        }
      }
    }
  }

  /* There still may be other threads with work: steal items one at a time
   * from the tail (range_end) of the other threads' ranges. */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number; tid = modulo_decrement(tid, threads_count)) {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (
        pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index =
          pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      /* Recover the 5D coordinates from the stolen linear index. */
      const struct fxdiv_result_size_t index_ijk_lm =
          fxdiv_divide_size_t(linear_index, range_lm);
      const struct fxdiv_result_size_t index_ij_k =
          fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
      const struct fxdiv_result_size_t index_l_m =
          fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
      const struct fxdiv_result_size_t index_i_j =
          fxdiv_divide_size_t(index_ij_k.quotient, range_j);
      task(argument, index_i_j.quotient, index_i_j.remainder,
           index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder);
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/*
 * Worker body for a 5D iteration space (i, j, k, l, m) tiled along the last
 * dimension. Consumes this thread's own range first, then steals leftover
 * tiles from the other threads' ranges.
 */
static void thread_parallelize_5d_tile_1d(struct pthreadpool* threadpool,
                                          struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_5d_tile_1d_t task =
      (pthreadpool_task_5d_tile_1d_t)pthreadpool_load_relaxed_void_p(
          &threadpool->task);
  void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  /* Process thread's own range of items */
  const size_t range_start =
      pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_m =
      threadpool->params.parallelize_5d_tile_1d.tile_range_m;
  const struct fxdiv_result_size_t tile_index_ijkl_m =
      fxdiv_divide_size_t(range_start, tile_range_m);
  const struct fxdiv_divisor_size_t range_kl =
      threadpool->params.parallelize_5d_tile_1d.range_kl;
  const struct fxdiv_result_size_t index_ij_kl =
      fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
  const struct fxdiv_divisor_size_t range_j =
      threadpool->params.parallelize_5d_tile_1d.range_j;
  const struct fxdiv_result_size_t index_i_j =
      fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
  const struct fxdiv_divisor_size_t range_l =
      threadpool->params.parallelize_5d_tile_1d.range_l;
  const struct fxdiv_result_size_t index_k_l =
      fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
  const size_t tile_m = threadpool->params.parallelize_5d_tile_1d.tile_m;
  /* Starting (i, j, k, l, m-tile) coordinates for this thread's range. */
  size_t i = index_i_j.quotient;
  size_t j = index_i_j.remainder;
  size_t k = index_k_l.quotient;
  size_t l = index_k_l.remainder;
  size_t
start_m = tile_index_ijkl_m.remainder * tile_m;

  const size_t range_m = threadpool->params.parallelize_5d_tile_1d.range_m;
  const size_t range_k = threadpool->params.parallelize_5d_tile_1d.range_k;
  /* Consume tiles from this thread's own range in row-major order (m tiles
   * fastest, then l, k, j, i), clamping the tile extent at the end of m. */
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, i, j, k, l, start_m, min(range_m - start_m, tile_m));
    start_m += tile_m;
    if (start_m >= range_m) {
      start_m = 0;
      if (++l == range_l.value) {
        l = 0;
        if (++k == range_k) {
          k = 0;
          if (++j == range_j.value) {
            j = 0;
            i += 1;
          }
        }
      }
    }
  }

  /* There still may be other threads with work: steal tiles one at a time
   * from the tail (range_end) of the other threads' ranges. */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number; tid = modulo_decrement(tid, threads_count)) {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (
        pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index =
          pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      /* Recover the 5D tile coordinates from the stolen linear index. */
      const struct fxdiv_result_size_t tile_index_ijkl_m =
          fxdiv_divide_size_t(linear_index, tile_range_m);
      const struct fxdiv_result_size_t index_ij_kl =
          fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
      const struct fxdiv_result_size_t index_i_j =
          fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
      const struct fxdiv_result_size_t index_k_l =
          fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
      size_t start_m = tile_index_ijkl_m.remainder * tile_m;
      task(argument, index_i_j.quotient, index_i_j.remainder,
           index_k_l.quotient, index_k_l.remainder, start_m,
           min(range_m - start_m, tile_m));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/*
 * Worker body for a 5D iteration space (i, j, k, l, m) tiled along the last
 * two dimensions. Consumes this thread's own range first, then steals
 * leftover tiles from the other threads' ranges.
 */
static void thread_parallelize_5d_tile_2d(struct pthreadpool* threadpool,
                                          struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread !=
NULL);

  const pthreadpool_task_5d_tile_2d_t task =
      (pthreadpool_task_5d_tile_2d_t)pthreadpool_load_relaxed_void_p(
          &threadpool->task);
  void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  /* Process thread's own range of items */
  const size_t range_start =
      pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_lm =
      threadpool->params.parallelize_5d_tile_2d.tile_range_lm;
  const struct fxdiv_result_size_t tile_index_ijk_lm =
      fxdiv_divide_size_t(range_start, tile_range_lm);
  const struct fxdiv_divisor_size_t range_k =
      threadpool->params.parallelize_5d_tile_2d.range_k;
  const struct fxdiv_result_size_t index_ij_k =
      fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
  const struct fxdiv_divisor_size_t tile_range_m =
      threadpool->params.parallelize_5d_tile_2d.tile_range_m;
  const struct fxdiv_result_size_t tile_index_l_m =
      fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
  const struct fxdiv_divisor_size_t range_j =
      threadpool->params.parallelize_5d_tile_2d.range_j;
  const struct fxdiv_result_size_t index_i_j =
      fxdiv_divide_size_t(index_ij_k.quotient, range_j);
  const size_t tile_l = threadpool->params.parallelize_5d_tile_2d.tile_l;
  const size_t tile_m = threadpool->params.parallelize_5d_tile_2d.tile_m;
  /* Starting (i, j, k, l-tile, m-tile) coordinates for this thread's range. */
  size_t i = index_i_j.quotient;
  size_t j = index_i_j.remainder;
  size_t k = index_ij_k.remainder;
  size_t start_l = tile_index_l_m.quotient * tile_l;
  size_t start_m = tile_index_l_m.remainder * tile_m;

  const size_t range_m = threadpool->params.parallelize_5d_tile_2d.range_m;
  const size_t range_l = threadpool->params.parallelize_5d_tile_2d.range_l;
  /* Consume tiles from this thread's own range in row-major order (m tiles
   * fastest, then l tiles, k, j, i), clamping tile extents at the ends of the
   * l and m ranges. */
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, i, j, k, start_l, start_m, min(range_l - start_l, tile_l),
         min(range_m - start_m, tile_m));
    start_m += tile_m;
    if (start_m >= range_m) {
      start_m = 0;
      start_l += tile_l;
      if
(start_l >= range_l) {
        start_l = 0;
        if (++k == range_k.value) {
          k = 0;
          if (++j == range_j.value) {
            j = 0;
            i += 1;
          }
        }
      }
    }
  }

  /* There still may be other threads with work: steal tiles one at a time
   * from the tail (range_end) of the other threads' ranges. */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number; tid = modulo_decrement(tid, threads_count)) {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (
        pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index =
          pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      /* Recover the 5D tile coordinates from the stolen linear index. */
      const struct fxdiv_result_size_t tile_index_ijk_lm =
          fxdiv_divide_size_t(linear_index, tile_range_lm);
      const struct fxdiv_result_size_t index_ij_k =
          fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
      const struct fxdiv_result_size_t tile_index_l_m =
          fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
      const struct fxdiv_result_size_t index_i_j =
          fxdiv_divide_size_t(index_ij_k.quotient, range_j);
      const size_t start_l = tile_index_l_m.quotient * tile_l;
      const size_t start_m = tile_index_l_m.remainder * tile_m;
      task(argument, index_i_j.quotient, index_i_j.remainder,
           index_ij_k.remainder, start_l, start_m,
           min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/*
 * Worker body for a plain (untiled) 6D iteration space (i, j, k, l, m, n).
 * Consumes this thread's own range first, then steals leftover items from the
 * other threads' ranges.
 */
static void thread_parallelize_6d(struct pthreadpool* threadpool,
                                  struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_6d_t task =
      (pthreadpool_task_6d_t)pthreadpool_load_relaxed_void_p(&threadpool->task);
  void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  /* Process thread's own range of items */
  const size_t range_start =
      pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t range_lmn =
      threadpool->params.parallelize_6d.range_lmn;
  const struct fxdiv_result_size_t index_ijk_lmn =
      fxdiv_divide_size_t(range_start, range_lmn);
  const struct fxdiv_divisor_size_t range_k =
      threadpool->params.parallelize_6d.range_k;
  const struct fxdiv_result_size_t index_ij_k =
      fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
  const struct fxdiv_divisor_size_t range_n =
      threadpool->params.parallelize_6d.range_n;
  const struct fxdiv_result_size_t index_lm_n =
      fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
  const struct fxdiv_divisor_size_t range_j =
      threadpool->params.parallelize_6d.range_j;
  const struct fxdiv_result_size_t index_i_j =
      fxdiv_divide_size_t(index_ij_k.quotient, range_j);
  const struct fxdiv_divisor_size_t range_m =
      threadpool->params.parallelize_6d.range_m;
  const struct fxdiv_result_size_t index_l_m =
      fxdiv_divide_size_t(index_lm_n.quotient, range_m);
  /* Starting (i, j, k, l, m, n) coordinates for this thread's range. */
  size_t i = index_i_j.quotient;
  size_t j = index_i_j.remainder;
  size_t k = index_ij_k.remainder;
  size_t l = index_l_m.quotient;
  size_t m = index_l_m.remainder;
  size_t n = index_lm_n.remainder;

  const size_t range_l = threadpool->params.parallelize_6d.range_l;
  /* Consume items from this thread's own range in row-major order
   * (n fastest, then m, l, k, j, i). */
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, i, j, k, l, m, n);
    if (++n == range_n.value) {
      n = 0;
      if (++m == range_m.value) {
        m = 0;
        if (++l == range_l) {
          l = 0;
          if (++k == range_k.value) {
            k = 0;
            if (++j == range_j.value) {
              j = 0;
              i += 1;
            }
          }
        }
      }
    }
  }

  /* There still may be other threads with work: steal items one at a time
   * from the tail (range_end) of the other threads' ranges. */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number; tid = modulo_decrement(tid, threads_count)) {
    struct thread_info* other_thread =
&threadpool->threads[tid];
    while (
        pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index =
          pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      /* Recover the 6D coordinates from the stolen linear index. */
      const struct fxdiv_result_size_t index_ijk_lmn =
          fxdiv_divide_size_t(linear_index, range_lmn);
      const struct fxdiv_result_size_t index_ij_k =
          fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
      const struct fxdiv_result_size_t index_lm_n =
          fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
      const struct fxdiv_result_size_t index_i_j =
          fxdiv_divide_size_t(index_ij_k.quotient, range_j);
      const struct fxdiv_result_size_t index_l_m =
          fxdiv_divide_size_t(index_lm_n.quotient, range_m);
      task(argument, index_i_j.quotient, index_i_j.remainder,
           index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder,
           index_lm_n.remainder);
    }
  }

  /* Make changes by this thread visible to other threads */
  pthreadpool_fence_release();
}

/*
 * Worker body for a 6D iteration space (i, j, k, l, m, n) tiled along the
 * last dimension. Consumes this thread's own range first, then steals
 * leftover tiles from the other threads' ranges.
 */
static void thread_parallelize_6d_tile_1d(struct pthreadpool* threadpool,
                                          struct thread_info* thread) {
  assert(threadpool != NULL);
  assert(thread != NULL);

  const pthreadpool_task_6d_tile_1d_t task =
      (pthreadpool_task_6d_tile_1d_t)pthreadpool_load_relaxed_void_p(
          &threadpool->task);
  void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

  /* Process thread's own range of items */
  const size_t range_start =
      pthreadpool_load_relaxed_size_t(&thread->range_start);
  const struct fxdiv_divisor_size_t tile_range_lmn =
      threadpool->params.parallelize_6d_tile_1d.tile_range_lmn;
  const struct fxdiv_result_size_t tile_index_ijk_lmn =
      fxdiv_divide_size_t(range_start, tile_range_lmn);
  const struct fxdiv_divisor_size_t range_k =
      threadpool->params.parallelize_6d_tile_1d.range_k;
  const struct fxdiv_result_size_t index_ij_k =
      fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
  const struct fxdiv_divisor_size_t tile_range_n =
      threadpool->params.parallelize_6d_tile_1d.tile_range_n;
  const struct fxdiv_result_size_t tile_index_lm_n =
      fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
  const struct fxdiv_divisor_size_t range_j =
      threadpool->params.parallelize_6d_tile_1d.range_j;
  const struct fxdiv_result_size_t index_i_j =
      fxdiv_divide_size_t(index_ij_k.quotient, range_j);
  const struct fxdiv_divisor_size_t range_m =
      threadpool->params.parallelize_6d_tile_1d.range_m;
  const struct fxdiv_result_size_t index_l_m =
      fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
  const size_t tile_n = threadpool->params.parallelize_6d_tile_1d.tile_n;
  /* Starting (i, j, k, l, m, n-tile) coordinates for this thread's range. */
  size_t i = index_i_j.quotient;
  size_t j = index_i_j.remainder;
  size_t k = index_ij_k.remainder;
  size_t l = index_l_m.quotient;
  size_t m = index_l_m.remainder;
  size_t start_n = tile_index_lm_n.remainder * tile_n;

  const size_t range_n = threadpool->params.parallelize_6d_tile_1d.range_n;
  const size_t range_l = threadpool->params.parallelize_6d_tile_1d.range_l;
  /* Consume tiles from this thread's own range in row-major order (n tiles
   * fastest, then m, l, k, j, i), clamping the tile extent at the end of n. */
  while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) {
    task(argument, i, j, k, l, m, start_n, min(range_n - start_n, tile_n));
    start_n += tile_n;
    if (start_n >= range_n) {
      start_n = 0;
      if (++m == range_m.value) {
        m = 0;
        if (++l == range_l) {
          l = 0;
          if (++k == range_k.value) {
            k = 0;
            if (++j == range_j.value) {
              j = 0;
              i += 1;
            }
          }
        }
      }
    }
  }

  /* There still may be other threads with work: steal tiles one at a time
   * from the tail (range_end) of the other threads' ranges. */
  const size_t thread_number = thread->thread_number;
  const size_t threads_count = threadpool->threads_count.value;
  for (size_t tid = modulo_decrement(thread_number, threads_count);
       tid != thread_number; tid = modulo_decrement(tid, threads_count)) {
    struct thread_info* other_thread = &threadpool->threads[tid];
    while (
        pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) {
      const size_t linear_index =
          pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
      const
struct fxdiv_result_size_t tile_index_ijk_lmn = + fxdiv_divide_size_t(linear_index, tile_range_lmn); + const struct fxdiv_result_size_t index_ij_k = + fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k); + const struct fxdiv_result_size_t tile_index_lm_n = + fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_k.quotient, range_j); + const struct fxdiv_result_size_t index_l_m = + fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m); + const size_t start_n = tile_index_lm_n.remainder * tile_n; + task(argument, index_i_j.quotient, index_i_j.remainder, + index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, + start_n, min(range_n - start_n, tile_n)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_6d_tile_2d(struct pthreadpool* threadpool, + struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_6d_tile_2d_t task = + (pthreadpool_task_6d_tile_2d_t)pthreadpool_load_relaxed_void_p( + &threadpool->task); + void* const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = + pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_mn = + threadpool->params.parallelize_6d_tile_2d.tile_range_mn; + const struct fxdiv_result_size_t tile_index_ijkl_mn = + fxdiv_divide_size_t(range_start, tile_range_mn); + const struct fxdiv_divisor_size_t range_kl = + threadpool->params.parallelize_6d_tile_2d.range_kl; + const struct fxdiv_result_size_t index_ij_kl = + fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl); + const struct fxdiv_divisor_size_t tile_range_n = + threadpool->params.parallelize_6d_tile_2d.tile_range_n; + const struct fxdiv_result_size_t tile_index_m_n = + 
fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n); + const struct fxdiv_divisor_size_t range_j = + threadpool->params.parallelize_6d_tile_2d.range_j; + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_kl.quotient, range_j); + const struct fxdiv_divisor_size_t range_l = + threadpool->params.parallelize_6d_tile_2d.range_l; + const struct fxdiv_result_size_t index_k_l = + fxdiv_divide_size_t(index_ij_kl.remainder, range_l); + const size_t tile_m = threadpool->params.parallelize_6d_tile_2d.tile_m; + const size_t tile_n = threadpool->params.parallelize_6d_tile_2d.tile_n; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = index_k_l.quotient; + size_t l = index_k_l.remainder; + size_t start_m = tile_index_m_n.quotient * tile_m; + size_t start_n = tile_index_m_n.remainder * tile_n; + + const size_t range_n = threadpool->params.parallelize_6d_tile_2d.range_n; + const size_t range_m = threadpool->params.parallelize_6d_tile_2d.range_m; + const size_t range_k = threadpool->params.parallelize_6d_tile_2d.range_k; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, i, j, k, l, start_m, start_n, min(range_m - start_m, tile_m), + min(range_n - start_n, tile_n)); + start_n += tile_n; + if (start_n >= range_n) { + start_n = 0; + start_m += tile_m; + if (start_m >= range_m) { + start_m = 0; + if (++l == range_l.value) { + l = 0; + if (++k == range_k) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; tid = modulo_decrement(tid, threads_count)) { + struct thread_info* other_thread = &threadpool->threads[tid]; + while ( + 
pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = + pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ijkl_mn = + fxdiv_divide_size_t(linear_index, tile_range_mn); + const struct fxdiv_result_size_t index_ij_kl = + fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl); + const struct fxdiv_result_size_t tile_index_m_n = + fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n); + const struct fxdiv_result_size_t index_i_j = + fxdiv_divide_size_t(index_ij_kl.quotient, range_j); + const struct fxdiv_result_size_t index_k_l = + fxdiv_divide_size_t(index_ij_kl.remainder, range_l); + const size_t start_m = tile_index_m_n.quotient * tile_m; + const size_t start_n = tile_index_m_n.remainder * tile_n; + task(argument, index_i_j.quotient, index_i_j.remainder, + index_k_l.quotient, index_k_l.remainder, start_m, start_n, + min(range_m - start_m, tile_m), min(range_n - start_n, tile_n)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +PTHREADPOOL_WEAK void pthreadpool_parallelize_1d(struct pthreadpool* threadpool, + pthreadpool_task_1d_t function, + void* context, size_t range, + uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || range <= 1) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range; i++) { + function(context, i); + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + thread_function_t parallelize_1d = &thread_parallelize_1d; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (range < range_threshold) 
{ + parallelize_1d = &pthreadpool_thread_parallelize_1d_fastpath; + } +#endif + pthreadpool_parallelize(threadpool, parallelize_1d, NULL, 0, + (void*)function, context, range, flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_1d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_1d_with_thread( + struct pthreadpool* threadpool, pthreadpool_task_1d_with_thread_t function, + void* context, size_t range, uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || range <= 1) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range; i++) { + function(context, 0, i); + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + thread_function_t parallelize_1d_with_thread = + &thread_parallelize_1d_with_thread; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (range < range_threshold) { + parallelize_1d_with_thread = + &pthreadpool_thread_parallelize_1d_with_thread_fastpath; + } +#endif + pthreadpool_parallelize(threadpool, parallelize_1d_with_thread, NULL, 0, + (void*)function, context, range, flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_1d_with_thread) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_1d_with_uarch( + pthreadpool_t threadpool, pthreadpool_task_1d_with_id_t function, + void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, + size_t range, uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || range <= 1) { + /* No thread pool used: execute task sequentially on the calling thread */ + + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + 
uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range; i++) { + function(context, uarch_index, i); + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const struct pthreadpool_1d_with_uarch_params params = { + .default_uarch_index = default_uarch_index, + .max_uarch_index = max_uarch_index, + }; + thread_function_t parallelize_1d_with_uarch = + &thread_parallelize_1d_with_uarch; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (range < range_threshold) { + parallelize_1d_with_uarch = + &pthreadpool_thread_parallelize_1d_with_uarch_fastpath; + } +#endif + pthreadpool_parallelize(threadpool, parallelize_1d_with_uarch, ¶ms, + sizeof(params), function, context, range, flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_1d_with_uarch) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_1d_tile_1d( + pthreadpool_t threadpool, pthreadpool_task_1d_tile_1d_t function, + void* context, size_t range, size_t tile, uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || range <= tile) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range; i += tile) { + function(context, i, min(range - i, tile)); + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range = divide_round_up(range, tile); + const struct 
pthreadpool_1d_tile_1d_params params = { + .range = range, + .tile = tile, + }; + thread_function_t parallelize_1d_tile_1d = &thread_parallelize_1d_tile_1d; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (range < range_threshold) { + parallelize_1d_tile_1d = + &pthreadpool_thread_parallelize_1d_tile_1d_fastpath; + } +#endif + pthreadpool_parallelize(threadpool, parallelize_1d_tile_1d, ¶ms, + sizeof(params), function, context, tile_range, + flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_1d_tile_1d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_1d_tile_1d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_1d_tile_1d_dynamic_t function, + void* context, size_t range, size_t tile, uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || range <= tile) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + function(context, 0, range); + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range = divide_round_up(range, tile); + const struct pthreadpool_1d_tile_1d_dynamic_params params = { + .range = range, + .tile = tile, + }; + pthreadpool_parallelize(threadpool, thread_parallelize_1d_tile_1d_dynamic, + ¶ms, sizeof(params), function, context, + tile_range, flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_1d_tile_1d_dynamic) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d(pthreadpool_t threadpool, + pthreadpool_task_2d_t function, + void* context, size_t range_i, + size_t range_j, + uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || + (range_i | range_j) <= 1) { + /* No thread pool used: 
execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + function(context, i, j); + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t range = range_i * range_j; + const struct pthreadpool_2d_params params = { + .range_j = fxdiv_init_size_t(range_j), + }; + thread_function_t parallelize_2d = &thread_parallelize_2d; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (range < range_threshold) { + parallelize_2d = &pthreadpool_thread_parallelize_2d_fastpath; + } +#endif + pthreadpool_parallelize(threadpool, parallelize_2d, ¶ms, sizeof(params), + function, context, range, flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_with_thread( + pthreadpool_t threadpool, pthreadpool_task_2d_with_thread_t function, + void* context, size_t range_i, size_t range_j, uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || + (range_i | range_j) <= 1) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + function(context, 0, i, j); + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t range = range_i * range_j; + const struct pthreadpool_2d_params params = { + .range_j = fxdiv_init_size_t(range_j), + }; + thread_function_t parallelize_2d_with_thread = + 
&thread_parallelize_2d_with_thread; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (range < range_threshold) { + parallelize_2d_with_thread = + &pthreadpool_thread_parallelize_2d_with_thread_fastpath; + } +#endif + pthreadpool_parallelize(threadpool, parallelize_2d_with_thread, ¶ms, + sizeof(params), function, context, range, flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_with_thread) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_1d( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_1d_t function, + void* context, size_t range_i, size_t range_j, size_t tile_j, + uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || + (range_i <= 1 && range_j <= tile_j)) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j += tile_j) { + function(context, i, j, min(range_j - j, tile_j)); + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range = range_i * tile_range_j; + const struct pthreadpool_2d_tile_1d_params params = { + .range_j = range_j, + .tile_j = tile_j, + .tile_range_j = fxdiv_init_size_t(tile_range_j), + }; + thread_function_t parallelize_2d_tile_1d = &thread_parallelize_2d_tile_1d; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (tile_range < range_threshold) { + parallelize_2d_tile_1d = + &pthreadpool_thread_parallelize_2d_tile_1d_fastpath; + } +#endif + pthreadpool_parallelize(threadpool, parallelize_2d_tile_1d, ¶ms, + sizeof(params), function, context, tile_range, + flags); + } +} + 
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_1d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_1d_with_uarch( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_1d_with_id_t function, + void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, + size_t range_i, size_t range_j, size_t tile_j, uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || + (range_i <= 1 && range_j <= tile_j)) { + /* No thread pool used: execute task sequentially on the calling thread */ + + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j += tile_j) { + function(context, uarch_index, i, j, min(range_j - j, tile_j)); + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range = range_i * tile_range_j; + const struct pthreadpool_2d_tile_1d_with_uarch_params params = { + .default_uarch_index = default_uarch_index, + .max_uarch_index = max_uarch_index, + .range_j = range_j, + .tile_j = tile_j, + .tile_range_j = fxdiv_init_size_t(tile_range_j), + }; + thread_function_t parallelize_2d_tile_1d_with_uarch = + &thread_parallelize_2d_tile_1d_with_uarch; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (tile_range < range_threshold) { + parallelize_2d_tile_1d_with_uarch = + &pthreadpool_thread_parallelize_2d_tile_1d_with_uarch_fastpath; + } +#endif + pthreadpool_parallelize(threadpool, 
parallelize_2d_tile_1d_with_uarch, + ¶ms, sizeof(params), function, context, + tile_range, flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_1d_with_uarch) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_1d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_1d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t tile_j, + uint32_t flags) { + if (threadpool == NULL || threadpool->threads_count.value <= 1 || + (range_i <= 1 && range_j <= tile_j)) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t index_i = 0; index_i < range_i; index_i++) { + function(context, index_i, /*index_j=*/0, range_j); + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range = range_i * tile_range_j; + const struct pthreadpool_2d_tile_1d_dynamic_params params = { + .range_i = range_i, + .range_j = range_j, + .tile_j = tile_j, + }; + pthreadpool_parallelize(threadpool, thread_parallelize_2d_tile_1d_dynamic, + ¶ms, sizeof(params), function, context, + tile_range, flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_1d_dynamic) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + pthreadpool_t threadpool, + pthreadpool_task_2d_tile_1d_with_id_with_thread_t function, void* context, + uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i, + size_t range_j, size_t tile_j, uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || + (range_i <= 1 && range_j <= tile_j)) { + /* No thread pool used: execute task sequentially on the calling thread */ + + 
uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j += tile_j) { + function(context, uarch_index, 0, i, j, min(range_j - j, tile_j)); + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range = range_i * tile_range_j; + const struct pthreadpool_2d_tile_1d_with_uarch_params params = { + .default_uarch_index = default_uarch_index, + .max_uarch_index = max_uarch_index, + .range_j = range_j, + .tile_j = tile_j, + .tile_range_j = fxdiv_init_size_t(tile_range_j), + }; + thread_function_t parallelize_2d_tile_1d_with_uarch_with_thread = + &thread_parallelize_2d_tile_1d_with_uarch_with_thread; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (tile_range < range_threshold) { + parallelize_2d_tile_1d_with_uarch_with_thread = + &pthreadpool_thread_parallelize_2d_tile_1d_with_uarch_with_thread_fastpath; + } +#endif + pthreadpool_parallelize( + threadpool, parallelize_2d_tile_1d_with_uarch_with_thread, ¶ms, + sizeof(params), function, context, tile_range, flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_2d( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_t function, + void* context, size_t range_i, size_t range_j, size_t tile_i, size_t tile_j, + uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = 
threadpool->threads_count.value) <= 1 || + (range_i <= tile_i && range_j <= tile_j)) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i += tile_i) { + for (size_t j = 0; j < range_j; j += tile_j) { + function(context, i, j, min(range_i - i, tile_i), + min(range_j - j, tile_j)); + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_i = divide_round_up(range_i, tile_i); + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range = tile_range_i * tile_range_j; + const struct pthreadpool_2d_tile_2d_params params = { + .range_i = range_i, + .tile_i = tile_i, + .range_j = range_j, + .tile_j = tile_j, + .tile_range_j = fxdiv_init_size_t(tile_range_j), + }; + thread_function_t parallelize_2d_tile_2d = &thread_parallelize_2d_tile_2d; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (tile_range < range_threshold) { + parallelize_2d_tile_2d = + &pthreadpool_thread_parallelize_2d_tile_2d_fastpath; + } +#endif + pthreadpool_parallelize(threadpool, parallelize_2d_tile_2d, ¶ms, + sizeof(params), function, context, tile_range, + flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_2d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_2d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t tile_i, size_t tile_j, + uint32_t flags) { + if (threadpool == NULL || threadpool->threads_count.value <= 1 || + (range_i <= tile_i && range_j <= tile_j)) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & 
PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + if (range_j <= tile_j) { + function(context, /*index_i=*/0, /*index_j=*/0, range_i, range_j); + } else { + for (size_t index_i = 0; index_i < range_i; index_i += tile_i) { + function(context, index_i, /*index_j=*/0, + min(tile_i, range_i - index_i), range_j); + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_i = divide_round_up(range_i, tile_i); + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range = tile_range_i * tile_range_j; + const struct pthreadpool_2d_tile_2d_dynamic_params params = { + .range_i = range_i, + .range_j = range_j, + .tile_i = tile_i, + .tile_j = tile_j, + }; + pthreadpool_parallelize(threadpool, thread_parallelize_2d_tile_2d_dynamic, + ¶ms, sizeof(params), function, context, + tile_range, flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_2d_dynamic) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + pthreadpool_t threadpool, + pthreadpool_task_2d_tile_2d_dynamic_with_id_t function, void* context, + uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i, + size_t range_j, size_t tile_i, size_t tile_j, uint32_t flags) { + if (threadpool == NULL || threadpool->threads_count.value <= 1 || + (range_i <= tile_i && range_j <= tile_j)) { + /* No thread pool used: execute task sequentially on the calling thread */ + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + if (range_j <= tile_j) { + 
function(context, uarch_index, /*index_i=*/0, /*index_j=*/0, range_i, + range_j); + } else { + for (size_t index_i = 0; index_i < range_i; index_i += tile_i) { + function(context, uarch_index, index_i, /*index_j=*/0, + min(tile_i, range_i - index_i), range_j); + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_i = divide_round_up(range_i, tile_i); + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range = tile_range_i * tile_range_j; + const struct pthreadpool_2d_tile_2d_dynamic_with_uarch_params params = { + .range_i = range_i, + .range_j = range_j, + .tile_i = tile_i, + .tile_j = tile_j, + .default_uarch_index = default_uarch_index, + .max_uarch_index = max_uarch_index, + }; + pthreadpool_parallelize( + threadpool, thread_parallelize_2d_tile_2d_dynamic_with_uarch, ¶ms, + sizeof(params), function, context, tile_range, flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_2d_with_uarch( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_with_id_t function, + void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, + size_t range_i, size_t range_j, size_t tile_i, size_t tile_j, + uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || + (range_i <= tile_i && range_j <= tile_j)) { + /* No thread pool used: execute task sequentially on the calling thread */ + + uint32_t uarch_index = default_uarch_index; +#if PTHREADPOOL_USE_CPUINFO + uarch_index = + cpuinfo_get_current_uarch_index_with_default(default_uarch_index); + if (uarch_index > max_uarch_index) { + uarch_index = default_uarch_index; + } +#endif + + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } 
+ for (size_t i = 0; i < range_i; i += tile_i) { + for (size_t j = 0; j < range_j; j += tile_j) { + function(context, uarch_index, i, j, min(range_i - i, tile_i), + min(range_j - j, tile_j)); + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_i = divide_round_up(range_i, tile_i); + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range = tile_range_i * tile_range_j; + const struct pthreadpool_2d_tile_2d_with_uarch_params params = { + .default_uarch_index = default_uarch_index, + .max_uarch_index = max_uarch_index, + .range_i = range_i, + .tile_i = tile_i, + .range_j = range_j, + .tile_j = tile_j, + .tile_range_j = fxdiv_init_size_t(tile_range_j), + }; + thread_function_t parallelize_2d_tile_2d_with_uarch = + &thread_parallelize_2d_tile_2d_with_uarch; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (tile_range < range_threshold) { + parallelize_2d_tile_2d_with_uarch = + &pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath; + } +#endif + pthreadpool_parallelize(threadpool, parallelize_2d_tile_2d_with_uarch, + ¶ms, sizeof(params), function, context, + tile_range, flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_2d_with_uarch) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_3d(pthreadpool_t threadpool, + pthreadpool_task_3d_t function, + void* context, size_t range_i, + size_t range_j, size_t range_k, + uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || + (range_i | range_j | range_k) <= 1) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) 
{ + for (size_t k = 0; k < range_k; k++) { + function(context, i, j, k); + } + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t range = range_i * range_j * range_k; + const struct pthreadpool_3d_params params = { + .range_j = fxdiv_init_size_t(range_j), + .range_k = fxdiv_init_size_t(range_k), + }; + thread_function_t parallelize_3d = &thread_parallelize_3d; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (range < range_threshold) { + parallelize_3d = &pthreadpool_thread_parallelize_3d_fastpath; + } +#endif + pthreadpool_parallelize(threadpool, parallelize_3d, ¶ms, sizeof(params), + function, context, range, flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_1d( + pthreadpool_t threadpool, pthreadpool_task_3d_tile_1d_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t tile_k, uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || + ((range_i | range_j) <= 1 && range_k <= tile_k)) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k += tile_k) { + function(context, i, j, k, min(range_k - k, tile_k)); + } + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_k = divide_round_up(range_k, tile_k); + const size_t tile_range = range_i * range_j * tile_range_k; + const struct pthreadpool_3d_tile_1d_params params = { + .range_k = range_k, + .tile_k = tile_k, + .range_j = fxdiv_init_size_t(range_j), + 
.tile_range_k = fxdiv_init_size_t(tile_range_k), + }; + thread_function_t parallelize_3d_tile_1d = &thread_parallelize_3d_tile_1d; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (tile_range < range_threshold) { + parallelize_3d_tile_1d = + &pthreadpool_thread_parallelize_3d_tile_1d_fastpath; + } +#endif + pthreadpool_parallelize(threadpool, parallelize_3d_tile_1d, ¶ms, + sizeof(params), function, context, tile_range, + flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d_tile_1d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_1d_with_thread( + pthreadpool_t threadpool, + pthreadpool_task_3d_tile_1d_with_thread_t function, void* context, + size_t range_i, size_t range_j, size_t range_k, size_t tile_k, + uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || + ((range_i | range_j) <= 1 && range_k <= tile_k)) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k += tile_k) { + function(context, 0, i, j, k, min(range_k - k, tile_k)); + } + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_k = divide_round_up(range_k, tile_k); + const size_t tile_range = range_i * range_j * tile_range_k; + const struct pthreadpool_3d_tile_1d_params params = { + .range_k = range_k, + .tile_k = tile_k, + .range_j = fxdiv_init_size_t(range_j), + .tile_range_k = fxdiv_init_size_t(tile_range_k), + }; + thread_function_t parallelize_3d_tile_1d_with_thread = + &thread_parallelize_3d_tile_1d_with_thread; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = 
-threads_count;
+ if (tile_range < range_threshold) {
+ parallelize_3d_tile_1d_with_thread =
+ &pthreadpool_thread_parallelize_3d_tile_1d_with_thread_fastpath;
+ }
+#endif
+ pthreadpool_parallelize(threadpool, parallelize_3d_tile_1d_with_thread,
+ &params, sizeof(params), function, context,
+ tile_range, flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d_tile_1d_with_thread)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_1d_with_uarch(
+ pthreadpool_t threadpool, pthreadpool_task_3d_tile_1d_with_id_t function,
+ void* context, uint32_t default_uarch_index, uint32_t max_uarch_index,
+ size_t range_i, size_t range_j, size_t range_k, size_t tile_k,
+ uint32_t flags) {
+ size_t threads_count;
+ if (threadpool == NULL ||
+ (threads_count = threadpool->threads_count.value) <= 1 ||
+ ((range_i | range_j) <= 1 && range_k <= tile_k)) {
+ /* No thread pool used: execute task sequentially on the calling thread */
+
+ uint32_t uarch_index = default_uarch_index;
+#if PTHREADPOOL_USE_CPUINFO
+ uarch_index =
+ cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
+ if (uarch_index > max_uarch_index) {
+ uarch_index = default_uarch_index;
+ }
+#endif
+
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ for (size_t i = 0; i < range_i; i++) {
+ for (size_t j = 0; j < range_j; j++) {
+ for (size_t k = 0; k < range_k; k += tile_k) {
+ function(context, uarch_index, i, j, k, min(range_k - k, tile_k));
+ }
+ }
+ }
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const size_t tile_range_k = divide_round_up(range_k, tile_k);
+ const size_t tile_range = range_i * range_j * tile_range_k;
+ const struct pthreadpool_3d_tile_1d_with_uarch_params params = {
+ .default_uarch_index = default_uarch_index,
+ .max_uarch_index = max_uarch_index,
+ .range_k = range_k,
+ .tile_k = tile_k,
+ .range_j = 
fxdiv_init_size_t(range_j),
+ .tile_range_k = fxdiv_init_size_t(tile_range_k),
+ };
+ thread_function_t parallelize_3d_tile_1d_with_uarch =
+ &thread_parallelize_3d_tile_1d_with_uarch;
+#if PTHREADPOOL_USE_FASTPATH
+ const size_t range_threshold = -threads_count;
+ if (tile_range < range_threshold) {
+ parallelize_3d_tile_1d_with_uarch =
+ &pthreadpool_thread_parallelize_3d_tile_1d_with_uarch_fastpath;
+ }
+#endif
+ pthreadpool_parallelize(threadpool, parallelize_3d_tile_1d_with_uarch,
+ &params, sizeof(params), function, context,
+ tile_range, flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d_tile_1d_with_uarch)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread(
+ pthreadpool_t threadpool,
+ pthreadpool_task_3d_tile_1d_with_id_with_thread_t function, void* context,
+ uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i,
+ size_t range_j, size_t range_k, size_t tile_k, uint32_t flags) {
+ size_t threads_count;
+ if (threadpool == NULL ||
+ (threads_count = threadpool->threads_count.value) <= 1 ||
+ ((range_i | range_j) <= 1 && range_k <= tile_k)) {
+ /* No thread pool used: execute task sequentially on the calling thread */
+
+ uint32_t uarch_index = default_uarch_index;
+#if PTHREADPOOL_USE_CPUINFO
+ uarch_index =
+ cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
+ if (uarch_index > max_uarch_index) {
+ uarch_index = default_uarch_index;
+ }
+#endif
+
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ for (size_t i = 0; i < range_i; i++) {
+ for (size_t j = 0; j < range_j; j++) {
+ for (size_t k = 0; k < range_k; k += tile_k) {
+ function(context, uarch_index, 0, i, j, k, min(range_k - k, tile_k));
+ }
+ }
+ }
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const size_t tile_range_k = divide_round_up(range_k, 
tile_k);
+ const size_t tile_range = range_i * range_j * tile_range_k;
+ const struct pthreadpool_3d_tile_1d_with_uarch_params params = {
+ .default_uarch_index = default_uarch_index,
+ .max_uarch_index = max_uarch_index,
+ .range_k = range_k,
+ .tile_k = tile_k,
+ .range_j = fxdiv_init_size_t(range_j),
+ .tile_range_k = fxdiv_init_size_t(tile_range_k),
+ };
+ thread_function_t parallelize_3d_tile_1d_with_uarch_with_thread =
+ &thread_parallelize_3d_tile_1d_with_uarch_with_thread;
+#if PTHREADPOOL_USE_FASTPATH
+ const size_t range_threshold = -threads_count;
+ if (tile_range < range_threshold) {
+ parallelize_3d_tile_1d_with_uarch_with_thread =
+ &pthreadpool_thread_parallelize_3d_tile_1d_with_uarch_with_thread_fastpath;
+ }
+#endif
+ pthreadpool_parallelize(
+ threadpool, parallelize_3d_tile_1d_with_uarch_with_thread, &params,
+ sizeof(params), function, context, tile_range, flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_2d(
+ pthreadpool_t threadpool, pthreadpool_task_3d_tile_2d_t function,
+ void* context, size_t range_i, size_t range_j, size_t range_k,
+ size_t tile_j, size_t tile_k, uint32_t flags) {
+ size_t threads_count;
+ if (threadpool == NULL ||
+ (threads_count = threadpool->threads_count.value) <= 1 ||
+ (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) {
+ /* No thread pool used: execute task sequentially on the calling thread */
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ for (size_t i = 0; i < range_i; i++) {
+ for (size_t j = 0; j < range_j; j += tile_j) {
+ for (size_t k = 0; k < range_k; k += tile_k) {
+ function(context, i, j, k, min(range_j - j, tile_j),
+ min(range_k - k, tile_k));
+ }
+ }
+ }
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const size_t 
tile_range_j = divide_round_up(range_j, tile_j);
+ const size_t tile_range_k = divide_round_up(range_k, tile_k);
+ const size_t tile_range = range_i * tile_range_j * tile_range_k;
+ const struct pthreadpool_3d_tile_2d_params params = {
+ .range_j = range_j,
+ .tile_j = tile_j,
+ .range_k = range_k,
+ .tile_k = tile_k,
+ .tile_range_j = fxdiv_init_size_t(tile_range_j),
+ .tile_range_k = fxdiv_init_size_t(tile_range_k),
+ };
+ thread_function_t parallelize_3d_tile_2d = &thread_parallelize_3d_tile_2d;
+#if PTHREADPOOL_USE_FASTPATH
+ const size_t range_threshold = -threads_count;
+ if (tile_range < range_threshold) {
+ parallelize_3d_tile_2d =
+ &pthreadpool_thread_parallelize_3d_tile_2d_fastpath;
+ }
+#endif
+ pthreadpool_parallelize(threadpool, parallelize_3d_tile_2d, &params,
+ sizeof(params), function, context, tile_range,
+ flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d_tile_2d)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_2d_dynamic(
+ pthreadpool_t threadpool, pthreadpool_task_3d_tile_2d_dynamic_t function,
+ void* context, size_t range_i, size_t range_j, size_t range_k,
+ size_t tile_j, size_t tile_k, uint32_t flags) {
+ if (threadpool == NULL || threadpool->threads_count.value <= 1 ||
+ (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) {
+ /* No thread pool used: execute task sequentially on the calling thread */
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ if (range_k <= tile_k) {
+ for (size_t index_i = 0; index_i < range_i; index_i++) {
+ function(context, index_i, /*index_j=*/0, /*index_k=*/0, range_j,
+ range_k);
+ }
+ } else {
+ for (size_t index_i = 0; index_i < range_i; index_i++) {
+ for (size_t index_j = 0; index_j < range_j; index_j += tile_j) {
+ function(context, index_i, index_j, /*index_k=*/0,
+ min(tile_j, range_j - index_j), range_k);
+ }
+ }
+ }
+ if (flags & 
PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const size_t tile_range_j = divide_round_up(range_j, tile_j);
+ const size_t tile_range_k = divide_round_up(range_k, tile_k);
+ const size_t tile_range = range_i * tile_range_j * tile_range_k;
+ const struct pthreadpool_3d_tile_2d_dynamic_params params = {
+ .range_i = range_i,
+ .range_j = range_j,
+ .range_k = range_k,
+ .tile_j = tile_j,
+ .tile_k = tile_k,
+ };
+ pthreadpool_parallelize(threadpool, thread_parallelize_3d_tile_2d_dynamic,
+ &params, sizeof(params), function, context,
+ tile_range, flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d_tile_2d_dynamic)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch(
+ pthreadpool_t threadpool,
+ pthreadpool_task_3d_tile_2d_dynamic_with_id_t function, void* context,
+ uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i,
+ size_t range_j, size_t range_k, size_t tile_j, size_t tile_k,
+ uint32_t flags) {
+ if (threadpool == NULL || threadpool->threads_count.value <= 1 ||
+ (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) {
+ /* No thread pool used: execute task sequentially on the calling thread */
+ uint32_t uarch_index = default_uarch_index;
+#if PTHREADPOOL_USE_CPUINFO
+ uarch_index =
+ cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
+ if (uarch_index > max_uarch_index) {
+ uarch_index = default_uarch_index;
+ }
+#endif
+
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ if (range_k <= tile_k) {
+ for (size_t index_i = 0; index_i < range_i; index_i++) {
+ function(context, uarch_index, index_i, /*index_j=*/0, /*index_k=*/0,
+ range_j, range_k);
+ }
+ } else {
+ for (size_t index_i = 0; index_i < range_i; index_i++) {
+ for (size_t index_j = 0; index_j < range_j; index_j += tile_j) {
+ function(context, uarch_index, index_i, 
index_j, /*index_k=*/0,
+ min(tile_j, range_j - index_j), range_k);
+ }
+ }
+ }
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const size_t tile_range_j = divide_round_up(range_j, tile_j);
+ const size_t tile_range_k = divide_round_up(range_k, tile_k);
+ const size_t tile_range = range_i * tile_range_j * tile_range_k;
+ const struct pthreadpool_3d_tile_2d_dynamic_with_uarch_params params = {
+ .range_i = range_i,
+ .range_j = range_j,
+ .range_k = range_k,
+ .tile_j = tile_j,
+ .tile_k = tile_k,
+ .default_uarch_index = default_uarch_index,
+ .max_uarch_index = max_uarch_index,
+ };
+ pthreadpool_parallelize(
+ threadpool, thread_parallelize_3d_tile_2d_dynamic_with_uarch, &params,
+ sizeof(params), function, context, tile_range, flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_2d_with_uarch(
+ pthreadpool_t threadpool, pthreadpool_task_3d_tile_2d_with_id_t function,
+ void* context, uint32_t default_uarch_index, uint32_t max_uarch_index,
+ size_t range_i, size_t range_j, size_t range_k, size_t tile_j,
+ size_t tile_k, uint32_t flags) {
+ size_t threads_count;
+ if (threadpool == NULL ||
+ (threads_count = threadpool->threads_count.value) <= 1 ||
+ (range_i <= 1 && range_j <= tile_j && range_k <= tile_k)) {
+ /* No thread pool used: execute task sequentially on the calling thread */
+
+ uint32_t uarch_index = default_uarch_index;
+#if PTHREADPOOL_USE_CPUINFO
+ uarch_index =
+ cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
+ if (uarch_index > max_uarch_index) {
+ uarch_index = default_uarch_index;
+ }
+#endif
+
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ for (size_t i = 0; i < range_i; i++) {
+ for (size_t j = 0; j < range_j; j += tile_j) {
+ for (size_t k = 0; k < range_k; 
k += tile_k) {
+ function(context, uarch_index, i, j, k, min(range_j - j, tile_j),
+ min(range_k - k, tile_k));
+ }
+ }
+ }
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const size_t tile_range_j = divide_round_up(range_j, tile_j);
+ const size_t tile_range_k = divide_round_up(range_k, tile_k);
+ const size_t tile_range = range_i * tile_range_j * tile_range_k;
+ const struct pthreadpool_3d_tile_2d_with_uarch_params params = {
+ .default_uarch_index = default_uarch_index,
+ .max_uarch_index = max_uarch_index,
+ .range_j = range_j,
+ .tile_j = tile_j,
+ .range_k = range_k,
+ .tile_k = tile_k,
+ .tile_range_j = fxdiv_init_size_t(tile_range_j),
+ .tile_range_k = fxdiv_init_size_t(tile_range_k),
+ };
+ thread_function_t parallelize_3d_tile_2d_with_uarch =
+ &thread_parallelize_3d_tile_2d_with_uarch;
+#if PTHREADPOOL_USE_FASTPATH
+ const size_t range_threshold = -threads_count;
+ if (tile_range < range_threshold) {
+ parallelize_3d_tile_2d_with_uarch =
+ &pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath;
+ }
+#endif
+ pthreadpool_parallelize(threadpool, parallelize_3d_tile_2d_with_uarch,
+ &params, sizeof(params), function, context,
+ tile_range, flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d_tile_2d_with_uarch)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_4d(pthreadpool_t threadpool,
+ pthreadpool_task_4d_t function,
+ void* context, size_t range_i,
+ size_t range_j, size_t range_k,
+ size_t range_l,
+ uint32_t flags) {
+ size_t threads_count;
+ if (threadpool == NULL ||
+ (threads_count = threadpool->threads_count.value) <= 1 ||
+ (range_i | range_j | range_k | range_l) <= 1) {
+ /* No thread pool used: execute task sequentially on the calling thread */
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ for (size_t i = 0; i < range_i; i++) {
+ for (size_t j = 0; j < 
range_j; j++) {
+ for (size_t k = 0; k < range_k; k++) {
+ for (size_t l = 0; l < range_l; l++) {
+ function(context, i, j, k, l);
+ }
+ }
+ }
+ }
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const size_t range_kl = range_k * range_l;
+ const size_t range = range_i * range_j * range_kl;
+ const struct pthreadpool_4d_params params = {
+ .range_k = range_k,
+ .range_j = fxdiv_init_size_t(range_j),
+ .range_kl = fxdiv_init_size_t(range_kl),
+ .range_l = fxdiv_init_size_t(range_l),
+ };
+ thread_function_t parallelize_4d = &thread_parallelize_4d;
+#if PTHREADPOOL_USE_FASTPATH
+ const size_t range_threshold = -threads_count;
+ if (range < range_threshold) {
+ parallelize_4d = &pthreadpool_thread_parallelize_4d_fastpath;
+ }
+#endif
+ pthreadpool_parallelize(threadpool, parallelize_4d, &params, sizeof(params),
+ function, context, range, flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_4d)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_4d_tile_1d(
+ pthreadpool_t threadpool, pthreadpool_task_4d_tile_1d_t function,
+ void* context, size_t range_i, size_t range_j, size_t range_k,
+ size_t range_l, size_t tile_l, uint32_t flags) {
+ size_t threads_count;
+ if (threadpool == NULL ||
+ (threads_count = threadpool->threads_count.value) <= 1 ||
+ ((range_i | range_j | range_k) <= 1 && range_l <= tile_l)) {
+ /* No thread pool used: execute task sequentially on the calling thread */
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ for (size_t i = 0; i < range_i; i++) {
+ for (size_t j = 0; j < range_j; j++) {
+ for (size_t k = 0; k < range_k; k++) {
+ for (size_t l = 0; l < range_l; l += tile_l) {
+ function(context, i, j, k, l, min(range_l - l, tile_l));
+ }
+ }
+ }
+ }
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const size_t 
tile_range_l = divide_round_up(range_l, tile_l);
+ const size_t tile_range_kl = range_k * tile_range_l;
+ const size_t tile_range = range_i * range_j * tile_range_kl;
+ const struct pthreadpool_4d_tile_1d_params params = {
+ .range_k = range_k,
+ .range_l = range_l,
+ .tile_l = tile_l,
+ .range_j = fxdiv_init_size_t(range_j),
+ .tile_range_kl = fxdiv_init_size_t(tile_range_kl),
+ .tile_range_l = fxdiv_init_size_t(tile_range_l),
+ };
+ thread_function_t parallelize_4d_tile_1d = &thread_parallelize_4d_tile_1d;
+#if PTHREADPOOL_USE_FASTPATH
+ const size_t range_threshold = -threads_count;
+ if (tile_range < range_threshold) {
+ parallelize_4d_tile_1d =
+ &pthreadpool_thread_parallelize_4d_tile_1d_fastpath;
+ }
+#endif
+ pthreadpool_parallelize(threadpool, parallelize_4d_tile_1d, &params,
+ sizeof(params), function, context, tile_range,
+ flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_4d_tile_1d)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_4d_tile_2d(
+ pthreadpool_t threadpool, pthreadpool_task_4d_tile_2d_t function,
+ void* context, size_t range_i, size_t range_j, size_t range_k,
+ size_t range_l, size_t tile_k, size_t tile_l, uint32_t flags) {
+ size_t threads_count;
+ if (threadpool == NULL ||
+ (threads_count = threadpool->threads_count.value) <= 1 ||
+ ((range_i | range_j) <= 1 && range_k <= tile_k && range_l <= tile_l)) {
+ /* No thread pool used: execute task sequentially on the calling thread */
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ for (size_t i = 0; i < range_i; i++) {
+ for (size_t j = 0; j < range_j; j++) {
+ for (size_t k = 0; k < range_k; k += tile_k) {
+ for (size_t l = 0; l < range_l; l += tile_l) {
+ function(context, i, j, k, l, min(range_k - k, tile_k),
+ min(range_l - l, tile_l));
+ }
+ }
+ }
+ }
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const 
size_t tile_range_l = divide_round_up(range_l, tile_l);
+ const size_t tile_range_kl =
+ divide_round_up(range_k, tile_k) * tile_range_l;
+ const size_t tile_range = range_i * range_j * tile_range_kl;
+ const struct pthreadpool_4d_tile_2d_params params = {
+ .range_k = range_k,
+ .tile_k = tile_k,
+ .range_l = range_l,
+ .tile_l = tile_l,
+ .range_j = fxdiv_init_size_t(range_j),
+ .tile_range_kl = fxdiv_init_size_t(tile_range_kl),
+ .tile_range_l = fxdiv_init_size_t(tile_range_l),
+ };
+ thread_function_t parallelize_4d_tile_2d = &thread_parallelize_4d_tile_2d;
+#if PTHREADPOOL_USE_FASTPATH
+ const size_t range_threshold = -threads_count;
+ if (tile_range < range_threshold) {
+ parallelize_4d_tile_2d =
+ &pthreadpool_thread_parallelize_4d_tile_2d_fastpath;
+ }
+#endif
+ pthreadpool_parallelize(threadpool, parallelize_4d_tile_2d, &params,
+ sizeof(params), function, context, tile_range,
+ flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_4d_tile_2d)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_4d_tile_2d_with_uarch(
+ pthreadpool_t threadpool, pthreadpool_task_4d_tile_2d_with_id_t function,
+ void* context, uint32_t default_uarch_index, uint32_t max_uarch_index,
+ size_t range_i, size_t range_j, size_t range_k, size_t range_l,
+ size_t tile_k, size_t tile_l, uint32_t flags) {
+ size_t threads_count;
+ if (threadpool == NULL ||
+ (threads_count = threadpool->threads_count.value) <= 1 ||
+ ((range_i | range_j) <= 1 && range_k <= tile_k && range_l <= tile_l)) {
+ /* No thread pool used: execute task sequentially on the calling thread */
+
+ uint32_t uarch_index = default_uarch_index;
+#if PTHREADPOOL_USE_CPUINFO
+ uarch_index =
+ cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
+ if (uarch_index > max_uarch_index) {
+ uarch_index = default_uarch_index;
+ }
+#endif
+
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ for 
(size_t i = 0; i < range_i; i++) {
+ for (size_t j = 0; j < range_j; j++) {
+ for (size_t k = 0; k < range_k; k += tile_k) {
+ for (size_t l = 0; l < range_l; l += tile_l) {
+ function(context, uarch_index, i, j, k, l, min(range_k - k, tile_k),
+ min(range_l - l, tile_l));
+ }
+ }
+ }
+ }
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const size_t tile_range_l = divide_round_up(range_l, tile_l);
+ const size_t tile_range_kl =
+ divide_round_up(range_k, tile_k) * tile_range_l;
+ const size_t tile_range = range_i * range_j * tile_range_kl;
+ const struct pthreadpool_4d_tile_2d_with_uarch_params params = {
+ .default_uarch_index = default_uarch_index,
+ .max_uarch_index = max_uarch_index,
+ .range_k = range_k,
+ .tile_k = tile_k,
+ .range_l = range_l,
+ .tile_l = tile_l,
+ .range_j = fxdiv_init_size_t(range_j),
+ .tile_range_kl = fxdiv_init_size_t(tile_range_kl),
+ .tile_range_l = fxdiv_init_size_t(tile_range_l),
+ };
+ thread_function_t parallelize_4d_tile_2d_with_uarch =
+ &thread_parallelize_4d_tile_2d_with_uarch;
+#if PTHREADPOOL_USE_FASTPATH
+ const size_t range_threshold = -threads_count;
+ if (tile_range < range_threshold) {
+ parallelize_4d_tile_2d_with_uarch =
+ &pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath;
+ }
+#endif
+ pthreadpool_parallelize(threadpool, parallelize_4d_tile_2d_with_uarch,
+ &params, sizeof(params), function, context,
+ tile_range, flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_4d_tile_2d_with_uarch)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_4d_tile_2d_dynamic(
+ pthreadpool_t threadpool, pthreadpool_task_4d_tile_2d_dynamic_t function,
+ void* context, size_t range_i, size_t range_j, size_t range_k,
+ size_t range_l, size_t tile_k, size_t tile_l, uint32_t flags) {
+ if (threadpool == NULL || threadpool->threads_count.value <= 1 ||
+ (range_i <= 1 && range_j <= 1 && range_k <= tile_k &&
+ range_l <= tile_l)) {
+ /* No thread pool used: execute 
task sequentially on the calling thread */
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ if (range_l <= tile_l) {
+ for (size_t index_i = 0; index_i < range_i; index_i++) {
+ for (size_t index_j = 0; index_j < range_j; index_j++) {
+ function(context, index_i, index_j, /*index_k=*/0, /*index_l=*/0,
+ range_k, range_l);
+ }
+ }
+ } else {
+ for (size_t index_i = 0; index_i < range_i; index_i++) {
+ for (size_t index_j = 0; index_j < range_j; index_j++) {
+ for (size_t index_k = 0; index_k < range_k; index_k += tile_k) {
+ function(context, index_i, index_j, index_k, /*index_l=*/0,
+ min(tile_k, range_k - index_k), range_l);
+ }
+ }
+ }
+ }
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const size_t tile_range_k = divide_round_up(range_k, tile_k);
+ const size_t tile_range_l = divide_round_up(range_l, tile_l);
+ const size_t tile_range = range_i * range_j * tile_range_k * tile_range_l;
+ const struct pthreadpool_4d_tile_2d_dynamic_params params = {
+ .range_i = range_i,
+ .range_j = range_j,
+ .range_k = range_k,
+ .range_l = range_l,
+ .tile_k = tile_k,
+ .tile_l = tile_l,
+ };
+ pthreadpool_parallelize(threadpool, thread_parallelize_4d_tile_2d_dynamic,
+ &params, sizeof(params), function, context,
+ tile_range, flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_4d_tile_2d_dynamic)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch(
+ pthreadpool_t threadpool,
+ pthreadpool_task_4d_tile_2d_dynamic_with_id_t function, void* context,
+ uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i,
+ size_t range_j, size_t range_k, size_t range_l, size_t tile_k,
+ size_t tile_l, uint32_t flags) {
+ if (threadpool == NULL || threadpool->threads_count.value <= 1 ||
+ (range_i <= 1 && range_j <= 1 && range_k <= tile_k &&
+ range_l <= tile_l)) {
+ /* No 
thread pool used: execute task sequentially on the calling thread */
+ uint32_t uarch_index = default_uarch_index;
+#if PTHREADPOOL_USE_CPUINFO
+ uarch_index =
+ cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
+ if (uarch_index > max_uarch_index) {
+ uarch_index = default_uarch_index;
+ }
+#endif
+
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ if (range_l <= tile_l) {
+ for (size_t index_i = 0; index_i < range_i; index_i++) {
+ for (size_t index_j = 0; index_j < range_j; index_j++) {
+ function(context, uarch_index, index_i, index_j, /*index_k=*/0,
+ /*index_l=*/0, range_k, range_l);
+ }
+ }
+ } else {
+ for (size_t index_i = 0; index_i < range_i; index_i++) {
+ for (size_t index_j = 0; index_j < range_j; index_j++) {
+ for (size_t index_k = 0; index_k < range_k; index_k += tile_k) {
+ function(context, uarch_index, index_i, index_j, index_k,
+ /*index_l=*/0, min(tile_k, range_k - index_k), range_l);
+ }
+ }
+ }
+ }
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const size_t tile_range_k = divide_round_up(range_k, tile_k);
+ const size_t tile_range_l = divide_round_up(range_l, tile_l);
+ const size_t tile_range = range_i * range_j * tile_range_k * tile_range_l;
+ const struct pthreadpool_4d_tile_2d_dynamic_with_uarch_params params = {
+ .range_i = range_i,
+ .range_j = range_j,
+ .range_k = range_k,
+ .range_l = range_l,
+ .tile_k = tile_k,
+ .tile_l = tile_l,
+ .default_uarch_index = default_uarch_index,
+ .max_uarch_index = max_uarch_index,
+ };
+ pthreadpool_parallelize(
+ threadpool, thread_parallelize_4d_tile_2d_dynamic_with_uarch, &params,
+ sizeof(params), function, context, tile_range, flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_5d(pthreadpool_t threadpool,
+ 
pthreadpool_task_5d_t function,
+ void* context, size_t range_i,
+ size_t range_j, size_t range_k,
+ size_t range_l, size_t range_m,
+ uint32_t flags) {
+ size_t threads_count;
+ if (threadpool == NULL ||
+ (threads_count = threadpool->threads_count.value) <= 1 ||
+ (range_i | range_j | range_k | range_l | range_m) <= 1) {
+ /* No thread pool used: execute task sequentially on the calling thread */
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ for (size_t i = 0; i < range_i; i++) {
+ for (size_t j = 0; j < range_j; j++) {
+ for (size_t k = 0; k < range_k; k++) {
+ for (size_t l = 0; l < range_l; l++) {
+ for (size_t m = 0; m < range_m; m++) {
+ function(context, i, j, k, l, m);
+ }
+ }
+ }
+ }
+ }
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const size_t range_lm = range_l * range_m;
+ const size_t range = range_i * range_j * range_k * range_lm;
+ const struct pthreadpool_5d_params params = {
+ .range_l = range_l,
+ .range_j = fxdiv_init_size_t(range_j),
+ .range_k = fxdiv_init_size_t(range_k),
+ .range_lm = fxdiv_init_size_t(range_lm),
+ .range_m = fxdiv_init_size_t(range_m),
+ };
+ thread_function_t parallelize_5d = &thread_parallelize_5d;
+#if PTHREADPOOL_USE_FASTPATH
+ const size_t range_threshold = -threads_count;
+ if (range < range_threshold) {
+ parallelize_5d = &pthreadpool_thread_parallelize_5d_fastpath;
+ }
+#endif
+ pthreadpool_parallelize(threadpool, parallelize_5d, &params, sizeof(params),
+ function, context, range, flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_5d)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_5d_tile_1d(
+ pthreadpool_t threadpool, pthreadpool_task_5d_tile_1d_t function,
+ void* context, size_t range_i, size_t range_j, size_t range_k,
+ size_t range_l, size_t range_m, size_t tile_m, uint32_t flags) {
+ size_t threads_count;
+ if (threadpool 
== NULL ||
+ (threads_count = threadpool->threads_count.value) <= 1 ||
+ ((range_i | range_j | range_k | range_l) <= 1 && range_m <= tile_m)) {
+ /* No thread pool used: execute task sequentially on the calling thread */
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ for (size_t i = 0; i < range_i; i++) {
+ for (size_t j = 0; j < range_j; j++) {
+ for (size_t k = 0; k < range_k; k++) {
+ for (size_t l = 0; l < range_l; l++) {
+ for (size_t m = 0; m < range_m; m += tile_m) {
+ function(context, i, j, k, l, m, min(range_m - m, tile_m));
+ }
+ }
+ }
+ }
+ }
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const size_t tile_range_m = divide_round_up(range_m, tile_m);
+ const size_t range_kl = range_k * range_l;
+ const size_t tile_range = range_i * range_j * range_kl * tile_range_m;
+ const struct pthreadpool_5d_tile_1d_params params = {
+ .range_k = range_k,
+ .range_m = range_m,
+ .tile_m = tile_m,
+ .range_j = fxdiv_init_size_t(range_j),
+ .range_kl = fxdiv_init_size_t(range_kl),
+ .range_l = fxdiv_init_size_t(range_l),
+ .tile_range_m = fxdiv_init_size_t(tile_range_m),
+ };
+ thread_function_t parallelize_5d_tile_1d = &thread_parallelize_5d_tile_1d;
+#if PTHREADPOOL_USE_FASTPATH
+ const size_t range_threshold = -threads_count;
+ if (tile_range < range_threshold) {
+ parallelize_5d_tile_1d =
+ &pthreadpool_thread_parallelize_5d_tile_1d_fastpath;
+ }
+#endif
+ pthreadpool_parallelize(threadpool, parallelize_5d_tile_1d, &params,
+ sizeof(params), function, context, tile_range,
+ flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_5d_tile_1d)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_5d_tile_2d(
+ pthreadpool_t threadpool, pthreadpool_task_5d_tile_2d_t function,
+ void* context, size_t range_i, size_t range_j, size_t range_k,
+ size_t range_l, size_t range_m, size_t tile_l, size_t 
tile_m,
+ uint32_t flags) {
+ size_t threads_count;
+ if (threadpool == NULL ||
+ (threads_count = threadpool->threads_count.value) <= 1 ||
+ ((range_i | range_j | range_k) <= 1 && range_l <= tile_l &&
+ range_m <= tile_m)) {
+ /* No thread pool used: execute task sequentially on the calling thread */
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ for (size_t i = 0; i < range_i; i++) {
+ for (size_t j = 0; j < range_j; j++) {
+ for (size_t k = 0; k < range_k; k++) {
+ for (size_t l = 0; l < range_l; l += tile_l) {
+ for (size_t m = 0; m < range_m; m += tile_m) {
+ function(context, i, j, k, l, m, min(range_l - l, tile_l),
+ min(range_m - m, tile_m));
+ }
+ }
+ }
+ }
+ }
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const size_t tile_range_m = divide_round_up(range_m, tile_m);
+ const size_t tile_range_lm =
+ divide_round_up(range_l, tile_l) * tile_range_m;
+ const size_t tile_range = range_i * range_j * range_k * tile_range_lm;
+ const struct pthreadpool_5d_tile_2d_params params = {
+ .range_l = range_l,
+ .tile_l = tile_l,
+ .range_m = range_m,
+ .tile_m = tile_m,
+ .range_j = fxdiv_init_size_t(range_j),
+ .range_k = fxdiv_init_size_t(range_k),
+ .tile_range_lm = fxdiv_init_size_t(tile_range_lm),
+ .tile_range_m = fxdiv_init_size_t(tile_range_m),
+ };
+ thread_function_t parallelize_5d_tile_2d = &thread_parallelize_5d_tile_2d;
+#if PTHREADPOOL_USE_FASTPATH
+ const size_t range_threshold = -threads_count;
+ if (tile_range < range_threshold) {
+ parallelize_5d_tile_2d =
+ &pthreadpool_thread_parallelize_5d_tile_2d_fastpath;
+ }
+#endif
+ pthreadpool_parallelize(threadpool, parallelize_5d_tile_2d, &params,
+ sizeof(params), function, context, tile_range,
+ flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_5d_tile_2d)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_6d(
+ 
pthreadpool_t threadpool, pthreadpool_task_6d_t function, void* context,
+ size_t range_i, size_t range_j, size_t range_k, size_t range_l,
+ size_t range_m, size_t range_n, uint32_t flags) {
+ size_t threads_count;
+ if (threadpool == NULL ||
+ (threads_count = threadpool->threads_count.value) <= 1 ||
+ (range_i | range_j | range_k | range_l | range_m | range_n) <= 1) {
+ /* No thread pool used: execute task sequentially on the calling thread */
+ struct fpu_state saved_fpu_state = {0};
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ saved_fpu_state = get_fpu_state();
+ disable_fpu_denormals();
+ }
+ for (size_t i = 0; i < range_i; i++) {
+ for (size_t j = 0; j < range_j; j++) {
+ for (size_t k = 0; k < range_k; k++) {
+ for (size_t l = 0; l < range_l; l++) {
+ for (size_t m = 0; m < range_m; m++) {
+ for (size_t n = 0; n < range_n; n++) {
+ function(context, i, j, k, l, m, n);
+ }
+ }
+ }
+ }
+ }
+ }
+ if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) {
+ set_fpu_state(saved_fpu_state);
+ }
+ } else {
+ const size_t range_lmn = range_l * range_m * range_n;
+ const size_t range = range_i * range_j * range_k * range_lmn;
+ const struct pthreadpool_6d_params params = {
+ .range_l = range_l,
+ .range_j = fxdiv_init_size_t(range_j),
+ .range_k = fxdiv_init_size_t(range_k),
+ .range_lmn = fxdiv_init_size_t(range_lmn),
+ .range_m = fxdiv_init_size_t(range_m),
+ .range_n = fxdiv_init_size_t(range_n),
+ };
+ thread_function_t parallelize_6d = &thread_parallelize_6d;
+#if PTHREADPOOL_USE_FASTPATH
+ const size_t range_threshold = -threads_count;
+ if (range < range_threshold) {
+ parallelize_6d = &pthreadpool_thread_parallelize_6d_fastpath;
+ }
+#endif
+ pthreadpool_parallelize(threadpool, parallelize_6d, &params, sizeof(params),
+ function, context, range, flags);
+ }
+}
+
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_6d)
+
+PTHREADPOOL_WEAK void pthreadpool_parallelize_6d_tile_1d(
+ pthreadpool_t threadpool, pthreadpool_task_6d_tile_1d_t function,
+ void* context, 
size_t range_i, size_t range_j, size_t range_k, + size_t range_l, size_t range_m, size_t range_n, size_t tile_n, + uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || + ((range_i | range_j | range_k | range_l | range_m) <= 1 && + range_n <= tile_n)) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l++) { + for (size_t m = 0; m < range_m; m++) { + for (size_t n = 0; n < range_n; n += tile_n) { + function(context, i, j, k, l, m, n, min(range_n - n, tile_n)); + } + } + } + } + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_n = divide_round_up(range_n, tile_n); + const size_t tile_range_lmn = range_l * range_m * tile_range_n; + const size_t tile_range = range_i * range_j * range_k * tile_range_lmn; + const struct pthreadpool_6d_tile_1d_params params = { + .range_l = range_l, + .range_n = range_n, + .tile_n = tile_n, + .range_j = fxdiv_init_size_t(range_j), + .range_k = fxdiv_init_size_t(range_k), + .tile_range_lmn = fxdiv_init_size_t(tile_range_lmn), + .range_m = fxdiv_init_size_t(range_m), + .tile_range_n = fxdiv_init_size_t(tile_range_n), + }; + thread_function_t parallelize_6d_tile_1d = &thread_parallelize_6d_tile_1d; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (tile_range < range_threshold) { + parallelize_6d_tile_1d = + &pthreadpool_thread_parallelize_6d_tile_1d_fastpath; + } +#endif + pthreadpool_parallelize(threadpool, parallelize_6d_tile_1d, ¶ms, + sizeof(params), function, context, tile_range, + flags); + } +} + 
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_6d_tile_1d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_6d_tile_2d( + pthreadpool_t threadpool, pthreadpool_task_6d_tile_2d_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t range_l, size_t range_m, size_t range_n, size_t tile_m, + size_t tile_n, uint32_t flags) { + size_t threads_count; + if (threadpool == NULL || + (threads_count = threadpool->threads_count.value) <= 1 || + ((range_i | range_j | range_k | range_l) <= 1 && range_m <= tile_m && + range_n <= tile_n)) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l++) { + for (size_t m = 0; m < range_m; m += tile_m) { + for (size_t n = 0; n < range_n; n += tile_n) { + function(context, i, j, k, l, m, n, min(range_m - m, tile_m), + min(range_n - n, tile_n)); + } + } + } + } + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t range_kl = range_k * range_l; + const size_t tile_range_n = divide_round_up(range_n, tile_n); + const size_t tile_range_mn = + divide_round_up(range_m, tile_m) * tile_range_n; + const size_t tile_range = range_i * range_j * range_kl * tile_range_mn; + const struct pthreadpool_6d_tile_2d_params params = { + .range_k = range_k, + .range_m = range_m, + .tile_m = tile_m, + .range_n = range_n, + .tile_n = tile_n, + .range_j = fxdiv_init_size_t(range_j), + .range_kl = fxdiv_init_size_t(range_kl), + .range_l = fxdiv_init_size_t(range_l), + .tile_range_mn = fxdiv_init_size_t(tile_range_mn), + .tile_range_n = fxdiv_init_size_t(tile_range_n), + }; + thread_function_t parallelize_6d_tile_2d = 
&thread_parallelize_6d_tile_2d; +#if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (tile_range < range_threshold) { + parallelize_6d_tile_2d = + &pthreadpool_thread_parallelize_6d_tile_2d_fastpath; + } +#endif + pthreadpool_parallelize(threadpool, parallelize_6d_tile_2d, ¶ms, + sizeof(params), function, context, tile_range, + flags); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_6d_tile_2d) diff --git a/src/pthreads.c b/src/pthreads.c index cdead94..70291c6 100644 --- a/src/pthreads.c +++ b/src/pthreads.c @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +// Needed for syscall. +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif // _GNU_SOURCE + /* Standard C headers */ #include #include @@ -7,6 +21,7 @@ #include /* Configuration header */ +#include #include "threadpool-common.h" /* POSIX headers */ @@ -15,35 +30,38 @@ /* Futex-specific headers */ #if PTHREADPOOL_USE_FUTEX - #if defined(__linux__) - #include - #include - - /* Old Android NDKs do not define SYS_futex and FUTEX_PRIVATE_FLAG */ - #ifndef SYS_futex - #define SYS_futex __NR_futex - #endif - #ifndef FUTEX_PRIVATE_FLAG - #define FUTEX_PRIVATE_FLAG 128 - #endif - #elif defined(__EMSCRIPTEN__) - /* math.h for INFINITY constant */ - #include - - #include - #else - #error "Platform-specific implementation of futex_wait and futex_wake_all required" - #endif -#endif +#if defined(__linux__) +#include +#include + +/* Old Android NDKs do not define SYS_futex and FUTEX_PRIVATE_FLAG */ +#ifndef SYS_futex +#define SYS_futex __NR_futex +#endif // SYS_futex + +#ifndef FUTEX_PRIVATE_FLAG +#define FUTEX_PRIVATE_FLAG 128 +#endif // FUTEX_PRIVATE_FLAG + +#elif defined(__EMSCRIPTEN__) +/* math.h for INFINITY constant */ 
+#include +#include + +#else +#error \ + "Platform-specific implementation of futex_wait and futex_wake_all required" +#endif // defined(__linux__) +#endif // PTHREADPOOL_USE_FUTEX /* Windows-specific headers */ #ifdef _WIN32 - #include +#include #endif /* Dependencies */ #if PTHREADPOOL_USE_CPUINFO - #include +#include #endif /* Public library header */ @@ -54,408 +72,440 @@ #include "threadpool-object.h" #include "threadpool-utils.h" - #if PTHREADPOOL_USE_FUTEX - #if defined(__linux__) - static int futex_wait(pthreadpool_atomic_uint32_t* address, uint32_t value) { - return syscall(SYS_futex, address, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, value, NULL); - } - - static int futex_wake_all(pthreadpool_atomic_uint32_t* address) { - return syscall(SYS_futex, address, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, INT_MAX); - } - #elif defined(__EMSCRIPTEN__) - static int futex_wait(pthreadpool_atomic_uint32_t* address, uint32_t value) { - return emscripten_futex_wait((volatile void*) address, value, INFINITY); - } - - static int futex_wake_all(pthreadpool_atomic_uint32_t* address) { - return emscripten_futex_wake((volatile void*) address, INT_MAX); - } - #else - #error "Platform-specific implementation of futex_wait and futex_wake_all required" - #endif +#if defined(__linux__) +static int futex_wait(pthreadpool_atomic_uint32_t* address, uint32_t value) { + return syscall(SYS_futex, address, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, value, + NULL); +} + +static int futex_wake_all(pthreadpool_atomic_uint32_t* address) { + return syscall(SYS_futex, address, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, INT_MAX); +} +#elif defined(__EMSCRIPTEN__) +static int futex_wait(pthreadpool_atomic_uint32_t* address, uint32_t value) { + return emscripten_futex_wait((volatile void*)address, value, INFINITY); +} + +static int futex_wake_all(pthreadpool_atomic_uint32_t* address) { + return emscripten_futex_wake((volatile void*)address, INT_MAX); +} +#else +#error \ + "Platform-specific implementation of futex_wait and 
futex_wake_all required" +#endif #endif static void checkin_worker_thread(struct pthreadpool* threadpool) { - #if PTHREADPOOL_USE_FUTEX - if (pthreadpool_decrement_fetch_acquire_release_size_t(&threadpool->active_threads) == 0) { - pthreadpool_store_release_uint32_t(&threadpool->has_active_threads, 0); - futex_wake_all(&threadpool->has_active_threads); - } - #else - pthread_mutex_lock(&threadpool->completion_mutex); - if (pthreadpool_decrement_fetch_release_size_t(&threadpool->active_threads) == 0) { - pthread_cond_signal(&threadpool->completion_condvar); - } - pthread_mutex_unlock(&threadpool->completion_mutex); - #endif +#if PTHREADPOOL_USE_FUTEX + if (pthreadpool_decrement_fetch_acquire_release_size_t( + &threadpool->active_threads) == 0) { + pthreadpool_store_release_uint32_t(&threadpool->has_active_threads, 0); + futex_wake_all(&threadpool->has_active_threads); + } +#else + pthread_mutex_lock(&threadpool->completion_mutex); + if (pthreadpool_decrement_fetch_release_size_t(&threadpool->active_threads) == + 0) { + pthread_cond_signal(&threadpool->completion_condvar); + } + pthread_mutex_unlock(&threadpool->completion_mutex); +#endif } static void wait_worker_threads(struct pthreadpool* threadpool) { - /* Initial check */ - #if PTHREADPOOL_USE_FUTEX - uint32_t has_active_threads = pthreadpool_load_acquire_uint32_t(&threadpool->has_active_threads); - if (has_active_threads == 0) { - return; - } - #else - size_t active_threads = pthreadpool_load_acquire_size_t(&threadpool->active_threads); - if (active_threads == 0) { - return; - } - #endif - - /* Spin-wait */ - for (uint32_t i = PTHREADPOOL_SPIN_WAIT_ITERATIONS; i != 0; i--) { - pthreadpool_yield(); - - #if PTHREADPOOL_USE_FUTEX - has_active_threads = pthreadpool_load_acquire_uint32_t(&threadpool->has_active_threads); - if (has_active_threads == 0) { - return; - } - #else - active_threads = pthreadpool_load_acquire_size_t(&threadpool->active_threads); - if (active_threads == 0) { - return; - } - #endif - } - - /* 
Fall-back to mutex/futex wait */ - #if PTHREADPOOL_USE_FUTEX - while ((has_active_threads = pthreadpool_load_acquire_uint32_t(&threadpool->has_active_threads)) != 0) { - futex_wait(&threadpool->has_active_threads, 1); - } - #else - pthread_mutex_lock(&threadpool->completion_mutex); - while (pthreadpool_load_acquire_size_t(&threadpool->active_threads) != 0) { - pthread_cond_wait(&threadpool->completion_condvar, &threadpool->completion_mutex); - }; - pthread_mutex_unlock(&threadpool->completion_mutex); - #endif +/* Initial check */ +#if PTHREADPOOL_USE_FUTEX + uint32_t has_active_threads = + pthreadpool_load_acquire_uint32_t(&threadpool->has_active_threads); + if (has_active_threads == 0) { + return; + } +#else + size_t active_threads = + pthreadpool_load_acquire_size_t(&threadpool->active_threads); + if (active_threads == 0) { + return; + } +#endif + + /* Spin-wait */ + for (uint32_t i = 0; i < PTHREADPOOL_SPIN_WAIT_ITERATIONS; i++) { + pthreadpool_yield(i); + +#if PTHREADPOOL_USE_FUTEX + has_active_threads = + pthreadpool_load_acquire_uint32_t(&threadpool->has_active_threads); + if (has_active_threads == 0) { + return; + } +#else + active_threads = + pthreadpool_load_acquire_size_t(&threadpool->active_threads); + if (active_threads == 0) { + return; + } +#endif + } + +/* Fall-back to mutex/futex wait */ +#if PTHREADPOOL_USE_FUTEX + while ((has_active_threads = pthreadpool_load_acquire_uint32_t( + &threadpool->has_active_threads)) != 0) { + futex_wait(&threadpool->has_active_threads, 1); + } +#else + pthread_mutex_lock(&threadpool->completion_mutex); + while (pthreadpool_load_acquire_size_t(&threadpool->active_threads) != 0) { + pthread_cond_wait(&threadpool->completion_condvar, + &threadpool->completion_mutex); + }; + pthread_mutex_unlock(&threadpool->completion_mutex); +#endif } -static uint32_t wait_for_new_command( - struct pthreadpool* threadpool, - uint32_t last_command, - uint32_t last_flags) -{ - uint32_t command = 
pthreadpool_load_acquire_uint32_t(&threadpool->command); - if (command != last_command) { - return command; - } - - if ((last_flags & PTHREADPOOL_FLAG_YIELD_WORKERS) == 0) { - /* Spin-wait loop */ - for (uint32_t i = PTHREADPOOL_SPIN_WAIT_ITERATIONS; i != 0; i--) { - pthreadpool_yield(); - - command = pthreadpool_load_acquire_uint32_t(&threadpool->command); - if (command != last_command) { - return command; - } - } - } - - /* Spin-wait disabled or timed out, fall back to mutex/futex wait */ - #if PTHREADPOOL_USE_FUTEX - do { - futex_wait(&threadpool->command, last_command); - command = pthreadpool_load_acquire_uint32_t(&threadpool->command); - } while (command == last_command); - #else - /* Lock the command mutex */ - pthread_mutex_lock(&threadpool->command_mutex); - /* Read the command */ - while ((command = pthreadpool_load_acquire_uint32_t(&threadpool->command)) == last_command) { - /* Wait for new command */ - pthread_cond_wait(&threadpool->command_condvar, &threadpool->command_mutex); - } - /* Read a new command */ - pthread_mutex_unlock(&threadpool->command_mutex); - #endif - return command; +static uint32_t wait_for_new_command(struct pthreadpool* threadpool, + uint32_t last_command, + uint32_t last_flags) { + uint32_t command = pthreadpool_load_acquire_uint32_t(&threadpool->command); + if (command != last_command) { + return command; + } + + if ((last_flags & PTHREADPOOL_FLAG_YIELD_WORKERS) == 0) { + /* Spin-wait loop */ + for (uint32_t i = 0; i < PTHREADPOOL_SPIN_WAIT_ITERATIONS; i++) { + pthreadpool_yield(i); + + command = pthreadpool_load_acquire_uint32_t(&threadpool->command); + if (command != last_command) { + return command; + } + } + } + +/* Spin-wait disabled or timed out, fall back to mutex/futex wait */ +#if PTHREADPOOL_USE_FUTEX + do { + futex_wait(&threadpool->command, last_command); + command = pthreadpool_load_acquire_uint32_t(&threadpool->command); + } while (command == last_command); +#else + /* Lock the command mutex */ + 
pthread_mutex_lock(&threadpool->command_mutex); + /* Read the command */ + while ((command = pthreadpool_load_acquire_uint32_t(&threadpool->command)) == + last_command) { + /* Wait for new command */ + pthread_cond_wait(&threadpool->command_condvar, &threadpool->command_mutex); + } + /* Read a new command */ + pthread_mutex_unlock(&threadpool->command_mutex); +#endif + return command; } static void* thread_main(void* arg) { - struct thread_info* thread = (struct thread_info*) arg; - struct pthreadpool* threadpool = thread->threadpool; - uint32_t last_command = threadpool_command_init; - struct fpu_state saved_fpu_state = { 0 }; - uint32_t flags = 0; - - /* Check in */ - checkin_worker_thread(threadpool); - - /* Monitor new commands and act accordingly */ - for (;;) { - uint32_t command = wait_for_new_command(threadpool, last_command, flags); - pthreadpool_fence_acquire(); - - flags = pthreadpool_load_relaxed_uint32_t(&threadpool->flags); - - /* Process command */ - switch (command & THREADPOOL_COMMAND_MASK) { - case threadpool_command_parallelize: - { - const thread_function_t thread_function = - (thread_function_t) pthreadpool_load_relaxed_void_p(&threadpool->thread_function); - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - - thread_function(threadpool, thread); - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - break; - } - case threadpool_command_shutdown: - /* Exit immediately: the master thread is waiting on pthread_join */ - return NULL; - case threadpool_command_init: - /* To inhibit compiler warning */ - break; - } - /* Notify the master thread that we finished processing */ - checkin_worker_thread(threadpool); - /* Update last command */ - last_command = command; - }; + struct thread_info* thread = (struct thread_info*)arg; + struct pthreadpool* threadpool = thread->threadpool; + uint32_t last_command = threadpool_command_init; + struct 
fpu_state saved_fpu_state = {0}; + uint32_t flags = 0; + + /* Check in */ + checkin_worker_thread(threadpool); + + /* Monitor new commands and act accordingly */ + for (;;) { + uint32_t command = wait_for_new_command(threadpool, last_command, flags); + pthreadpool_fence_acquire(); + + flags = pthreadpool_load_relaxed_uint32_t(&threadpool->flags); + + /* Process command */ + switch (command & THREADPOOL_COMMAND_MASK) { + case threadpool_command_parallelize: { + const thread_function_t thread_function = + (thread_function_t)pthreadpool_load_relaxed_void_p( + &threadpool->thread_function); + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + + thread_function(threadpool, thread); + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + break; + } + case threadpool_command_shutdown: + /* Exit immediately: the master thread is waiting on pthread_join */ + return NULL; + case threadpool_command_init: + /* To inhibit compiler warning */ + break; + } + /* Notify the master thread that we finished processing */ + checkin_worker_thread(threadpool); + /* Update last command */ + last_command = command; + }; } -struct pthreadpool* pthreadpool_create(size_t threads_count) { - #if PTHREADPOOL_USE_CPUINFO - if (!cpuinfo_initialize()) { - return NULL; - } - #endif - - if (threads_count == 0) { - #if PTHREADPOOL_USE_CPUINFO - threads_count = cpuinfo_get_processors_count(); - #elif defined(_SC_NPROCESSORS_ONLN) - threads_count = (size_t) sysconf(_SC_NPROCESSORS_ONLN); - #if defined(__EMSCRIPTEN_PTHREADS__) - /* Limit the number of threads to 8 to match link-time PTHREAD_POOL_SIZE option */ - if (threads_count >= 8) { - threads_count = 8; - } - #endif - #elif defined(_WIN32) - SYSTEM_INFO system_info; - ZeroMemory(&system_info, sizeof(system_info)); - GetSystemInfo(&system_info); - threads_count = (size_t) system_info.dwNumberOfProcessors; - #else - #error "Platform-specific 
implementation of sysconf(_SC_NPROCESSORS_ONLN) required" - #endif - } - - struct pthreadpool* threadpool = pthreadpool_allocate(threads_count); - if (threadpool == NULL) { - return NULL; - } - threadpool->threads_count = fxdiv_init_size_t(threads_count); - for (size_t tid = 0; tid < threads_count; tid++) { - threadpool->threads[tid].thread_number = tid; - threadpool->threads[tid].threadpool = threadpool; - } - - /* Thread pool with a single thread computes everything on the caller thread. */ - if (threads_count > 1) { - pthread_mutex_init(&threadpool->execution_mutex, NULL); - #if !PTHREADPOOL_USE_FUTEX - pthread_mutex_init(&threadpool->completion_mutex, NULL); - pthread_cond_init(&threadpool->completion_condvar, NULL); - pthread_mutex_init(&threadpool->command_mutex, NULL); - pthread_cond_init(&threadpool->command_condvar, NULL); - #endif - - #if PTHREADPOOL_USE_FUTEX - pthreadpool_store_relaxed_uint32_t(&threadpool->has_active_threads, 1); - #endif - pthreadpool_store_relaxed_size_t(&threadpool->active_threads, threads_count - 1 /* caller thread */); - - /* Caller thread serves as worker #0. Thus, we create system threads starting with worker #1. 
*/ - for (size_t tid = 1; tid < threads_count; tid++) { - pthread_create(&threadpool->threads[tid].thread_object, NULL, &thread_main, &threadpool->threads[tid]); - } - - /* Wait until all threads initialize */ - wait_worker_threads(threadpool); - } - return threadpool; +PTHREADPOOL_WEAK struct pthreadpool* pthreadpool_create(size_t threads_count) { +#if PTHREADPOOL_USE_CPUINFO + if (!cpuinfo_initialize()) { + return NULL; + } +#endif + + if (threads_count == 0) { +#if PTHREADPOOL_USE_CPUINFO + threads_count = cpuinfo_get_processors_count(); +#elif defined(_SC_NPROCESSORS_ONLN) + threads_count = (size_t)sysconf(_SC_NPROCESSORS_ONLN); +#if defined(__EMSCRIPTEN_PTHREADS__) + /* Limit the number of threads to 8 to match link-time PTHREAD_POOL_SIZE + * option */ + if (threads_count >= 8) { + threads_count = 8; + } +#endif +#elif defined(_WIN32) + SYSTEM_INFO system_info; + ZeroMemory(&system_info, sizeof(system_info)); + GetSystemInfo(&system_info); + threads_count = (size_t)system_info.dwNumberOfProcessors; +#else +#error \ + "Platform-specific implementation of sysconf(_SC_NPROCESSORS_ONLN) required" +#endif + } + + struct pthreadpool* threadpool = pthreadpool_allocate(threads_count); + if (threadpool == NULL) { + return NULL; + } + threadpool->threads_count = fxdiv_init_size_t(threads_count); + for (size_t tid = 0; tid < threads_count; tid++) { + threadpool->threads[tid].thread_number = tid; + threadpool->threads[tid].threadpool = threadpool; + } + + /* Thread pool with a single thread computes everything on the caller thread. 
+ */ + if (threads_count > 1) { + pthread_mutex_init(&threadpool->execution_mutex, NULL); +#if !PTHREADPOOL_USE_FUTEX + pthread_mutex_init(&threadpool->completion_mutex, NULL); + pthread_cond_init(&threadpool->completion_condvar, NULL); + pthread_mutex_init(&threadpool->command_mutex, NULL); + pthread_cond_init(&threadpool->command_condvar, NULL); +#endif + +#if PTHREADPOOL_USE_FUTEX + pthreadpool_store_relaxed_uint32_t(&threadpool->has_active_threads, 1); +#endif + pthreadpool_store_relaxed_size_t(&threadpool->active_threads, + threads_count - 1 /* caller thread */); + + /* Caller thread serves as worker #0. Thus, we create system threads + * starting with worker #1. */ + for (size_t tid = 1; tid < threads_count; tid++) { + pthread_create(&threadpool->threads[tid].thread_object, NULL, + &thread_main, &threadpool->threads[tid]); + } + + /* Wait until all threads initialize */ + wait_worker_threads(threadpool); + } + return threadpool; } +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_create) + PTHREADPOOL_INTERNAL void pthreadpool_parallelize( - struct pthreadpool* threadpool, - thread_function_t thread_function, - const void* params, - size_t params_size, - void* task, - void* context, - size_t linear_range, - uint32_t flags) -{ - assert(threadpool != NULL); - assert(thread_function != NULL); - assert(task != NULL); - assert(linear_range > 1); - - /* Protect the global threadpool structures */ - pthread_mutex_lock(&threadpool->execution_mutex); - - #if !PTHREADPOOL_USE_FUTEX - /* Lock the command variables to ensure that threads don't start processing before they observe complete command with all arguments */ - pthread_mutex_lock(&threadpool->command_mutex); - #endif - - /* Setup global arguments */ - pthreadpool_store_relaxed_void_p(&threadpool->thread_function, (void*) thread_function); - pthreadpool_store_relaxed_void_p(&threadpool->task, task); - pthreadpool_store_relaxed_void_p(&threadpool->argument, context); - pthreadpool_store_relaxed_uint32_t(&threadpool->flags, 
flags); - - /* Locking of completion_mutex not needed: readers are sleeping on command_condvar */ - const struct fxdiv_divisor_size_t threads_count = threadpool->threads_count; - pthreadpool_store_relaxed_size_t(&threadpool->active_threads, threads_count.value - 1 /* caller thread */); - #if PTHREADPOOL_USE_FUTEX - pthreadpool_store_relaxed_uint32_t(&threadpool->has_active_threads, 1); - #endif - - if (params_size != 0) { - memcpy(&threadpool->params, params, params_size); - pthreadpool_fence_release(); - } - - /* Spread the work between threads */ - const struct fxdiv_result_size_t range_params = fxdiv_divide_size_t(linear_range, threads_count); - size_t range_start = 0; - for (size_t tid = 0; tid < threads_count.value; tid++) { - struct thread_info* thread = &threadpool->threads[tid]; - const size_t range_length = range_params.quotient + (size_t) (tid < range_params.remainder); - const size_t range_end = range_start + range_length; - pthreadpool_store_relaxed_size_t(&thread->range_start, range_start); - pthreadpool_store_relaxed_size_t(&thread->range_end, range_end); - pthreadpool_store_relaxed_size_t(&thread->range_length, range_length); - - /* The next subrange starts where the previous ended */ - range_start = range_end; - } - - /* - * Update the threadpool command. - * Imporantly, do it after initializing command parameters (range, task, argument, flags) - * ~(threadpool->command | THREADPOOL_COMMAND_MASK) flips the bits not in command mask - * to ensure the unmasked command is different then the last command, because worker threads - * monitor for change in the unmasked command. - */ - const uint32_t old_command = pthreadpool_load_relaxed_uint32_t(&threadpool->command); - const uint32_t new_command = ~(old_command | THREADPOOL_COMMAND_MASK) | threadpool_command_parallelize; - - /* - * Store the command with release semantics to guarantee that if a worker thread observes - * the new command value, it also observes the updated command parameters. 
- * - * Note: release semantics is necessary even with a conditional variable, because the workers might - * be waiting in a spin-loop rather than the conditional variable. - */ - pthreadpool_store_release_uint32_t(&threadpool->command, new_command); - #if PTHREADPOOL_USE_FUTEX - /* Wake up the threads */ - futex_wake_all(&threadpool->command); - #else - /* Unlock the command variables before waking up the threads for better performance */ - pthread_mutex_unlock(&threadpool->command_mutex); - - /* Wake up the threads */ - pthread_cond_broadcast(&threadpool->command_condvar); - #endif - - /* Save and modify FPU denormals control, if needed */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - - /* Do computations as worker #0 */ - thread_function(threadpool, &threadpool->threads[0]); - - /* Restore FPU denormals control, if needed */ - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - - /* Wait until the threads finish computation */ - wait_worker_threads(threadpool); - - /* Make changes by other threads visible to this thread */ - pthreadpool_fence_acquire(); - - /* Unprotect the global threadpool structures */ - pthread_mutex_unlock(&threadpool->execution_mutex); + struct pthreadpool* threadpool, thread_function_t thread_function, + const void* params, size_t params_size, void* task, void* context, + size_t linear_range, uint32_t flags) { + assert(threadpool != NULL); + assert(thread_function != NULL); + assert(task != NULL); + assert(linear_range > 1); + + /* Protect the global threadpool structures */ + pthread_mutex_lock(&threadpool->execution_mutex); + +#if !PTHREADPOOL_USE_FUTEX + /* Lock the command variables to ensure that threads don't start processing + * before they observe complete command with all arguments */ + pthread_mutex_lock(&threadpool->command_mutex); +#endif + + /* Setup global arguments */ 
+ pthreadpool_store_relaxed_void_p(&threadpool->thread_function, + (void*)thread_function); + pthreadpool_store_relaxed_void_p(&threadpool->task, task); + pthreadpool_store_relaxed_void_p(&threadpool->argument, context); + pthreadpool_store_relaxed_uint32_t(&threadpool->flags, flags); + + /* Locking of completion_mutex not needed: readers are sleeping on + * command_condvar */ + const struct fxdiv_divisor_size_t threads_count = threadpool->threads_count; + pthreadpool_store_relaxed_size_t(&threadpool->active_threads, + threads_count.value - 1 /* caller thread */); +#if PTHREADPOOL_USE_FUTEX + pthreadpool_store_relaxed_uint32_t(&threadpool->has_active_threads, 1); +#endif + + if (params_size != 0) { + memcpy(&threadpool->params, params, params_size); + pthreadpool_fence_release(); + } + + /* Spread the work between threads */ + const struct fxdiv_result_size_t range_params = + fxdiv_divide_size_t(linear_range, threads_count); + size_t range_start = 0; + for (size_t tid = 0; tid < threads_count.value; tid++) { + struct thread_info* thread = &threadpool->threads[tid]; + const size_t range_length = + range_params.quotient + (size_t)(tid < range_params.remainder); + const size_t range_end = range_start + range_length; + pthreadpool_store_relaxed_size_t(&thread->range_start, range_start); + pthreadpool_store_relaxed_size_t(&thread->range_end, range_end); + pthreadpool_store_relaxed_size_t(&thread->range_length, range_length); + + /* The next subrange starts where the previous ended */ + range_start = range_end; + } + + /* + * Update the threadpool command. + * Imporantly, do it after initializing command parameters (range, task, + * argument, flags) + * ~(threadpool->command | THREADPOOL_COMMAND_MASK) flips the bits not in + * command mask to ensure the unmasked command is different then the last + * command, because worker threads monitor for change in the unmasked command. 
+ */ + const uint32_t old_command = + pthreadpool_load_relaxed_uint32_t(&threadpool->command); + const uint32_t new_command = + ~(old_command | THREADPOOL_COMMAND_MASK) | threadpool_command_parallelize; + + /* + * Store the command with release semantics to guarantee that if a worker + * thread observes the new command value, it also observes the updated command + * parameters. + * + * Note: release semantics is necessary even with a conditional variable, + * because the workers might be waiting in a spin-loop rather than the + * conditional variable. + */ + pthreadpool_store_release_uint32_t(&threadpool->command, new_command); +#if PTHREADPOOL_USE_FUTEX + /* Wake up the threads */ + futex_wake_all(&threadpool->command); +#else + /* Unlock the command variables before waking up the threads for better + * performance */ + pthread_mutex_unlock(&threadpool->command_mutex); + + /* Wake up the threads */ + pthread_cond_broadcast(&threadpool->command_condvar); +#endif + + /* Save and modify FPU denormals control, if needed */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + + /* Do computations as worker #0 */ + thread_function(threadpool, &threadpool->threads[0]); + + /* Restore FPU denormals control, if needed */ + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + + /* Wait until the threads finish computation */ + wait_worker_threads(threadpool); + + /* Make changes by other threads visible to this thread */ + pthreadpool_fence_acquire(); + + /* Unprotect the global threadpool structures */ + pthread_mutex_unlock(&threadpool->execution_mutex); } -void pthreadpool_destroy(struct pthreadpool* threadpool) { - if (threadpool != NULL) { - const size_t threads_count = threadpool->threads_count.value; - if (threads_count > 1) { - #if PTHREADPOOL_USE_FUTEX - pthreadpool_store_relaxed_size_t(&threadpool->active_threads, 
threads_count - 1 /* caller thread */); - pthreadpool_store_relaxed_uint32_t(&threadpool->has_active_threads, 1); - - /* - * Store the command with release semantics to guarantee that if a worker thread observes - * the new command value, it also observes the updated active_threads/has_active_threads values. - */ - pthreadpool_store_release_uint32_t(&threadpool->command, threadpool_command_shutdown); - - /* Wake up worker threads */ - futex_wake_all(&threadpool->command); - #else - /* Lock the command variable to ensure that threads don't shutdown until both command and active_threads are updated */ - pthread_mutex_lock(&threadpool->command_mutex); - - pthreadpool_store_relaxed_size_t(&threadpool->active_threads, threads_count - 1 /* caller thread */); - - /* - * Store the command with release semantics to guarantee that if a worker thread observes - * the new command value, it also observes the updated active_threads value. - * - * Note: the release fence inside pthread_mutex_unlock is insufficient, - * because the workers might be waiting in a spin-loop rather than the conditional variable. 
- */ - pthreadpool_store_release_uint32_t(&threadpool->command, threadpool_command_shutdown); - - /* Wake up worker threads */ - pthread_cond_broadcast(&threadpool->command_condvar); - - /* Commit the state changes and let workers start processing */ - pthread_mutex_unlock(&threadpool->command_mutex); - #endif - - /* Wait until all threads return */ - for (size_t thread = 1; thread < threads_count; thread++) { - pthread_join(threadpool->threads[thread].thread_object, NULL); - } - - /* Release resources */ - pthread_mutex_destroy(&threadpool->execution_mutex); - #if !PTHREADPOOL_USE_FUTEX - pthread_mutex_destroy(&threadpool->completion_mutex); - pthread_cond_destroy(&threadpool->completion_condvar); - pthread_mutex_destroy(&threadpool->command_mutex); - pthread_cond_destroy(&threadpool->command_condvar); - #endif - } - #if PTHREADPOOL_USE_CPUINFO - cpuinfo_deinitialize(); - #endif - pthreadpool_deallocate(threadpool); - } +PTHREADPOOL_WEAK void pthreadpool_destroy(struct pthreadpool* threadpool) { + if (threadpool != NULL) { + const size_t threads_count = threadpool->threads_count.value; + if (threads_count > 1) { +#if PTHREADPOOL_USE_FUTEX + pthreadpool_store_relaxed_size_t(&threadpool->active_threads, + threads_count - 1 /* caller thread */); + pthreadpool_store_relaxed_uint32_t(&threadpool->has_active_threads, 1); + + /* + * Store the command with release semantics to guarantee that if a worker + * thread observes the new command value, it also observes the updated + * active_threads/has_active_threads values. 
+ */ + pthreadpool_store_release_uint32_t(&threadpool->command, + threadpool_command_shutdown); + + /* Wake up worker threads */ + futex_wake_all(&threadpool->command); +#else + /* Lock the command variable to ensure that threads don't shutdown until + * both command and active_threads are updated */ + pthread_mutex_lock(&threadpool->command_mutex); + + pthreadpool_store_relaxed_size_t(&threadpool->active_threads, + threads_count - 1 /* caller thread */); + + /* + * Store the command with release semantics to guarantee that if a worker + * thread observes the new command value, it also observes the updated + * active_threads value. + * + * Note: the release fence inside pthread_mutex_unlock is insufficient, + * because the workers might be waiting in a spin-loop rather than the + * conditional variable. + */ + pthreadpool_store_release_uint32_t(&threadpool->command, + threadpool_command_shutdown); + + /* Wake up worker threads */ + pthread_cond_broadcast(&threadpool->command_condvar); + + /* Commit the state changes and let workers start processing */ + pthread_mutex_unlock(&threadpool->command_mutex); +#endif + + /* Wait until all threads return */ + for (size_t thread = 1; thread < threads_count; thread++) { + pthread_join(threadpool->threads[thread].thread_object, NULL); + } + + /* Release resources */ + pthread_mutex_destroy(&threadpool->execution_mutex); +#if !PTHREADPOOL_USE_FUTEX + pthread_mutex_destroy(&threadpool->completion_mutex); + pthread_cond_destroy(&threadpool->completion_condvar); + pthread_mutex_destroy(&threadpool->command_mutex); + pthread_cond_destroy(&threadpool->command_condvar); +#endif + } +#if PTHREADPOOL_USE_CPUINFO + cpuinfo_deinitialize(); +#endif + pthreadpool_deallocate(threadpool); + } } + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_destroy) diff --git a/src/shim.c b/src/shim.c index 36f7c00..c886448 100644 --- a/src/shim.c +++ b/src/shim.c @@ -1,596 +1,616 @@ +// Copyright (c) 2017 Facebook Inc. 
+// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + /* Standard C headers */ #include +#include /* Public library header */ #include /* Internal library headers */ +#include "threadpool-common.h" #include "threadpool-utils.h" +struct pthreadpool {}; -struct pthreadpool { -}; - -static const struct pthreadpool static_pthreadpool = { }; - +static const struct pthreadpool static_pthreadpool = {}; struct pthreadpool* pthreadpool_create(size_t threads_count) { - if (threads_count <= 1) { - return (struct pthreadpool*) &static_pthreadpool; - } + if (threads_count <= 1) { + return (struct pthreadpool*)&static_pthreadpool; + } - return NULL; + return NULL; } size_t pthreadpool_get_threads_count(struct pthreadpool* threadpool) { - return 1; -} - -void pthreadpool_parallelize_1d( - struct pthreadpool* threadpool, - pthreadpool_task_1d_t task, - void* argument, - size_t range, - uint32_t flags) -{ - for (size_t i = 0; i < range; i++) { - task(argument, i); - } -} - -void pthreadpool_parallelize_1d_with_thread( - struct pthreadpool* threadpool, - pthreadpool_task_1d_with_thread_t task, - void* argument, - size_t range, - uint32_t flags) -{ - for (size_t i = 0; i < range; i++) { - task(argument, 0, i); - } -} - -void pthreadpool_parallelize_1d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_1d_with_id_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range, - uint32_t flags) -{ - for (size_t i = 0; i < range; i++) { - task(argument, default_uarch_index, i); - } -} - -void pthreadpool_parallelize_1d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_1d_tile_1d_t task, - void* argument, - size_t range, - size_t tile, - uint32_t flags) -{ - for (size_t i = 0; i < range; i += tile) { - task(argument, i, min(range - i, 
tile)); - } -} - -void pthreadpool_parallelize_2d( - struct pthreadpool* threadpool, - pthreadpool_task_2d_t task, - void* argument, - size_t range_i, - size_t range_j, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - task(argument, i, j); - } - } -} - -void pthreadpool_parallelize_2d_with_thread( - struct pthreadpool* threadpool, - pthreadpool_task_2d_with_thread_t task, - void* argument, - size_t range_i, - size_t range_j, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - task(argument, 0, i, j); - } - } -} - -void pthreadpool_parallelize_2d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_2d_tile_1d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t tile_j, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j += tile_j) { - task(argument, i, j, min(range_j - j, tile_j)); - } - } -} - -void pthreadpool_parallelize_2d_tile_1d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_2d_tile_1d_with_id_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t tile_j, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j += tile_j) { - task(argument, default_uarch_index, i, j, min(range_j - j, tile_j)); - } - } -} - -void pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - pthreadpool_t threadpool, - pthreadpool_task_2d_tile_1d_with_id_with_thread_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t tile_j, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j += tile_j) { - task(argument, default_uarch_index, 0, i, j, min(range_j - j, tile_j)); - } - } -} - -void pthreadpool_parallelize_2d_tile_2d( - pthreadpool_t threadpool, - 
pthreadpool_task_2d_tile_2d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t tile_i, - size_t tile_j, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i += tile_i) { - for (size_t j = 0; j < range_j; j += tile_j) { - task(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)); - } - } -} - -void pthreadpool_parallelize_2d_tile_2d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_2d_tile_2d_with_id_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t tile_i, - size_t tile_j, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i += tile_i) { - for (size_t j = 0; j < range_j; j += tile_j) { - task(argument, default_uarch_index, i, j, - min(range_i - i, tile_i), min(range_j - j, tile_j)); - } - } -} - -void pthreadpool_parallelize_3d( - pthreadpool_t threadpool, - pthreadpool_task_3d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - task(argument, i, j, k); - } - } - } -} - -void pthreadpool_parallelize_3d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_1d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_k, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k += tile_k) { - task(argument, i, j, k, min(range_k - k, tile_k)); - } - } - } -} - -void pthreadpool_parallelize_3d_tile_1d_with_thread( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_1d_with_thread_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_k, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k += tile_k) { - task(argument, 
0, i, j, k, min(range_k - k, tile_k)); - } - } - } -} - -void pthreadpool_parallelize_3d_tile_1d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_1d_with_id_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_k, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k += tile_k) { - task(argument, default_uarch_index, i, j, k, min(range_k - k, tile_k)); - } - } - } -} - -void pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_1d_with_id_with_thread_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_k, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k += tile_k) { - task(argument, default_uarch_index, 0, i, j, k, min(range_k - k, tile_k)); - } - } - } -} - -void pthreadpool_parallelize_3d_tile_2d( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_2d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_j, - size_t tile_k, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j += tile_j) { - for (size_t k = 0; k < range_k; k += tile_k) { - task(argument, i, j, k, - min(range_j - j, tile_j), min(range_k - k, tile_k)); - } - } - } -} - -void pthreadpool_parallelize_3d_tile_2d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_3d_tile_2d_with_id_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t range_k, - size_t tile_j, - size_t tile_k, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j += tile_j) { - 
for (size_t k = 0; k < range_k; k += tile_k) { - task(argument, default_uarch_index, i, j, k, - min(range_j - j, tile_j), min(range_k - k, tile_k)); - } - } - } -} - -void pthreadpool_parallelize_4d( - pthreadpool_t threadpool, - pthreadpool_task_4d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l++) { - task(argument, i, j, k, l); - } - } - } - } -} - -void pthreadpool_parallelize_4d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_4d_tile_1d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t tile_l, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l += tile_l) { - task(argument, i, j, k, l, min(range_l - l, tile_l)); - } - } - } - } -} - -void pthreadpool_parallelize_4d_tile_2d( - pthreadpool_t threadpool, - pthreadpool_task_4d_tile_2d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t tile_k, - size_t tile_l, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k += tile_k) { - for (size_t l = 0; l < range_l; l += tile_l) { - task(argument, i, j, k, l, - min(range_k - k, tile_k), min(range_l - l, tile_l)); - } - } - } - } -} - -void pthreadpool_parallelize_4d_tile_2d_with_uarch( - pthreadpool_t threadpool, - pthreadpool_task_4d_tile_2d_with_id_t task, - void* argument, - uint32_t default_uarch_index, - uint32_t max_uarch_index, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t tile_k, - size_t tile_l, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 
0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k += tile_k) { - for (size_t l = 0; l < range_l; l += tile_l) { - task(argument, default_uarch_index, i, j, k, l, - min(range_k - k, tile_k), min(range_l - l, tile_l)); - } - } - } - } -} - -void pthreadpool_parallelize_5d( - pthreadpool_t threadpool, - pthreadpool_task_5d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l++) { - for (size_t m = 0; m < range_m; m++) { - task(argument, i, j, k, l, m); - } - } - } - } - } -} - -void pthreadpool_parallelize_5d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_5d_tile_1d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t tile_m, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l++) { - for (size_t m = 0; m < range_m; m += tile_m) { - task(argument, i, j, k, l, m, min(range_m - m, tile_m)); - } - } - } - } - } -} - -void pthreadpool_parallelize_5d_tile_2d( - pthreadpool_t threadpool, - pthreadpool_task_5d_tile_2d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t tile_l, - size_t tile_m, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l += tile_l) { - for (size_t m = 0; m < range_m; m += tile_m) { - task(argument, i, j, k, l, m, - min(range_l - l, tile_l), min(range_m - m, tile_m)); - } - } - } - } - } -} - -void pthreadpool_parallelize_6d( - pthreadpool_t threadpool, - pthreadpool_task_6d_t task, - void* argument, - 
size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t range_n, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l++) { - for (size_t m = 0; m < range_m; m++) { - for (size_t n = 0; n < range_n; n++) { - task(argument, i, j, k, l, m, n); - } - } - } - } - } - } -} - -void pthreadpool_parallelize_6d_tile_1d( - pthreadpool_t threadpool, - pthreadpool_task_6d_tile_1d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t range_n, - size_t tile_n, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l++) { - for (size_t m = 0; m < range_m; m++) { - for (size_t n = 0; n < range_n; n += tile_n) { - task(argument, i, j, k, l, m, n, min(range_n - n, tile_n)); - } - } - } - } - } - } -} - -void pthreadpool_parallelize_6d_tile_2d( - pthreadpool_t threadpool, - pthreadpool_task_6d_tile_2d_t task, - void* argument, - size_t range_i, - size_t range_j, - size_t range_k, - size_t range_l, - size_t range_m, - size_t range_n, - size_t tile_m, - size_t tile_n, - uint32_t flags) -{ - for (size_t i = 0; i < range_i; i++) { - for (size_t j = 0; j < range_j; j++) { - for (size_t k = 0; k < range_k; k++) { - for (size_t l = 0; l < range_l; l++) { - for (size_t m = 0; m < range_m; m += tile_m) { - for (size_t n = 0; n < range_n; n += tile_n) { - task(argument, i, j, k, l, m, n, - min(range_m - m, tile_m), min(range_n - n, tile_n)); - } - } - } - } - } - } -} - -void pthreadpool_destroy(struct pthreadpool* threadpool) { + return 1; +} + +PTHREADPOOL_WEAK void pthreadpool_parallelize_1d(struct pthreadpool* threadpool, + pthreadpool_task_1d_t function, + void* context, size_t range, + uint32_t flags) { + for (size_t i = 0; i < 
range; i++) { + function(context, i); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_1d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_1d_with_thread( + struct pthreadpool* threadpool, pthreadpool_task_1d_with_thread_t function, + void* context, size_t range, uint32_t flags) { + for (size_t i = 0; i < range; i++) { + function(context, 0, i); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_1d_with_thread) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_1d_with_uarch( + pthreadpool_t threadpool, pthreadpool_task_1d_with_id_t function, + void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, + size_t range, uint32_t flags) { + for (size_t i = 0; i < range; i++) { + function(context, default_uarch_index, i); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_1d_with_uarch) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_1d_tile_1d( + pthreadpool_t threadpool, pthreadpool_task_1d_tile_1d_t function, + void* context, size_t range, size_t tile, uint32_t flags) { + for (size_t i = 0; i < range; i += tile) { + function(context, i, min(range - i, tile)); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_1d_tile_1d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_1d_tile_1d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_1d_tile_1d_dynamic_t function, + void* context, size_t range, size_t tile, uint32_t flags) { + function(context, 0, range); +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_1d_tile_1d_dynamic) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d(struct pthreadpool* threadpool, + pthreadpool_task_2d_t function, + void* context, size_t range_i, + size_t range_j, + uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + function(context, i, j); + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_with_thread( + struct pthreadpool* threadpool, pthreadpool_task_2d_with_thread_t 
function, + void* context, size_t range_i, size_t range_j, uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + function(context, 0, i, j); + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_with_thread) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_1d( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_1d_t function, + void* context, size_t range_i, size_t range_j, size_t tile_j, + uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j += tile_j) { + function(context, i, j, min(range_j - j, tile_j)); + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_1d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_1d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_1d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t tile_j, + uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + function(context, i, 0, range_j); + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_1d_dynamic) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_1d_with_uarch( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_1d_with_id_t function, + void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, + size_t range_i, size_t range_j, size_t tile_j, uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j += tile_j) { + function(context, default_uarch_index, i, j, min(range_j - j, tile_j)); + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_1d_with_uarch) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + pthreadpool_t threadpool, + pthreadpool_task_2d_tile_1d_with_id_with_thread_t function, void* context, + uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i, + size_t range_j, size_t tile_j, uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j 
< range_j; j += tile_j) { + function(context, default_uarch_index, 0, i, j, min(range_j - j, tile_j)); + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_2d( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_t function, + void* context, size_t range_i, size_t range_j, size_t tile_i, size_t tile_j, + uint32_t flags) { + for (size_t i = 0; i < range_i; i += tile_i) { + for (size_t j = 0; j < range_j; j += tile_j) { + function(context, i, j, min(range_i - i, tile_i), + min(range_j - j, tile_j)); + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_2d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_2d_with_uarch( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_with_id_t function, + void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, + size_t range_i, size_t range_j, size_t tile_i, size_t tile_j, + uint32_t flags) { + for (size_t i = 0; i < range_i; i += tile_i) { + for (size_t j = 0; j < range_j; j += tile_j) { + function(context, default_uarch_index, i, j, min(range_i - i, tile_i), + min(range_j - j, tile_j)); + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_2d_with_uarch) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_2d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t tile_i, size_t tile_j, + uint32_t flags) { + if (range_j <= tile_j) { + function(context, /*index_i=*/0, /*index_j=*/0, range_i, range_j); + } else { + for (size_t index_i = 0; index_i < range_i; index_i += tile_i) { + function(context, index_i, /*index_j=*/0, min(tile_i, range_i - index_i), + range_j); + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_2d_dynamic) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_2d_tile_2d_with_uarch_dynamic( + pthreadpool_t threadpool, + 
pthreadpool_task_2d_tile_2d_dynamic_with_id_t function, void* context, + uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i, + size_t range_j, size_t tile_i, size_t tile_j, uint32_t flags) { + if (range_j <= tile_j) { + function(context, default_uarch_index, /*index_i=*/0, /*index_j=*/0, + range_i, range_j); + } else { + for (size_t index_i = 0; index_i < range_i; index_i += tile_i) { + function(context, default_uarch_index, index_i, /*index_j=*/0, + min(tile_i, range_i - index_i), range_j); + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_2d_tile_2d_with_uarch_dynamic) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_3d(pthreadpool_t threadpool, + pthreadpool_task_3d_t function, + void* context, size_t range_i, + size_t range_j, size_t range_k, + uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + function(context, i, j, k); + } + } + } } + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_1d( + pthreadpool_t threadpool, pthreadpool_task_3d_tile_1d_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t tile_k, uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k += tile_k) { + function(context, i, j, k, min(range_k - k, tile_k)); + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d_tile_1d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_1d_with_thread( + pthreadpool_t threadpool, + pthreadpool_task_3d_tile_1d_with_thread_t function, void* context, + size_t range_i, size_t range_j, size_t range_k, size_t tile_k, + uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k += tile_k) { + function(context, 0, i, j, k, min(range_k - k, tile_k)); + } + } + } +} + 
+PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d_tile_1d_with_thread) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_1d_with_uarch( + pthreadpool_t threadpool, pthreadpool_task_3d_tile_1d_with_id_t function, + void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, + size_t range_i, size_t range_j, size_t range_k, size_t tile_k, + uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k += tile_k) { + function(context, default_uarch_index, i, j, k, + min(range_k - k, tile_k)); + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d_tile_1d_with_uarch) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + pthreadpool_t threadpool, + pthreadpool_task_3d_tile_1d_with_id_with_thread_t function, void* context, + uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i, + size_t range_j, size_t range_k, size_t tile_k, uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k += tile_k) { + function(context, default_uarch_index, 0, i, j, k, + min(range_k - k, tile_k)); + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_2d( + pthreadpool_t threadpool, pthreadpool_task_3d_tile_2d_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t tile_j, size_t tile_k, uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j += tile_j) { + for (size_t k = 0; k < range_k; k += tile_k) { + function(context, i, j, k, min(range_j - j, tile_j), + min(range_k - k, tile_k)); + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d_tile_2d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_2d_with_uarch( + pthreadpool_t threadpool, 
pthreadpool_task_3d_tile_2d_with_id_t function, + void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, + size_t range_i, size_t range_j, size_t range_k, size_t tile_j, + size_t tile_k, uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j += tile_j) { + for (size_t k = 0; k < range_k; k += tile_k) { + function(context, default_uarch_index, i, j, k, + min(range_j - j, tile_j), min(range_k - k, tile_k)); + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d_tile_2d_with_uarch) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_3d_tile_2d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_3d_tile_2d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t tile_j, size_t tile_k, uint32_t flags) { + if (range_k <= tile_k) { + for (size_t index_i = 0; index_i < range_i; index_i++) { + function(context, index_i, /*index_j=*/0, /*index_k=*/0, range_j, + range_k); + } + } else { + for (size_t index_i = 0; index_i < range_i; index_i++) { + for (size_t index_j = 0; index_j < range_j; index_j += tile_j) { + function(context, index_i, index_j, /*index_k=*/0, + min(tile_j, range_j - index_j), range_k); + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_3d_tile_2d_dynamic) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_4d(pthreadpool_t threadpool, + pthreadpool_task_4d_t function, + void* context, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, + uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l++) { + function(context, i, j, k, l); + } + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_4d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_4d_tile_1d( + pthreadpool_t threadpool, pthreadpool_task_4d_tile_1d_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t 
range_l, size_t tile_l, uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l += tile_l) { + function(context, i, j, k, l, min(range_l - l, tile_l)); + } + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_4d_tile_1d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_4d_tile_2d( + pthreadpool_t threadpool, pthreadpool_task_4d_tile_2d_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t range_l, size_t tile_k, size_t tile_l, uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k += tile_k) { + for (size_t l = 0; l < range_l; l += tile_l) { + function(context, i, j, k, l, min(range_k - k, tile_k), + min(range_l - l, tile_l)); + } + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_4d_tile_2d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_4d_tile_2d_with_uarch( + pthreadpool_t threadpool, pthreadpool_task_4d_tile_2d_with_id_t function, + void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, + size_t range_i, size_t range_j, size_t range_k, size_t range_l, + size_t tile_k, size_t tile_l, uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k += tile_k) { + for (size_t l = 0; l < range_l; l += tile_l) { + function(context, default_uarch_index, i, j, k, l, + min(range_k - k, tile_k), min(range_l - l, tile_l)); + } + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_4d_tile_2d_with_uarch) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_4d_tile_2d_dynamic( + pthreadpool_t threadpool, pthreadpool_task_4d_tile_2d_dynamic_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t range_l, size_t tile_k, size_t tile_l, uint32_t flags) { + if (range_l <= tile_l) { + for 
(size_t index_i = 0; index_i < range_i; index_i++) { + for (size_t index_j = 0; index_j < range_j; index_j++) { + function(context, index_i, index_j, /*index_k=*/0, /*index_l=*/0, + range_k, range_l); + } + } + } else { + for (size_t index_i = 0; index_i < range_i; index_i++) { + for (size_t index_j = 0; index_j < range_j; index_j++) { + for (size_t index_k = 0; index_k < range_k; index_k += tile_k) { + function(context, index_i, index_j, index_k, /*index_l=*/0, + min(tile_k, range_k - index_k), range_l); + } + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_4d_tile_2d_dynamic) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + pthreadpool_t threadpool, + pthreadpool_task_4d_tile_2d_dynamic_with_id_t function, void* context, + uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i, + size_t range_j, size_t range_k, size_t range_l, size_t tile_k, + size_t tile_l, uint32_t flags) { + if (range_l <= tile_l) { + for (size_t index_i = 0; index_i < range_i; index_i++) { + for (size_t index_j = 0; index_j < range_j; index_j++) { + function(context, default_uarch_index, index_i, index_j, /*index_k=*/0, + /*index_l=*/0, range_k, range_l); + } + } + } else { + for (size_t index_i = 0; index_i < range_i; index_i++) { + for (size_t index_j = 0; index_j < range_j; index_j++) { + for (size_t index_k = 0; index_k < range_k; index_k += tile_k) { + function(context, default_uarch_index, index_i, index_j, index_k, + /*index_l=*/0, min(tile_k, range_k - index_k), range_l); + } + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_5d(pthreadpool_t threadpool, + pthreadpool_task_5d_t function, + void* context, size_t range_i, + size_t range_j, size_t range_k, + size_t range_l, size_t range_m, + uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) 
{ + for (size_t l = 0; l < range_l; l++) { + for (size_t m = 0; m < range_m; m++) { + function(context, i, j, k, l, m); + } + } + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_5d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_5d_tile_1d( + pthreadpool_t threadpool, pthreadpool_task_5d_tile_1d_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t range_l, size_t range_m, size_t tile_m, uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l++) { + for (size_t m = 0; m < range_m; m += tile_m) { + function(context, i, j, k, l, m, min(range_m - m, tile_m)); + } + } + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_5d_tile_1d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_5d_tile_2d( + pthreadpool_t threadpool, pthreadpool_task_5d_tile_2d_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t range_l, size_t range_m, size_t tile_l, size_t tile_m, + uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l += tile_l) { + for (size_t m = 0; m < range_m; m += tile_m) { + function(context, i, j, k, l, m, min(range_l - l, tile_l), + min(range_m - m, tile_m)); + } + } + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_5d_tile_2d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_6d( + pthreadpool_t threadpool, pthreadpool_task_6d_t function, void* context, + size_t range_i, size_t range_j, size_t range_k, size_t range_l, + size_t range_m, size_t range_n, uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l++) { + for (size_t m = 0; m < range_m; m++) { + for (size_t n = 0; n < range_n; n++) { + 
function(context, i, j, k, l, m, n); + } + } + } + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_6d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_6d_tile_1d( + pthreadpool_t threadpool, pthreadpool_task_6d_tile_1d_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t range_l, size_t range_m, size_t range_n, size_t tile_n, + uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l++) { + for (size_t m = 0; m < range_m; m++) { + for (size_t n = 0; n < range_n; n += tile_n) { + function(context, i, j, k, l, m, n, min(range_n - n, tile_n)); + } + } + } + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_6d_tile_1d) + +PTHREADPOOL_WEAK void pthreadpool_parallelize_6d_tile_2d( + pthreadpool_t threadpool, pthreadpool_task_6d_tile_2d_t function, + void* context, size_t range_i, size_t range_j, size_t range_k, + size_t range_l, size_t range_m, size_t range_n, size_t tile_m, + size_t tile_n, uint32_t flags) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l++) { + for (size_t m = 0; m < range_m; m += tile_m) { + for (size_t n = 0; n < range_n; n += tile_n) { + function(context, i, j, k, l, m, n, min(range_m - m, tile_m), + min(range_n - n, tile_n)); + } + } + } + } + } + } +} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_parallelize_6d_tile_2d) + +PTHREADPOOL_WEAK void pthreadpool_destroy(struct pthreadpool* threadpool) {} + +PTHREADPOOL_PRIVATE_IMPL(pthreadpool_destroy) diff --git a/src/threadpool-atomics.h b/src/threadpool-atomics.h index eaa0707..bd1ab26 100644 --- a/src/threadpool-atomics.h +++ b/src/threadpool-atomics.h @@ -1,874 +1,185 @@ -#pragma once - +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. 
+// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#ifndef __PTHREADPOOL_SRC_THREADPOOL_ATOMICS_H_ +#define __PTHREADPOOL_SRC_THREADPOOL_ATOMICS_H_ + +/* Standard C headers */ +#include #include #include #include +/* Windows-specific headers */ +#ifdef _WIN32 +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#else +#include +#endif + /* SSE-specific headers */ -#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) && !defined(_M_ARM64EC) - #include +#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) || \ + defined(_M_IX86) || defined(_M_X64) && !defined(_M_ARM64EC) +#include #endif /* ARM-specific headers */ #if defined(__ARM_ACLE) - #include +#include #endif /* MSVC-specific headers */ #ifdef _MSC_VER - #include +#include #endif - -#if defined(__wasm__) && defined(__clang__) - /* - * Clang for WebAssembly target lacks stdatomic.h header, - * even though it supports the necessary low-level intrinsics. - * Thus, we implement pthreadpool atomic functions on top of - * low-level Clang-specific interfaces for this target. 
- */ - - typedef _Atomic(uint32_t) pthreadpool_atomic_uint32_t; - typedef _Atomic(size_t) pthreadpool_atomic_size_t; - typedef _Atomic(void*) pthreadpool_atomic_void_p; - - static inline uint32_t pthreadpool_load_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - return __c11_atomic_load(address, __ATOMIC_RELAXED); - } - - static inline size_t pthreadpool_load_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return __c11_atomic_load(address, __ATOMIC_RELAXED); - } - - static inline void* pthreadpool_load_relaxed_void_p( - pthreadpool_atomic_void_p* address) - { - return __c11_atomic_load(address, __ATOMIC_RELAXED); - } - - static inline uint32_t pthreadpool_load_acquire_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - return __c11_atomic_load(address, __ATOMIC_ACQUIRE); - } - - static inline size_t pthreadpool_load_acquire_size_t( - pthreadpool_atomic_size_t* address) - { - return __c11_atomic_load(address, __ATOMIC_ACQUIRE); - } - - static inline void pthreadpool_store_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - __c11_atomic_store(address, value, __ATOMIC_RELAXED); - } - - static inline void pthreadpool_store_relaxed_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - __c11_atomic_store(address, value, __ATOMIC_RELAXED); - } - - static inline void pthreadpool_store_relaxed_void_p( - pthreadpool_atomic_void_p* address, - void* value) - { - __c11_atomic_store(address, value, __ATOMIC_RELAXED); - } - - static inline void pthreadpool_store_release_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - __c11_atomic_store(address, value, __ATOMIC_RELEASE); - } - - static inline void pthreadpool_store_release_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - __c11_atomic_store(address, value, __ATOMIC_RELEASE); - } - - static inline size_t pthreadpool_decrement_fetch_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return 
__c11_atomic_fetch_sub(address, 1, __ATOMIC_RELAXED) - 1; - } - - static inline size_t pthreadpool_decrement_fetch_release_size_t( - pthreadpool_atomic_size_t* address) - { - return __c11_atomic_fetch_sub(address, 1, __ATOMIC_RELEASE) - 1; - } - - static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t( - pthreadpool_atomic_size_t* address) - { - return __c11_atomic_fetch_sub(address, 1, __ATOMIC_ACQ_REL) - 1; - } - - static inline bool pthreadpool_try_decrement_relaxed_size_t( - pthreadpool_atomic_size_t* value) - { - size_t actual_value = __c11_atomic_load(value, __ATOMIC_RELAXED); - while (actual_value != 0) { - if (__c11_atomic_compare_exchange_weak( - value, &actual_value, actual_value - 1, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) - { - return true; - } - } - return false; - } - - static inline void pthreadpool_fence_acquire() { - __c11_atomic_thread_fence(__ATOMIC_ACQUIRE); - } - - static inline void pthreadpool_fence_release() { - __c11_atomic_thread_fence(__ATOMIC_RELEASE); - } -#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) - #include - - typedef _Atomic(uint32_t) pthreadpool_atomic_uint32_t; - typedef _Atomic(size_t) pthreadpool_atomic_size_t; - typedef _Atomic(void*) pthreadpool_atomic_void_p; - - static inline uint32_t pthreadpool_load_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - return atomic_load_explicit(address, memory_order_relaxed); - } - - static inline size_t pthreadpool_load_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return atomic_load_explicit(address, memory_order_relaxed); - } - - static inline void* pthreadpool_load_relaxed_void_p( - pthreadpool_atomic_void_p* address) - { - return atomic_load_explicit(address, memory_order_relaxed); - } - - static inline uint32_t pthreadpool_load_acquire_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - return atomic_load_explicit(address, memory_order_acquire); - } - - static inline size_t 
pthreadpool_load_acquire_size_t( - pthreadpool_atomic_size_t* address) - { - return atomic_load_explicit(address, memory_order_acquire); - } - - static inline void pthreadpool_store_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - atomic_store_explicit(address, value, memory_order_relaxed); - } - - static inline void pthreadpool_store_relaxed_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - atomic_store_explicit(address, value, memory_order_relaxed); - } - - static inline void pthreadpool_store_relaxed_void_p( - pthreadpool_atomic_void_p* address, - void* value) - { - atomic_store_explicit(address, value, memory_order_relaxed); - } - - static inline void pthreadpool_store_release_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - atomic_store_explicit(address, value, memory_order_release); - } - - static inline void pthreadpool_store_release_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - atomic_store_explicit(address, value, memory_order_release); - } - - static inline size_t pthreadpool_decrement_fetch_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return atomic_fetch_sub_explicit(address, 1, memory_order_relaxed) - 1; - } - - static inline size_t pthreadpool_decrement_fetch_release_size_t( - pthreadpool_atomic_size_t* address) - { - return atomic_fetch_sub_explicit(address, 1, memory_order_release) - 1; - } - - static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t( - pthreadpool_atomic_size_t* address) - { - return atomic_fetch_sub_explicit(address, 1, memory_order_acq_rel) - 1; - } - - static inline bool pthreadpool_try_decrement_relaxed_size_t( - pthreadpool_atomic_size_t* value) - { - #if defined(__clang__) && (defined(__arm__) || defined(__aarch64__)) - size_t actual_value; - do { - actual_value = __builtin_arm_ldrex((const volatile size_t*) value); - if (actual_value == 0) { - __builtin_arm_clrex(); - return false; - } - } while 
(__builtin_arm_strex(actual_value - 1, (volatile size_t*) value) != 0); - return true; - #else - size_t actual_value = pthreadpool_load_relaxed_size_t(value); - while (actual_value != 0) { - if (atomic_compare_exchange_weak_explicit( - value, &actual_value, actual_value - 1, memory_order_relaxed, memory_order_relaxed)) - { - return true; - } - } - return false; - #endif - } - - static inline void pthreadpool_fence_acquire() { - atomic_thread_fence(memory_order_acquire); - } - - static inline void pthreadpool_fence_release() { - atomic_thread_fence(memory_order_release); - } -#elif defined(__GNUC__) - typedef uint32_t volatile pthreadpool_atomic_uint32_t; - typedef size_t volatile pthreadpool_atomic_size_t; - typedef void* volatile pthreadpool_atomic_void_p; - - static inline uint32_t pthreadpool_load_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - return *address; - } - - static inline size_t pthreadpool_load_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return *address; - } - - static inline void* pthreadpool_load_relaxed_void_p( - pthreadpool_atomic_void_p* address) - { - return *address; - } - - static inline uint32_t pthreadpool_load_acquire_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - return *address; - } - - static inline size_t pthreadpool_load_acquire_size_t( - pthreadpool_atomic_size_t* address) - { - return *address; - } - - static inline void pthreadpool_store_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - *address = value; - } - - static inline void pthreadpool_store_relaxed_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - *address = value; - } - - static inline void pthreadpool_store_relaxed_void_p( - pthreadpool_atomic_void_p* address, - void* value) - { - *address = value; - } - - static inline void pthreadpool_store_release_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - *address = value; - } - - static inline void 
pthreadpool_store_release_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - *address = value; - } - - static inline size_t pthreadpool_decrement_fetch_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return __sync_sub_and_fetch(address, 1); - } - - static inline size_t pthreadpool_decrement_fetch_release_size_t( - pthreadpool_atomic_size_t* address) - { - return __sync_sub_and_fetch(address, 1); - } - - static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t( - pthreadpool_atomic_size_t* address) - { - return __sync_sub_and_fetch(address, 1); - } - - static inline bool pthreadpool_try_decrement_relaxed_size_t( - pthreadpool_atomic_size_t* value) - { - size_t actual_value = *value; - while (actual_value != 0) { - const size_t new_value = actual_value - 1; - const size_t expected_value = actual_value; - actual_value = __sync_val_compare_and_swap(value, expected_value, new_value); - if (actual_value == expected_value) { - return true; - } - } - return false; - } - - static inline void pthreadpool_fence_acquire() { - __sync_synchronize(); - } - - static inline void pthreadpool_fence_release() { - __sync_synchronize(); - } -#elif defined(_MSC_VER) && defined(_M_ARM) - typedef volatile uint32_t pthreadpool_atomic_uint32_t; - typedef volatile size_t pthreadpool_atomic_size_t; - typedef void *volatile pthreadpool_atomic_void_p; - - static inline uint32_t pthreadpool_load_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - return (uint32_t) __iso_volatile_load32((const volatile __int32*) address); - } - - static inline size_t pthreadpool_load_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) __iso_volatile_load32((const volatile __int32*) address); - } - - static inline void* pthreadpool_load_relaxed_void_p( - pthreadpool_atomic_void_p* address) - { - return (void*) __iso_volatile_load32((const volatile __int32*) address); - } - - static inline uint32_t pthreadpool_load_acquire_uint32_t( - 
pthreadpool_atomic_uint32_t* address) - { - const uint32_t value = (uint32_t) __iso_volatile_load32((const volatile __int32*) address); - __dmb(_ARM_BARRIER_ISH); - _ReadBarrier(); - return value; - } - - static inline size_t pthreadpool_load_acquire_size_t( - pthreadpool_atomic_size_t* address) - { - const size_t value = (size_t) __iso_volatile_load32((const volatile __int32*) address); - __dmb(_ARM_BARRIER_ISH); - _ReadBarrier(); - return value; - } - - static inline void pthreadpool_store_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - __iso_volatile_store32((volatile __int32*) address, (__int32) value); - } - - static inline void pthreadpool_store_relaxed_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - __iso_volatile_store32((volatile __int32*) address, (__int32) value); - } - - static inline void pthreadpool_store_relaxed_void_p( - pthreadpool_atomic_void_p* address, - void* value) - { - __iso_volatile_store32((volatile __int32*) address, (__int32) value); - } - - static inline void pthreadpool_store_release_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - _WriteBarrier(); - __dmb(_ARM_BARRIER_ISH); - __iso_volatile_store32((volatile __int32*) address, (__int32) value); - } - - static inline void pthreadpool_store_release_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - _WriteBarrier(); - __dmb(_ARM_BARRIER_ISH); - __iso_volatile_store32((volatile __int32*) address, (__int32) value); - } - - static inline size_t pthreadpool_decrement_fetch_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement_nf((volatile long*) address); - } - - static inline size_t pthreadpool_decrement_fetch_release_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement_rel((volatile long*) address); - } - - static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t( - pthreadpool_atomic_size_t* address) 
- { - return (size_t) _InterlockedDecrement((volatile long*) address); - } - - static inline bool pthreadpool_try_decrement_relaxed_size_t( - pthreadpool_atomic_size_t* value) - { - size_t actual_value = (size_t) __iso_volatile_load32((const volatile __int32*) value); - while (actual_value != 0) { - const size_t new_value = actual_value - 1; - const size_t expected_value = actual_value; - actual_value = _InterlockedCompareExchange_nf( - (volatile long*) value, (long) new_value, (long) expected_value); - if (actual_value == expected_value) { - return true; - } - } - return false; - } - - static inline void pthreadpool_fence_acquire() { - __dmb(_ARM_BARRIER_ISH); - _ReadBarrier(); - } - - static inline void pthreadpool_fence_release() { - _WriteBarrier(); - __dmb(_ARM_BARRIER_ISH); - } -#elif defined(_MSC_VER) && defined(_M_ARM64) - typedef volatile uint32_t pthreadpool_atomic_uint32_t; - typedef volatile size_t pthreadpool_atomic_size_t; - typedef void *volatile pthreadpool_atomic_void_p; - - static inline uint32_t pthreadpool_load_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - return (uint32_t) __iso_volatile_load32((const volatile __int32*) address); - } - - static inline size_t pthreadpool_load_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) __iso_volatile_load64((const volatile __int64*) address); - } - - static inline void* pthreadpool_load_relaxed_void_p( - pthreadpool_atomic_void_p* address) - { - return (void*) __iso_volatile_load64((const volatile __int64*) address); - } - - static inline uint32_t pthreadpool_load_acquire_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - return (uint32_t) __ldar32((volatile unsigned __int32*) address); - } - - static inline size_t pthreadpool_load_acquire_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) __ldar64((volatile unsigned __int64*) address); - } - - static inline void pthreadpool_store_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address, - 
uint32_t value) - { - __iso_volatile_store32((volatile __int32*) address, (__int32) value); - } - - static inline void pthreadpool_store_relaxed_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - __iso_volatile_store64((volatile __int64*) address, (__int64) value); - } - - static inline void pthreadpool_store_relaxed_void_p( - pthreadpool_atomic_void_p* address, - void* value) - { - __iso_volatile_store64((volatile __int64*) address, (__int64) value); - } - - static inline void pthreadpool_store_release_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - _WriteBarrier(); - __stlr32((unsigned __int32 volatile*) address, (unsigned __int32) value); - } - - static inline void pthreadpool_store_release_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - _WriteBarrier(); - __stlr64((unsigned __int64 volatile*) address, (unsigned __int64) value); - } - - static inline size_t pthreadpool_decrement_fetch_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement64_nf((volatile __int64*) address); - } - - static inline size_t pthreadpool_decrement_fetch_release_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement64_rel((volatile __int64*) address); - } - - static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement64((volatile __int64*) address); - } - - static inline bool pthreadpool_try_decrement_relaxed_size_t( - pthreadpool_atomic_size_t* value) - { - size_t actual_value = (size_t) __iso_volatile_load64((const volatile __int64*) value); - while (actual_value != 0) { - const size_t new_value = actual_value - 1; - const size_t expected_value = actual_value; - actual_value = _InterlockedCompareExchange64_nf( - (volatile __int64*) value, (__int64) new_value, (__int64) expected_value); - if (actual_value == expected_value) { - return true; - } - } - 
return false; - } - - static inline void pthreadpool_fence_acquire() { - __dmb(_ARM64_BARRIER_ISHLD); - _ReadBarrier(); - } - - static inline void pthreadpool_fence_release() { - _WriteBarrier(); - __dmb(_ARM64_BARRIER_ISH); - } -#elif defined(_MSC_VER) && defined(_M_IX86) - typedef volatile uint32_t pthreadpool_atomic_uint32_t; - typedef volatile size_t pthreadpool_atomic_size_t; - typedef void *volatile pthreadpool_atomic_void_p; - - static inline uint32_t pthreadpool_load_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - return *address; - } - - static inline size_t pthreadpool_load_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return *address; - } - - static inline void* pthreadpool_load_relaxed_void_p( - pthreadpool_atomic_void_p* address) - { - return *address; - } - - static inline uint32_t pthreadpool_load_acquire_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - /* x86 loads always have acquire semantics; use only a compiler barrier */ - const uint32_t value = *address; - _ReadBarrier(); - return value; - } - - static inline size_t pthreadpool_load_acquire_size_t( - pthreadpool_atomic_size_t* address) - { - /* x86 loads always have acquire semantics; use only a compiler barrier */ - const size_t value = *address; - _ReadBarrier(); - return value; - } - - static inline void pthreadpool_store_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - *address = value; - } - - static inline void pthreadpool_store_relaxed_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - *address = value; - } - - static inline void pthreadpool_store_relaxed_void_p( - pthreadpool_atomic_void_p* address, - void* value) - { - *address = value; - } - - static inline void pthreadpool_store_release_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - /* x86 stores always have release semantics; use only a compiler barrier */ - _WriteBarrier(); - *address = value; - } - - static inline 
void pthreadpool_store_release_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - /* x86 stores always have release semantics; use only a compiler barrier */ - _WriteBarrier(); - *address = value; - } - - static inline size_t pthreadpool_decrement_fetch_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement((volatile long*) address); - } - - static inline size_t pthreadpool_decrement_fetch_release_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement((volatile long*) address); - } - - static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement((volatile long*) address); - } - - static inline bool pthreadpool_try_decrement_relaxed_size_t( - pthreadpool_atomic_size_t* value) - { - size_t actual_value = *value; - while (actual_value != 0) { - const size_t new_value = actual_value - 1; - const size_t expected_value = actual_value; - actual_value = _InterlockedCompareExchange( - (volatile long*) value, (long) new_value, (long) expected_value); - if (actual_value == expected_value) { - return true; - } - } - return false; - } - - static inline void pthreadpool_fence_acquire() { - _mm_lfence(); - } - - static inline void pthreadpool_fence_release() { - _mm_sfence(); - } -#elif defined(_MSC_VER) && defined(_M_X64) - typedef volatile uint32_t pthreadpool_atomic_uint32_t; - typedef volatile size_t pthreadpool_atomic_size_t; - typedef void *volatile pthreadpool_atomic_void_p; - - static inline uint32_t pthreadpool_load_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - return *address; - } - - static inline size_t pthreadpool_load_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return *address; - } - - static inline void* pthreadpool_load_relaxed_void_p( - pthreadpool_atomic_void_p* address) - { - return *address; - } - - static inline uint32_t 
pthreadpool_load_acquire_uint32_t( - pthreadpool_atomic_uint32_t* address) - { - /* x86-64 loads always have acquire semantics; use only a compiler barrier */ - const uint32_t value = *address; - _ReadBarrier(); - return value; - } - - static inline size_t pthreadpool_load_acquire_size_t( - pthreadpool_atomic_size_t* address) - { - /* x86-64 loads always have acquire semantics; use only a compiler barrier */ - const size_t value = *address; - _ReadBarrier(); - return value; - } - - static inline void pthreadpool_store_relaxed_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - *address = value; - } - - static inline void pthreadpool_store_relaxed_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - *address = value; - } - - static inline void pthreadpool_store_relaxed_void_p( - pthreadpool_atomic_void_p* address, - void* value) - { - *address = value; - } - - static inline void pthreadpool_store_release_uint32_t( - pthreadpool_atomic_uint32_t* address, - uint32_t value) - { - /* x86-64 stores always have release semantics; use only a compiler barrier */ - _WriteBarrier(); - *address = value; - } - - static inline void pthreadpool_store_release_size_t( - pthreadpool_atomic_size_t* address, - size_t value) - { - /* x86-64 stores always have release semantics; use only a compiler barrier */ - _WriteBarrier(); - *address = value; - } - - static inline size_t pthreadpool_decrement_fetch_relaxed_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement64((volatile __int64*) address); - } - - static inline size_t pthreadpool_decrement_fetch_release_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement64((volatile __int64*) address); - } - - static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t( - pthreadpool_atomic_size_t* address) - { - return (size_t) _InterlockedDecrement64((volatile __int64*) address); - } - - static inline bool 
pthreadpool_try_decrement_relaxed_size_t( - pthreadpool_atomic_size_t* value) - { - size_t actual_value = *value; - while (actual_value != 0) { - const size_t new_value = actual_value - 1; - const size_t expected_value = actual_value; - actual_value = _InterlockedCompareExchange64( - (volatile __int64*) value, (__int64) new_value, (__int64) expected_value); - if (actual_value == expected_value) { - return true; - } - } - return false; - } - - static inline void pthreadpool_fence_acquire() { - _mm_lfence(); - _ReadBarrier(); - } - - static inline void pthreadpool_fence_release() { - _WriteBarrier(); - _mm_sfence(); - } +/* Configuration header */ +#include "threadpool-common.h" + +/* Align the atomic values on the size of a cache line to avoid false sharing, + * i.e. two or more atomic variables sharing the same cache line will block + * each other during atomic operations. + */ +typedef atomic_uint_fast32_t PTHREADPOOL_CACHELINE_ALIGNED + pthreadpool_atomic_uint32_t; +typedef atomic_size_t PTHREADPOOL_CACHELINE_ALIGNED pthreadpool_atomic_size_t; +typedef atomic_uintptr_t PTHREADPOOL_CACHELINE_ALIGNED + pthreadpool_atomic_void_p; + +static inline uint32_t pthreadpool_load_relaxed_uint32_t( + pthreadpool_atomic_uint32_t* address) { + return atomic_load_explicit(address, memory_order_relaxed); +} + +static inline size_t pthreadpool_load_relaxed_size_t( + pthreadpool_atomic_size_t* address) { + return atomic_load_explicit(address, memory_order_relaxed); +} + +static inline void* pthreadpool_load_relaxed_void_p( + pthreadpool_atomic_void_p* address) { + return (void*)atomic_load_explicit(address, memory_order_relaxed); +} + +static inline uint32_t pthreadpool_load_acquire_uint32_t( + pthreadpool_atomic_uint32_t* address) { + return atomic_load_explicit(address, memory_order_acquire); +} + +static inline size_t pthreadpool_load_acquire_size_t( + pthreadpool_atomic_size_t* address) { + return atomic_load_explicit(address, memory_order_acquire); +} + +static inline void 
pthreadpool_store_relaxed_uint32_t( + pthreadpool_atomic_uint32_t* address, uint32_t value) { + atomic_store_explicit(address, value, memory_order_relaxed); +} + +static inline void pthreadpool_store_relaxed_size_t( + pthreadpool_atomic_size_t* address, size_t value) { + atomic_store_explicit(address, value, memory_order_relaxed); +} + +static inline void pthreadpool_store_relaxed_void_p( + pthreadpool_atomic_void_p* address, void* value) { + atomic_store_explicit(address, (uintptr_t)value, memory_order_relaxed); +} + +static inline void pthreadpool_store_release_uint32_t( + pthreadpool_atomic_uint32_t* address, uint32_t value) { + atomic_store_explicit(address, value, memory_order_release); +} + +static inline void pthreadpool_store_release_size_t( + pthreadpool_atomic_size_t* address, size_t value) { + atomic_store_explicit(address, value, memory_order_release); +} + +static inline size_t pthreadpool_decrement_fetch_relaxed_size_t( + pthreadpool_atomic_size_t* address) { + return atomic_fetch_sub_explicit(address, 1, memory_order_relaxed) - 1; +} + +static inline size_t pthreadpool_decrement_n_fetch_relaxed_size_t( + pthreadpool_atomic_size_t* address, size_t n) { + return atomic_fetch_sub_explicit(address, n, memory_order_relaxed) - n; +} + +static inline size_t pthreadpool_fetch_decrement_n_relaxed_size_t( + pthreadpool_atomic_size_t* address, size_t n) { + return atomic_fetch_sub_explicit(address, n, memory_order_relaxed); +} + +static inline size_t pthreadpool_decrement_fetch_release_size_t( + pthreadpool_atomic_size_t* address) { + return atomic_fetch_sub_explicit(address, 1, memory_order_release) - 1; +} + +static inline size_t pthreadpool_decrement_fetch_acquire_release_size_t( + pthreadpool_atomic_size_t* address) { + return atomic_fetch_sub_explicit(address, 1, memory_order_acq_rel) - 1; +} + +static inline bool pthreadpool_try_decrement_relaxed_size_t( + pthreadpool_atomic_size_t* value) { + size_t actual_value = atomic_load_explicit(value, 
memory_order_acquire); + while (actual_value != 0) { + if (atomic_compare_exchange_weak_explicit( + value, &actual_value, actual_value - 1, memory_order_relaxed, + memory_order_relaxed)) { + return true; + } + } + return false; +} + +static inline size_t pthreadpool_fetch_add_relaxed_size_t( + pthreadpool_atomic_size_t* address, size_t value) { + return atomic_fetch_add_explicit(address, value, memory_order_relaxed); +} + +static inline void pthreadpool_fence_acquire() { + atomic_thread_fence(memory_order_acquire); +} + +static inline void pthreadpool_fence_release() { + atomic_thread_fence(memory_order_release); +} + +static inline void pthreadpool_yield(uint32_t step) { + if (step < PTHREADPOOL_SPIN_PAUSE_ITERATIONS) { +#if defined(__ARM_ACLE) || \ + defined(_MSC_VER) && \ + (defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)) + __yield(); +#elif defined(__GNUC__) && \ + (defined(__ARM_ARCH) && (__ARM_ARCH >= 7) || \ + (defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6KZ__)) && \ + !defined(__thumb__)) + __asm__ __volatile__("yield"); +#elif defined(__i386__) || defined(__i686__) || defined(__x86_64__) || \ + defined(_M_IX86) || defined(_M_X64) + _mm_pause(); #else - #error "Platform-specific implementation of threadpool-atomics.h required" + pthreadpool_fence_acquire(); #endif - -#if defined(__ARM_ACLE) || defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)) - static inline void pthreadpool_yield() { - __yield(); - } -#elif defined(__GNUC__) && (defined(__ARM_ARCH) && (__ARM_ARCH >= 7) || (defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6KZ__)) && !defined(__thumb__)) - static inline void pthreadpool_yield() { - __asm__ __volatile__("yield"); - } -#elif defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) - static inline void pthreadpool_yield() { - _mm_pause(); - } + } else { +#ifdef _WIN32 + Sleep(0); #else - static inline void pthreadpool_yield() { - 
pthreadpool_fence_acquire(); - } + sched_yield(); #endif + } +} + +#endif // __PTHREADPOOL_SRC_THREADPOOL_ATOMICS_H_ diff --git a/src/threadpool-common.h b/src/threadpool-common.h index ca84744..087cda1 100644 --- a/src/threadpool-common.h +++ b/src/threadpool-common.h @@ -1,75 +1,119 @@ -#pragma once +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#ifndef __PTHREADPOOL_SRC_THREADPOOL_COMMON_H_ +#define __PTHREADPOOL_SRC_THREADPOOL_COMMON_H_ #ifndef PTHREADPOOL_USE_CPUINFO - #define PTHREADPOOL_USE_CPUINFO 0 +#define PTHREADPOOL_USE_CPUINFO 0 #endif #ifndef PTHREADPOOL_USE_FUTEX - #if defined(__linux__) - #define PTHREADPOOL_USE_FUTEX 1 - #elif defined(__EMSCRIPTEN__) - #define PTHREADPOOL_USE_FUTEX 1 - #else - #define PTHREADPOOL_USE_FUTEX 0 - #endif +#if defined(__linux__) +#define PTHREADPOOL_USE_FUTEX 1 +#elif defined(__EMSCRIPTEN__) +#define PTHREADPOOL_USE_FUTEX 1 +#else +#define PTHREADPOOL_USE_FUTEX 0 +#endif #endif #ifndef PTHREADPOOL_USE_GCD - #if defined(__APPLE__) - #define PTHREADPOOL_USE_GCD 1 - #else - #define PTHREADPOOL_USE_GCD 0 - #endif +#if defined(__APPLE__) +#define PTHREADPOOL_USE_GCD 1 +#else +#define PTHREADPOOL_USE_GCD 0 +#endif #endif #ifndef PTHREADPOOL_USE_EVENT - #if defined(_WIN32) || defined(__CYGWIN__) - #define PTHREADPOOL_USE_EVENT 1 - #else - #define PTHREADPOOL_USE_EVENT 0 - #endif +#if defined(_WIN32) || defined(__CYGWIN__) +#define PTHREADPOOL_USE_EVENT 1 +#else +#define PTHREADPOOL_USE_EVENT 0 +#endif #endif #ifndef PTHREADPOOL_USE_CONDVAR - #if PTHREADPOOL_USE_GCD || PTHREADPOOL_USE_FUTEX || PTHREADPOOL_USE_EVENT - #define PTHREADPOOL_USE_CONDVAR 0 - #else - #define PTHREADPOOL_USE_CONDVAR 1 - #endif +#if PTHREADPOOL_USE_GCD || PTHREADPOOL_USE_FUTEX || PTHREADPOOL_USE_EVENT 
+#define PTHREADPOOL_USE_CONDVAR 0 +#else +#define PTHREADPOOL_USE_CONDVAR 1 +#endif #endif +/* Number of iterations in spin-wait loop before going into futex/condvar wait + */ +#if defined(__ANDROID__) +/* We really don't want the process to sleep on Android, so spin for much longer + * than we otherwise would. */ +#define PTHREADPOOL_SPIN_YIELD_ITERATIONS 10 +#define PTHREADPOOL_SPIN_PAUSE_ITERATIONS 100000 +#else -/* Number of iterations in spin-wait loop before going into futex/condvar wait */ -#define PTHREADPOOL_SPIN_WAIT_ITERATIONS 1000000 +#define PTHREADPOOL_SPIN_YIELD_ITERATIONS 0 +#define PTHREADPOOL_SPIN_PAUSE_ITERATIONS 1000 +#endif // defined(__ANDROID__) +#define PTHREADPOOL_SPIN_WAIT_ITERATIONS \ + (PTHREADPOOL_SPIN_PAUSE_ITERATIONS + PTHREADPOOL_SPIN_YIELD_ITERATIONS) #define PTHREADPOOL_CACHELINE_SIZE 64 #if defined(__GNUC__) - #define PTHREADPOOL_CACHELINE_ALIGNED __attribute__((__aligned__(PTHREADPOOL_CACHELINE_SIZE))) +#define PTHREADPOOL_CACHELINE_ALIGNED \ + __attribute__((__aligned__(PTHREADPOOL_CACHELINE_SIZE))) #elif defined(_MSC_VER) - #define PTHREADPOOL_CACHELINE_ALIGNED __declspec(align(PTHREADPOOL_CACHELINE_SIZE)) +#define PTHREADPOOL_CACHELINE_ALIGNED \ + __declspec(align(PTHREADPOOL_CACHELINE_SIZE)) #else - #error "Platform-specific implementation of PTHREADPOOL_CACHELINE_ALIGNED required" +#error \ + "Platform-specific implementation of PTHREADPOOL_CACHELINE_ALIGNED required" #endif #if defined(__clang__) - #if __has_extension(c_static_assert) || __has_feature(c_static_assert) - #define PTHREADPOOL_STATIC_ASSERT(predicate, message) _Static_assert((predicate), message) - #else - #define PTHREADPOOL_STATIC_ASSERT(predicate, message) - #endif -#elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 6)) - /* Static assert is supported by gcc >= 4.6 */ - #define PTHREADPOOL_STATIC_ASSERT(predicate, message) _Static_assert((predicate), message) +#if __has_extension(c_static_assert) || 
__has_feature(c_static_assert) +#define PTHREADPOOL_STATIC_ASSERT(predicate, message) \ + _Static_assert((predicate), message) +#else +#define PTHREADPOOL_STATIC_ASSERT(predicate, message) +#endif +#elif defined(__GNUC__) && \ + ((__GNUC__ > 4) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 6)) +/* Static assert is supported by gcc >= 4.6 */ +#define PTHREADPOOL_STATIC_ASSERT(predicate, message) \ + _Static_assert((predicate), message) +#else +#define PTHREADPOOL_STATIC_ASSERT(predicate, message) +#endif + +// We declare these symbols as having weak linkage, so they can be replaced by +// a custom implementation. +#if defined(__GNUC__) +#define PTHREADPOOL_WEAK __attribute__((__weak__)) #else - #define PTHREADPOOL_STATIC_ASSERT(predicate, message) +#define PTHREADPOOL_WEAK +#endif + +#if defined(__GNUC__) && defined(__linux__) +#define PTHREADPOOL_PRIVATE_IMPL(name) \ + extern __typeof(name) name##_private_impl __attribute__((alias(#name))); +#else +#define PTHREADPOOL_PRIVATE_IMPL(name) #endif #ifndef PTHREADPOOL_INTERNAL - #if defined(__ELF__) - #define PTHREADPOOL_INTERNAL __attribute__((__visibility__("internal"))) - #elif defined(__MACH__) - #define PTHREADPOOL_INTERNAL __attribute__((__visibility__("hidden"))) - #else - #define PTHREADPOOL_INTERNAL - #endif +#if defined(__ELF__) +#define PTHREADPOOL_INTERNAL __attribute__((__visibility__("internal"))) +#elif defined(__MACH__) +#define PTHREADPOOL_INTERNAL __attribute__((__visibility__("hidden"))) +#else +#define PTHREADPOOL_INTERNAL #endif +#endif + +#endif // __PTHREADPOOL_SRC_THREADPOOL_COMMON_H_ diff --git a/src/threadpool-object.h b/src/threadpool-object.h index 52db369..93c67b5 100644 --- a/src/threadpool-object.h +++ b/src/threadpool-object.h @@ -1,12 +1,22 @@ -#pragma once +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. 
+// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#ifndef __PTHREADPOOL_SRC_THREADPOOL_OBJECT_H_ +#define __PTHREADPOOL_SRC_THREADPOOL_OBJECT_H_ /* Standard C headers */ #include #include /* Internal headers */ -#include "threadpool-common.h" #include "threadpool-atomics.h" +#include "threadpool-common.h" /* POSIX headers */ #if PTHREADPOOL_USE_CONDVAR || PTHREADPOOL_USE_FUTEX @@ -32,864 +42,1202 @@ /* Library header */ #include - #define THREADPOOL_COMMAND_MASK UINT32_C(0x7FFFFFFF) enum threadpool_command { - threadpool_command_init, - threadpool_command_parallelize, - threadpool_command_shutdown, + threadpool_command_init, + threadpool_command_parallelize, + threadpool_command_shutdown, }; struct PTHREADPOOL_CACHELINE_ALIGNED thread_info { - /** - * Index of the first element in the work range. - * Before processing a new element the owning worker thread increments this value. - */ - pthreadpool_atomic_size_t range_start; - /** - * Index of the element after the last element of the work range. - * Before processing a new element the stealing worker thread decrements this value. - */ - pthreadpool_atomic_size_t range_end; - /** - * The number of elements in the work range. - * Due to race conditions range_length <= range_end - range_start. - * The owning worker thread must decrement this value before incrementing @a range_start. - * The stealing worker thread must decrement this value before decrementing @a range_end. - */ - pthreadpool_atomic_size_t range_length; - /** - * Thread number in the 0..threads_count-1 range. - */ - size_t thread_number; - /** - * Thread pool which owns the thread. - */ - struct pthreadpool* threadpool; + /** + * Index of the first element in the work range. + * Before processing a new element the owning worker thread increments this + * value. 
+ */ + pthreadpool_atomic_size_t range_start; + /** + * Index of the element after the last element of the work range. + * Before processing a new element the stealing worker thread decrements this + * value. + */ + pthreadpool_atomic_size_t range_end; + /** + * The number of elements in the work range. + * Due to race conditions range_length <= range_end - range_start. + * The owning worker thread must decrement this value before incrementing @a + * range_start. The stealing worker thread must decrement this value before + * decrementing @a range_end. + */ + pthreadpool_atomic_size_t range_length; + /** + * Thread number in the 0..threads_count-1 range. + */ + size_t thread_number; + /** + * Thread pool which owns the thread. + */ + struct pthreadpool* threadpool; #if PTHREADPOOL_USE_CONDVAR || PTHREADPOOL_USE_FUTEX - /** - * The pthread object corresponding to the thread. - */ - pthread_t thread_object; + /** + * The pthread object corresponding to the thread. + */ + pthread_t thread_object; #endif #if PTHREADPOOL_USE_EVENT - /** - * The Windows thread handle corresponding to the thread. - */ - HANDLE thread_handle; + /** + * The Windows thread handle corresponding to the thread. + */ + HANDLE thread_handle; #endif }; -PTHREADPOOL_STATIC_ASSERT(sizeof(struct thread_info) % PTHREADPOOL_CACHELINE_SIZE == 0, - "thread_info structure must occupy an integer number of cache lines (64 bytes)"); +PTHREADPOOL_STATIC_ASSERT(sizeof(struct thread_info) % + PTHREADPOOL_CACHELINE_SIZE == + 0, + "thread_info structure must occupy an integer number " + "of cache lines (64 bytes)"); struct pthreadpool_1d_with_uarch_params { - /** - * Copy of the default_uarch_index argument passed to the pthreadpool_parallelize_1d_with_uarch function. - */ - uint32_t default_uarch_index; - /** - * Copy of the max_uarch_index argument passed to the pthreadpool_parallelize_1d_with_uarch function. 
- */ - uint32_t max_uarch_index; + /** + * Copy of the default_uarch_index argument passed to the + * pthreadpool_parallelize_1d_with_uarch function. + */ + uint32_t default_uarch_index; + /** + * Copy of the max_uarch_index argument passed to the + * pthreadpool_parallelize_1d_with_uarch function. + */ + uint32_t max_uarch_index; }; struct pthreadpool_1d_tile_1d_params { - /** - * Copy of the range argument passed to the pthreadpool_parallelize_1d_tile_1d function. - */ - size_t range; - /** - * Copy of the tile argument passed to the pthreadpool_parallelize_1d_tile_1d function. - */ - size_t tile; + /** + * Copy of the range argument passed to the pthreadpool_parallelize_1d_tile_1d + * function. + */ + size_t range; + /** + * Copy of the tile argument passed to the pthreadpool_parallelize_1d_tile_1d + * function. + */ + size_t tile; +}; + +struct pthreadpool_1d_tile_1d_dynamic_params { + /** + * Copy of the range argument passed to the + * pthreadpool_parallelize_1d_tile_1d_dynamic function. + */ + size_t range; + /** + * Copy of the tile argument passed to the + * pthreadpool_parallelize_1d_tile_1d_dynamic function. + */ + size_t tile; }; struct pthreadpool_2d_params { - /** - * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_2d function. - */ - struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the range_j argument passed to the + * pthreadpool_parallelize_2d function. + */ + struct fxdiv_divisor_size_t range_j; }; struct pthreadpool_2d_tile_1d_params { - /** - * Copy of the range_j argument passed to the pthreadpool_parallelize_2d_tile_1d function. - */ - size_t range_j; - /** - * Copy of the tile_j argument passed to the pthreadpool_parallelize_2d_tile_1d function. - */ - size_t tile_j; - /** - * FXdiv divisor for the divide_round_up(range_j, tile_j) value. - */ - struct fxdiv_divisor_size_t tile_range_j; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_2d_tile_1d function. 
+ */ + size_t range_j; + /** + * Copy of the tile_j argument passed to the + * pthreadpool_parallelize_2d_tile_1d function. + */ + size_t tile_j; + /** + * FXdiv divisor for the divide_round_up(range_j, tile_j) value. + */ + struct fxdiv_divisor_size_t tile_range_j; }; struct pthreadpool_2d_tile_1d_with_uarch_params { - /** - * Copy of the default_uarch_index argument passed to the pthreadpool_parallelize_2d_tile_1d_with_uarch function. - */ - uint32_t default_uarch_index; - /** - * Copy of the max_uarch_index argument passed to the pthreadpool_parallelize_2d_tile_1d_with_uarch function. - */ - uint32_t max_uarch_index; - /** - * Copy of the range_j argument passed to the pthreadpool_parallelize_2d_tile_1d function. - */ - size_t range_j; - /** - * Copy of the tile_j argument passed to the pthreadpool_parallelize_2d_tile_1d function. - */ - size_t tile_j; - /** - * FXdiv divisor for the divide_round_up(range_j, tile_j) value. - */ - struct fxdiv_divisor_size_t tile_range_j; + /** + * Copy of the default_uarch_index argument passed to the + * pthreadpool_parallelize_2d_tile_1d_with_uarch function. + */ + uint32_t default_uarch_index; + /** + * Copy of the max_uarch_index argument passed to the + * pthreadpool_parallelize_2d_tile_1d_with_uarch function. + */ + uint32_t max_uarch_index; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_2d_tile_1d function. + */ + size_t range_j; + /** + * Copy of the tile_j argument passed to the + * pthreadpool_parallelize_2d_tile_1d function. + */ + size_t tile_j; + /** + * FXdiv divisor for the divide_round_up(range_j, tile_j) value. + */ + struct fxdiv_divisor_size_t tile_range_j; +}; + +struct pthreadpool_2d_tile_1d_dynamic_params { + /** + * Copy of the range_i argument passed to the + * pthreadpool_parallelize_2d_tile_1d_dynamic function. + */ + size_t range_i; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_2d_tile_1d_dynamic function. 
+ */ + size_t range_j; + /** + * Copy of the tile_j argument passed to the + * pthreadpool_parallelize_2d_tile_1d_dynamic function. + */ + size_t tile_j; }; struct pthreadpool_2d_tile_2d_params { - /** - * Copy of the range_i argument passed to the pthreadpool_parallelize_2d_tile_2d function. - */ - size_t range_i; - /** - * Copy of the tile_i argument passed to the pthreadpool_parallelize_2d_tile_2d function. - */ - size_t tile_i; - /** - * Copy of the range_j argument passed to the pthreadpool_parallelize_2d_tile_2d function. - */ - size_t range_j; - /** - * Copy of the tile_j argument passed to the pthreadpool_parallelize_2d_tile_2d function. - */ - size_t tile_j; - /** - * FXdiv divisor for the divide_round_up(range_j, tile_j) value. - */ - struct fxdiv_divisor_size_t tile_range_j; + /** + * Copy of the range_i argument passed to the + * pthreadpool_parallelize_2d_tile_2d function. + */ + size_t range_i; + /** + * Copy of the tile_i argument passed to the + * pthreadpool_parallelize_2d_tile_2d function. + */ + size_t tile_i; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_2d_tile_2d function. + */ + size_t range_j; + /** + * Copy of the tile_j argument passed to the + * pthreadpool_parallelize_2d_tile_2d function. + */ + size_t tile_j; + /** + * FXdiv divisor for the divide_round_up(range_j, tile_j) value. + */ + struct fxdiv_divisor_size_t tile_range_j; }; struct pthreadpool_2d_tile_2d_with_uarch_params { - /** - * Copy of the default_uarch_index argument passed to the pthreadpool_parallelize_2d_tile_2d_with_uarch function. - */ - uint32_t default_uarch_index; - /** - * Copy of the max_uarch_index argument passed to the pthreadpool_parallelize_2d_tile_2d_with_uarch function. - */ - uint32_t max_uarch_index; - /** - * Copy of the range_i argument passed to the pthreadpool_parallelize_2d_tile_2d_with_uarch function. 
- */ - size_t range_i; - /** - * Copy of the tile_i argument passed to the pthreadpool_parallelize_2d_tile_2d_with_uarch function. - */ - size_t tile_i; - /** - * Copy of the range_j argument passed to the pthreadpool_parallelize_2d_tile_2d_with_uarch function. - */ - size_t range_j; - /** - * Copy of the tile_j argument passed to the pthreadpool_parallelize_2d_tile_2d_with_uarch function. - */ - size_t tile_j; - /** - * FXdiv divisor for the divide_round_up(range_j, tile_j) value. - */ - struct fxdiv_divisor_size_t tile_range_j; + /** + * Copy of the default_uarch_index argument passed to the + * pthreadpool_parallelize_2d_tile_2d_with_uarch function. + */ + uint32_t default_uarch_index; + /** + * Copy of the max_uarch_index argument passed to the + * pthreadpool_parallelize_2d_tile_2d_with_uarch function. + */ + uint32_t max_uarch_index; + /** + * Copy of the range_i argument passed to the + * pthreadpool_parallelize_2d_tile_2d_with_uarch function. + */ + size_t range_i; + /** + * Copy of the tile_i argument passed to the + * pthreadpool_parallelize_2d_tile_2d_with_uarch function. + */ + size_t tile_i; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_2d_tile_2d_with_uarch function. + */ + size_t range_j; + /** + * Copy of the tile_j argument passed to the + * pthreadpool_parallelize_2d_tile_2d_with_uarch function. + */ + size_t tile_j; + /** + * FXdiv divisor for the divide_round_up(range_j, tile_j) value. + */ + struct fxdiv_divisor_size_t tile_range_j; +}; + +struct pthreadpool_2d_tile_2d_dynamic_params { + /** + * Copy of the range_i argument passed to the + * pthreadpool_parallelize_2d_tile_2d_dynamic function. + */ + size_t range_i; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_2d_tile_2d_dynamic function. + */ + size_t range_j; + /** + * Copy of the tile_i argument passed to the + * pthreadpool_parallelize_2d_tile_2d_dynamic function. 
+ */ + size_t tile_i; + /** + * Copy of the tile_j argument passed to the + * pthreadpool_parallelize_2d_tile_2d_dynamic function. + */ + size_t tile_j; +}; + +struct pthreadpool_2d_tile_2d_dynamic_with_uarch_params { + /** + * Copy of the range_i argument passed to the + * pthreadpool_parallelize_2d_tile_2d_dynamic function. + */ + size_t range_i; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_2d_tile_2d_dynamic function. + */ + size_t range_j; + /** + * Copy of the tile_i argument passed to the + * pthreadpool_parallelize_2d_tile_2d_dynamic function. + */ + size_t tile_i; + /** + * Copy of the tile_j argument passed to the + * pthreadpool_parallelize_2d_tile_2d_dynamic function. + */ + size_t tile_j; + /** + * Copy of the default_uarch_index argument passed to the + * pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch function. + */ + uint32_t default_uarch_index; + /** + * Copy of the max_uarch_index argument passed to the + * pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch function. + */ + uint32_t max_uarch_index; }; struct pthreadpool_3d_params { - /** - * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_3d function. - */ - struct fxdiv_divisor_size_t range_j; - /** - * FXdiv divisor for the range_k argument passed to the pthreadpool_parallelize_3d function. - */ - struct fxdiv_divisor_size_t range_k; + /** + * FXdiv divisor for the range_j argument passed to the + * pthreadpool_parallelize_3d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the range_k argument passed to the + * pthreadpool_parallelize_3d function. + */ + struct fxdiv_divisor_size_t range_k; }; struct pthreadpool_3d_tile_1d_params { - /** - * Copy of the range_k argument passed to the pthreadpool_parallelize_3d_tile_1d function. - */ - size_t range_k; - /** - * Copy of the tile_k argument passed to the pthreadpool_parallelize_3d_tile_1d function. 
- */ - size_t tile_k; - /** - * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_3d_tile_1d function. - */ - struct fxdiv_divisor_size_t range_j; - /** - * FXdiv divisor for the divide_round_up(range_k, tile_k) value. - */ - struct fxdiv_divisor_size_t tile_range_k; + /** + * Copy of the range_k argument passed to the + * pthreadpool_parallelize_3d_tile_1d function. + */ + size_t range_k; + /** + * Copy of the tile_k argument passed to the + * pthreadpool_parallelize_3d_tile_1d function. + */ + size_t tile_k; + /** + * FXdiv divisor for the range_j argument passed to the + * pthreadpool_parallelize_3d_tile_1d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the divide_round_up(range_k, tile_k) value. + */ + struct fxdiv_divisor_size_t tile_range_k; }; struct pthreadpool_3d_tile_1d_with_uarch_params { - /** - * Copy of the default_uarch_index argument passed to the pthreadpool_parallelize_3d_tile_1d_with_uarch function. - */ - uint32_t default_uarch_index; - /** - * Copy of the max_uarch_index argument passed to the pthreadpool_parallelize_3d_tile_1d_with_uarch function. - */ - uint32_t max_uarch_index; - /** - * Copy of the range_k argument passed to the pthreadpool_parallelize_3d_tile_1d_with_uarch function. - */ - size_t range_k; - /** - * Copy of the tile_k argument passed to the pthreadpool_parallelize_3d_tile_1d_with_uarch function. - */ - size_t tile_k; - /** - * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_3d_tile_1d_with_uarch function. - */ - struct fxdiv_divisor_size_t range_j; - /** - * FXdiv divisor for the divide_round_up(range_k, tile_k) value. - */ - struct fxdiv_divisor_size_t tile_range_k; + /** + * Copy of the default_uarch_index argument passed to the + * pthreadpool_parallelize_3d_tile_1d_with_uarch function. 
+ */ + uint32_t default_uarch_index; + /** + * Copy of the max_uarch_index argument passed to the + * pthreadpool_parallelize_3d_tile_1d_with_uarch function. + */ + uint32_t max_uarch_index; + /** + * Copy of the range_k argument passed to the + * pthreadpool_parallelize_3d_tile_1d_with_uarch function. + */ + size_t range_k; + /** + * Copy of the tile_k argument passed to the + * pthreadpool_parallelize_3d_tile_1d_with_uarch function. + */ + size_t tile_k; + /** + * FXdiv divisor for the range_j argument passed to the + * pthreadpool_parallelize_3d_tile_1d_with_uarch function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the divide_round_up(range_k, tile_k) value. + */ + struct fxdiv_divisor_size_t tile_range_k; }; struct pthreadpool_3d_tile_2d_params { - /** - * Copy of the range_j argument passed to the pthreadpool_parallelize_3d_tile_2d function. - */ - size_t range_j; - /** - * Copy of the tile_j argument passed to the pthreadpool_parallelize_3d_tile_2d function. - */ - size_t tile_j; - /** - * Copy of the range_k argument passed to the pthreadpool_parallelize_3d_tile_2d function. - */ - size_t range_k; - /** - * Copy of the tile_k argument passed to the pthreadpool_parallelize_3d_tile_2d function. - */ - size_t tile_k; - /** - * FXdiv divisor for the divide_round_up(range_j, tile_j) value. - */ - struct fxdiv_divisor_size_t tile_range_j; - /** - * FXdiv divisor for the divide_round_up(range_k, tile_k) value. - */ - struct fxdiv_divisor_size_t tile_range_k; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_3d_tile_2d function. + */ + size_t range_j; + /** + * Copy of the tile_j argument passed to the + * pthreadpool_parallelize_3d_tile_2d function. + */ + size_t tile_j; + /** + * Copy of the range_k argument passed to the + * pthreadpool_parallelize_3d_tile_2d function. + */ + size_t range_k; + /** + * Copy of the tile_k argument passed to the + * pthreadpool_parallelize_3d_tile_2d function. 
+ */ + size_t tile_k; + /** + * FXdiv divisor for the divide_round_up(range_j, tile_j) value. + */ + struct fxdiv_divisor_size_t tile_range_j; + /** + * FXdiv divisor for the divide_round_up(range_k, tile_k) value. + */ + struct fxdiv_divisor_size_t tile_range_k; }; struct pthreadpool_3d_tile_2d_with_uarch_params { - /** - * Copy of the default_uarch_index argument passed to the pthreadpool_parallelize_3d_tile_2d_with_uarch function. - */ - uint32_t default_uarch_index; - /** - * Copy of the max_uarch_index argument passed to the pthreadpool_parallelize_3d_tile_2d_with_uarch function. - */ - uint32_t max_uarch_index; - /** - * Copy of the range_j argument passed to the pthreadpool_parallelize_3d_tile_2d_with_uarch function. - */ - size_t range_j; - /** - * Copy of the tile_j argument passed to the pthreadpool_parallelize_3d_tile_2d_with_uarch function. - */ - size_t tile_j; - /** - * Copy of the range_k argument passed to the pthreadpool_parallelize_3d_tile_2d_with_uarch function. - */ - size_t range_k; - /** - * Copy of the tile_k argument passed to the pthreadpool_parallelize_3d_tile_2d_with_uarch function. - */ - size_t tile_k; - /** - * FXdiv divisor for the divide_round_up(range_j, tile_j) value. - */ - struct fxdiv_divisor_size_t tile_range_j; - /** - * FXdiv divisor for the divide_round_up(range_k, tile_k) value. - */ - struct fxdiv_divisor_size_t tile_range_k; + /** + * Copy of the default_uarch_index argument passed to the + * pthreadpool_parallelize_3d_tile_2d_with_uarch function. + */ + uint32_t default_uarch_index; + /** + * Copy of the max_uarch_index argument passed to the + * pthreadpool_parallelize_3d_tile_2d_with_uarch function. + */ + uint32_t max_uarch_index; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_3d_tile_2d_with_uarch function. + */ + size_t range_j; + /** + * Copy of the tile_j argument passed to the + * pthreadpool_parallelize_3d_tile_2d_with_uarch function. 
+ */ + size_t tile_j; + /** + * Copy of the range_k argument passed to the + * pthreadpool_parallelize_3d_tile_2d_with_uarch function. + */ + size_t range_k; + /** + * Copy of the tile_k argument passed to the + * pthreadpool_parallelize_3d_tile_2d_with_uarch function. + */ + size_t tile_k; + /** + * FXdiv divisor for the divide_round_up(range_j, tile_j) value. + */ + struct fxdiv_divisor_size_t tile_range_j; + /** + * FXdiv divisor for the divide_round_up(range_k, tile_k) value. + */ + struct fxdiv_divisor_size_t tile_range_k; +}; + +struct pthreadpool_3d_tile_2d_dynamic_params { + /** + * Copy of the range_i argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic function. + */ + size_t range_i; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic function. + */ + size_t range_j; + /** + * Copy of the range_k argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic function. + */ + size_t range_k; + /** + * Copy of the tile_j argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic function. + */ + size_t tile_j; + /** + * Copy of the tile_k argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic function. + */ + size_t tile_k; +}; + +struct pthreadpool_3d_tile_2d_dynamic_with_uarch_params { + /** + * Copy of the range_i argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic function. + */ + size_t range_i; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic function. + */ + size_t range_j; + /** + * Copy of the range_k argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic function. + */ + size_t range_k; + /** + * Copy of the tile_j argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic function. + */ + size_t tile_j; + /** + * Copy of the tile_k argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic function. 
+ */ + size_t tile_k; + /** + * Copy of the default_uarch_index argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch function. + */ + uint32_t default_uarch_index; + /** + * Copy of the max_uarch_index argument passed to the + * pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch function. + */ + uint32_t max_uarch_index; }; struct pthreadpool_4d_params { - /** - * Copy of the range_k argument passed to the pthreadpool_parallelize_4d function. - */ - size_t range_k; - /** - * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_4d function. - */ - struct fxdiv_divisor_size_t range_j; - /** - * FXdiv divisor for the range_k * range_l value. - */ - struct fxdiv_divisor_size_t range_kl; - /** - * FXdiv divisor for the range_l argument passed to the pthreadpool_parallelize_4d function. - */ - struct fxdiv_divisor_size_t range_l; + /** + * Copy of the range_k argument passed to the pthreadpool_parallelize_4d + * function. + */ + size_t range_k; + /** + * FXdiv divisor for the range_j argument passed to the + * pthreadpool_parallelize_4d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the range_k * range_l value. + */ + struct fxdiv_divisor_size_t range_kl; + /** + * FXdiv divisor for the range_l argument passed to the + * pthreadpool_parallelize_4d function. + */ + struct fxdiv_divisor_size_t range_l; }; struct pthreadpool_4d_tile_1d_params { - /** - * Copy of the range_k argument passed to the pthreadpool_parallelize_4d_tile_1d function. - */ - size_t range_k; - /** - * Copy of the range_l argument passed to the pthreadpool_parallelize_4d_tile_1d function. - */ - size_t range_l; - /** - * Copy of the tile_l argument passed to the pthreadpool_parallelize_4d_tile_1d function. - */ - size_t tile_l; - /** - * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_4d_tile_1d function. 
- */ - struct fxdiv_divisor_size_t range_j; - /** - * FXdiv divisor for the range_k * divide_round_up(range_l, tile_l) value. - */ - struct fxdiv_divisor_size_t tile_range_kl; - /** - * FXdiv divisor for the divide_round_up(range_l, tile_l) value. - */ - struct fxdiv_divisor_size_t tile_range_l; + /** + * Copy of the range_k argument passed to the + * pthreadpool_parallelize_4d_tile_1d function. + */ + size_t range_k; + /** + * Copy of the range_l argument passed to the + * pthreadpool_parallelize_4d_tile_1d function. + */ + size_t range_l; + /** + * Copy of the tile_l argument passed to the + * pthreadpool_parallelize_4d_tile_1d function. + */ + size_t tile_l; + /** + * FXdiv divisor for the range_j argument passed to the + * pthreadpool_parallelize_4d_tile_1d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the range_k * divide_round_up(range_l, tile_l) value. + */ + struct fxdiv_divisor_size_t tile_range_kl; + /** + * FXdiv divisor for the divide_round_up(range_l, tile_l) value. + */ + struct fxdiv_divisor_size_t tile_range_l; }; struct pthreadpool_4d_tile_2d_params { - /** - * Copy of the range_k argument passed to the pthreadpool_parallelize_4d_tile_2d function. - */ - size_t range_k; - /** - * Copy of the tile_k argument passed to the pthreadpool_parallelize_4d_tile_2d function. - */ - size_t tile_k; - /** - * Copy of the range_l argument passed to the pthreadpool_parallelize_4d_tile_2d function. - */ - size_t range_l; - /** - * Copy of the tile_l argument passed to the pthreadpool_parallelize_4d_tile_2d function. - */ - size_t tile_l; - /** - * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_4d_tile_2d function. - */ - struct fxdiv_divisor_size_t range_j; - /** - * FXdiv divisor for the divide_round_up(range_k, tile_k) * divide_round_up(range_l, tile_l) value. - */ - struct fxdiv_divisor_size_t tile_range_kl; - /** - * FXdiv divisor for the divide_round_up(range_l, tile_l) value. 
- */ - struct fxdiv_divisor_size_t tile_range_l; + /** + * Copy of the range_k argument passed to the + * pthreadpool_parallelize_4d_tile_2d function. + */ + size_t range_k; + /** + * Copy of the tile_k argument passed to the + * pthreadpool_parallelize_4d_tile_2d function. + */ + size_t tile_k; + /** + * Copy of the range_l argument passed to the + * pthreadpool_parallelize_4d_tile_2d function. + */ + size_t range_l; + /** + * Copy of the tile_l argument passed to the + * pthreadpool_parallelize_4d_tile_2d function. + */ + size_t tile_l; + /** + * FXdiv divisor for the range_j argument passed to the + * pthreadpool_parallelize_4d_tile_2d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the divide_round_up(range_k, tile_k) * + * divide_round_up(range_l, tile_l) value. + */ + struct fxdiv_divisor_size_t tile_range_kl; + /** + * FXdiv divisor for the divide_round_up(range_l, tile_l) value. + */ + struct fxdiv_divisor_size_t tile_range_l; }; struct pthreadpool_4d_tile_2d_with_uarch_params { - /** - * Copy of the default_uarch_index argument passed to the pthreadpool_parallelize_4d_tile_2d_with_uarch function. - */ - uint32_t default_uarch_index; - /** - * Copy of the max_uarch_index argument passed to the pthreadpool_parallelize_4d_tile_2d_with_uarch function. - */ - uint32_t max_uarch_index; - /** - * Copy of the range_k argument passed to the pthreadpool_parallelize_4d_tile_2d_with_uarch function. - */ - size_t range_k; - /** - * Copy of the tile_k argument passed to the pthreadpool_parallelize_4d_tile_2d_with_uarch function. - */ - size_t tile_k; - /** - * Copy of the range_l argument passed to the pthreadpool_parallelize_4d_tile_2d_with_uarch function. - */ - size_t range_l; - /** - * Copy of the tile_l argument passed to the pthreadpool_parallelize_4d_tile_2d_with_uarch function. - */ - size_t tile_l; - /** - * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_4d_tile_2d_with_uarch function. 
- */ - struct fxdiv_divisor_size_t range_j; - /** - * FXdiv divisor for the divide_round_up(range_k, tile_k) * divide_round_up(range_l, tile_l) value. - */ - struct fxdiv_divisor_size_t tile_range_kl; - /** - * FXdiv divisor for the divide_round_up(range_l, tile_l) value. - */ - struct fxdiv_divisor_size_t tile_range_l; + /** + * Copy of the default_uarch_index argument passed to the + * pthreadpool_parallelize_4d_tile_2d_with_uarch function. + */ + uint32_t default_uarch_index; + /** + * Copy of the max_uarch_index argument passed to the + * pthreadpool_parallelize_4d_tile_2d_with_uarch function. + */ + uint32_t max_uarch_index; + /** + * Copy of the range_k argument passed to the + * pthreadpool_parallelize_4d_tile_2d_with_uarch function. + */ + size_t range_k; + /** + * Copy of the tile_k argument passed to the + * pthreadpool_parallelize_4d_tile_2d_with_uarch function. + */ + size_t tile_k; + /** + * Copy of the range_l argument passed to the + * pthreadpool_parallelize_4d_tile_2d_with_uarch function. + */ + size_t range_l; + /** + * Copy of the tile_l argument passed to the + * pthreadpool_parallelize_4d_tile_2d_with_uarch function. + */ + size_t tile_l; + /** + * FXdiv divisor for the range_j argument passed to the + * pthreadpool_parallelize_4d_tile_2d_with_uarch function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the divide_round_up(range_k, tile_k) * + * divide_round_up(range_l, tile_l) value. + */ + struct fxdiv_divisor_size_t tile_range_kl; + /** + * FXdiv divisor for the divide_round_up(range_l, tile_l) value. + */ + struct fxdiv_divisor_size_t tile_range_l; +}; + +struct pthreadpool_4d_tile_2d_dynamic_params { + /** + * Copy of the range_i argument passed to the + * pthreadpool_parallelize_4d_tile_2d_dynamic function. + */ + size_t range_i; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_4d_tile_2d_dynamic function. 
+ */ + size_t range_j; + /** + * Copy of the range_k argument passed to the + * pthreadpool_parallelize_4d_tile_2d_dynamic function. + */ + size_t range_k; + /** + * Copy of the range_l argument passed to the + * pthreadpool_parallelize_4d_tile_2d_dynamic function. + */ + size_t range_l; + /** + * Copy of the tile_k argument passed to the + * pthreadpool_parallelize_4d_tile_2d_dynamic function. + */ + size_t tile_k; + /** + * Copy of the tile_l argument passed to the + * pthreadpool_parallelize_4d_tile_2d_dynamic function. + */ + size_t tile_l; +}; + +struct pthreadpool_4d_tile_2d_dynamic_with_uarch_params { + /** + * Copy of the range_i argument passed to the + * pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch function. + */ + size_t range_i; + /** + * Copy of the range_j argument passed to the + * pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch function. + */ + size_t range_j; + /** + * Copy of the range_k argument passed to the + * pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch function. + */ + size_t range_k; + /** + * Copy of the range_l argument passed to the + * pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch function. + */ + size_t range_l; + /** + * Copy of the tile_k argument passed to the + * pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch function. + */ + size_t tile_k; + /** + * Copy of the tile_l argument passed to the + * pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch function. + */ + size_t tile_l; + /** + * Copy of the default_uarch_index argument passed to the + * pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch function. + */ + uint32_t default_uarch_index; + /** + * Copy of the max_uarch_index argument passed to the + * pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch function. + */ + uint32_t max_uarch_index; }; struct pthreadpool_5d_params { - /** - * Copy of the range_l argument passed to the pthreadpool_parallelize_5d function. 
- */ - size_t range_l; - /** - * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_5d function. - */ - struct fxdiv_divisor_size_t range_j; - /** - * FXdiv divisor for the range_k argument passed to the pthreadpool_parallelize_5d function. - */ - struct fxdiv_divisor_size_t range_k; - /** - * FXdiv divisor for the range_l * range_m value. - */ - struct fxdiv_divisor_size_t range_lm; - /** - * FXdiv divisor for the range_m argument passed to the pthreadpool_parallelize_5d function. - */ - struct fxdiv_divisor_size_t range_m; + /** + * Copy of the range_l argument passed to the pthreadpool_parallelize_5d + * function. + */ + size_t range_l; + /** + * FXdiv divisor for the range_j argument passed to the + * pthreadpool_parallelize_5d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the range_k argument passed to the + * pthreadpool_parallelize_5d function. + */ + struct fxdiv_divisor_size_t range_k; + /** + * FXdiv divisor for the range_l * range_m value. + */ + struct fxdiv_divisor_size_t range_lm; + /** + * FXdiv divisor for the range_m argument passed to the + * pthreadpool_parallelize_5d function. + */ + struct fxdiv_divisor_size_t range_m; }; struct pthreadpool_5d_tile_1d_params { - /** - * Copy of the range_k argument passed to the pthreadpool_parallelize_5d_tile_1d function. - */ - size_t range_k; - /** - * Copy of the range_m argument passed to the pthreadpool_parallelize_5d_tile_1d function. - */ - size_t range_m; - /** - * Copy of the tile_m argument passed to the pthreadpool_parallelize_5d_tile_1d function. - */ - size_t tile_m; - /** - * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_5d_tile_1d function. - */ - struct fxdiv_divisor_size_t range_j; - /** - * FXdiv divisor for the range_k * range_l value. - */ - struct fxdiv_divisor_size_t range_kl; - /** - * FXdiv divisor for the range_l argument passed to the pthreadpool_parallelize_5d_tile_1d function. 
- */ - struct fxdiv_divisor_size_t range_l; - /** - * FXdiv divisor for the divide_round_up(range_m, tile_m) value. - */ - struct fxdiv_divisor_size_t tile_range_m; + /** + * Copy of the range_k argument passed to the + * pthreadpool_parallelize_5d_tile_1d function. + */ + size_t range_k; + /** + * Copy of the range_m argument passed to the + * pthreadpool_parallelize_5d_tile_1d function. + */ + size_t range_m; + /** + * Copy of the tile_m argument passed to the + * pthreadpool_parallelize_5d_tile_1d function. + */ + size_t tile_m; + /** + * FXdiv divisor for the range_j argument passed to the + * pthreadpool_parallelize_5d_tile_1d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the range_k * range_l value. + */ + struct fxdiv_divisor_size_t range_kl; + /** + * FXdiv divisor for the range_l argument passed to the + * pthreadpool_parallelize_5d_tile_1d function. + */ + struct fxdiv_divisor_size_t range_l; + /** + * FXdiv divisor for the divide_round_up(range_m, tile_m) value. + */ + struct fxdiv_divisor_size_t tile_range_m; }; struct pthreadpool_5d_tile_2d_params { - /** - * Copy of the range_l argument passed to the pthreadpool_parallelize_5d_tile_2d function. - */ - size_t range_l; - /** - * Copy of the tile_l argument passed to the pthreadpool_parallelize_5d_tile_2d function. - */ - size_t tile_l; - /** - * Copy of the range_m argument passed to the pthreadpool_parallelize_5d_tile_2d function. - */ - size_t range_m; - /** - * Copy of the tile_m argument passed to the pthreadpool_parallelize_5d_tile_2d function. - */ - size_t tile_m; - /** - * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_5d_tile_2d function. - */ - struct fxdiv_divisor_size_t range_j; - /** - * FXdiv divisor for the range_k argument passed to the pthreadpool_parallelize_5d_tile_2d function. 
- */ - struct fxdiv_divisor_size_t range_k; - /** - * FXdiv divisor for the divide_round_up(range_l, tile_l) * divide_round_up(range_m, tile_m) value. - */ - struct fxdiv_divisor_size_t tile_range_lm; - /** - * FXdiv divisor for the divide_round_up(range_m, tile_m) value. - */ - struct fxdiv_divisor_size_t tile_range_m; + /** + * Copy of the range_l argument passed to the + * pthreadpool_parallelize_5d_tile_2d function. + */ + size_t range_l; + /** + * Copy of the tile_l argument passed to the + * pthreadpool_parallelize_5d_tile_2d function. + */ + size_t tile_l; + /** + * Copy of the range_m argument passed to the + * pthreadpool_parallelize_5d_tile_2d function. + */ + size_t range_m; + /** + * Copy of the tile_m argument passed to the + * pthreadpool_parallelize_5d_tile_2d function. + */ + size_t tile_m; + /** + * FXdiv divisor for the range_j argument passed to the + * pthreadpool_parallelize_5d_tile_2d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the range_k argument passed to the + * pthreadpool_parallelize_5d_tile_2d function. + */ + struct fxdiv_divisor_size_t range_k; + /** + * FXdiv divisor for the divide_round_up(range_l, tile_l) * + * divide_round_up(range_m, tile_m) value. + */ + struct fxdiv_divisor_size_t tile_range_lm; + /** + * FXdiv divisor for the divide_round_up(range_m, tile_m) value. + */ + struct fxdiv_divisor_size_t tile_range_m; }; struct pthreadpool_6d_params { - /** - * Copy of the range_l argument passed to the pthreadpool_parallelize_6d function. - */ - size_t range_l; - /** - * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_6d function. - */ - struct fxdiv_divisor_size_t range_j; - /** - * FXdiv divisor for the range_k argument passed to the pthreadpool_parallelize_6d function. - */ - struct fxdiv_divisor_size_t range_k; - /** - * FXdiv divisor for the range_l * range_m * range_n value. 
- */ - struct fxdiv_divisor_size_t range_lmn; - /** - * FXdiv divisor for the range_m argument passed to the pthreadpool_parallelize_6d function. - */ - struct fxdiv_divisor_size_t range_m; - /** - * FXdiv divisor for the range_n argument passed to the pthreadpool_parallelize_6d function. - */ - struct fxdiv_divisor_size_t range_n; + /** + * Copy of the range_l argument passed to the pthreadpool_parallelize_6d + * function. + */ + size_t range_l; + /** + * FXdiv divisor for the range_j argument passed to the + * pthreadpool_parallelize_6d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the range_k argument passed to the + * pthreadpool_parallelize_6d function. + */ + struct fxdiv_divisor_size_t range_k; + /** + * FXdiv divisor for the range_l * range_m * range_n value. + */ + struct fxdiv_divisor_size_t range_lmn; + /** + * FXdiv divisor for the range_m argument passed to the + * pthreadpool_parallelize_6d function. + */ + struct fxdiv_divisor_size_t range_m; + /** + * FXdiv divisor for the range_n argument passed to the + * pthreadpool_parallelize_6d function. + */ + struct fxdiv_divisor_size_t range_n; }; struct pthreadpool_6d_tile_1d_params { - /** - * Copy of the range_l argument passed to the pthreadpool_parallelize_6d_tile_1d function. - */ - size_t range_l; - /** - * Copy of the range_n argument passed to the pthreadpool_parallelize_6d_tile_1d function. - */ - size_t range_n; - /** - * Copy of the tile_n argument passed to the pthreadpool_parallelize_6d_tile_1d function. - */ - size_t tile_n; - /** - * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_6d_tile_1d function. - */ - struct fxdiv_divisor_size_t range_j; - /** - * FXdiv divisor for the range_k argument passed to the pthreadpool_parallelize_6d_tile_1d function. - */ - struct fxdiv_divisor_size_t range_k; - /** - * FXdiv divisor for the range_l * range_m * divide_round_up(range_n, tile_n) value. 
- */ - struct fxdiv_divisor_size_t tile_range_lmn; - /** - * FXdiv divisor for the range_m argument passed to the pthreadpool_parallelize_6d_tile_1d function. - */ - struct fxdiv_divisor_size_t range_m; - /** - * FXdiv divisor for the divide_round_up(range_n, tile_n) value. - */ - struct fxdiv_divisor_size_t tile_range_n; + /** + * Copy of the range_l argument passed to the + * pthreadpool_parallelize_6d_tile_1d function. + */ + size_t range_l; + /** + * Copy of the range_n argument passed to the + * pthreadpool_parallelize_6d_tile_1d function. + */ + size_t range_n; + /** + * Copy of the tile_n argument passed to the + * pthreadpool_parallelize_6d_tile_1d function. + */ + size_t tile_n; + /** + * FXdiv divisor for the range_j argument passed to the + * pthreadpool_parallelize_6d_tile_1d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the range_k argument passed to the + * pthreadpool_parallelize_6d_tile_1d function. + */ + struct fxdiv_divisor_size_t range_k; + /** + * FXdiv divisor for the range_l * range_m * divide_round_up(range_n, tile_n) + * value. + */ + struct fxdiv_divisor_size_t tile_range_lmn; + /** + * FXdiv divisor for the range_m argument passed to the + * pthreadpool_parallelize_6d_tile_1d function. + */ + struct fxdiv_divisor_size_t range_m; + /** + * FXdiv divisor for the divide_round_up(range_n, tile_n) value. + */ + struct fxdiv_divisor_size_t tile_range_n; }; struct pthreadpool_6d_tile_2d_params { - /** - * Copy of the range_k argument passed to the pthreadpool_parallelize_6d_tile_2d function. - */ - size_t range_k; - /** - * Copy of the range_m argument passed to the pthreadpool_parallelize_6d_tile_2d function. - */ - size_t range_m; - /** - * Copy of the tile_m argument passed to the pthreadpool_parallelize_6d_tile_2d function. - */ - size_t tile_m; - /** - * Copy of the range_n argument passed to the pthreadpool_parallelize_6d_tile_2d function. 
- */ - size_t range_n; - /** - * Copy of the tile_n argument passed to the pthreadpool_parallelize_6d_tile_2d function. - */ - size_t tile_n; - /** - * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_6d_tile_2d function. - */ - struct fxdiv_divisor_size_t range_j; - /** - * FXdiv divisor for the range_k * range_l value. - */ - struct fxdiv_divisor_size_t range_kl; - /** - * FXdiv divisor for the range_l argument passed to the pthreadpool_parallelize_6d_tile_2d function. - */ - struct fxdiv_divisor_size_t range_l; - /** - * FXdiv divisor for the divide_round_up(range_m, tile_m) * divide_round_up(range_n, tile_n) value. - */ - struct fxdiv_divisor_size_t tile_range_mn; - /** - * FXdiv divisor for the divide_round_up(range_n, tile_n) value. - */ - struct fxdiv_divisor_size_t tile_range_n; + /** + * Copy of the range_k argument passed to the + * pthreadpool_parallelize_6d_tile_2d function. + */ + size_t range_k; + /** + * Copy of the range_m argument passed to the + * pthreadpool_parallelize_6d_tile_2d function. + */ + size_t range_m; + /** + * Copy of the tile_m argument passed to the + * pthreadpool_parallelize_6d_tile_2d function. + */ + size_t tile_m; + /** + * Copy of the range_n argument passed to the + * pthreadpool_parallelize_6d_tile_2d function. + */ + size_t range_n; + /** + * Copy of the tile_n argument passed to the + * pthreadpool_parallelize_6d_tile_2d function. + */ + size_t tile_n; + /** + * FXdiv divisor for the range_j argument passed to the + * pthreadpool_parallelize_6d_tile_2d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the range_k * range_l value. + */ + struct fxdiv_divisor_size_t range_kl; + /** + * FXdiv divisor for the range_l argument passed to the + * pthreadpool_parallelize_6d_tile_2d function. + */ + struct fxdiv_divisor_size_t range_l; + /** + * FXdiv divisor for the divide_round_up(range_m, tile_m) * + * divide_round_up(range_n, tile_n) value. 
+ */ + struct fxdiv_divisor_size_t tile_range_mn; + /** + * FXdiv divisor for the divide_round_up(range_n, tile_n) value. + */ + struct fxdiv_divisor_size_t tile_range_n; }; struct PTHREADPOOL_CACHELINE_ALIGNED pthreadpool { #if !PTHREADPOOL_USE_GCD - /** - * The number of threads that are processing an operation. - */ - pthreadpool_atomic_size_t active_threads; + /** + * The number of threads that are processing an operation. + */ + pthreadpool_atomic_size_t active_threads; #endif #if PTHREADPOOL_USE_FUTEX - /** - * Indicates if there are active threads. - * Only two values are possible: - * - has_active_threads == 0 if active_threads == 0 - * - has_active_threads == 1 if active_threads != 0 - */ - pthreadpool_atomic_uint32_t has_active_threads; + /** + * Indicates if there are active threads. + * Only two values are possible: + * - has_active_threads == 0 if active_threads == 0 + * - has_active_threads == 1 if active_threads != 0 + */ + pthreadpool_atomic_uint32_t has_active_threads; #endif #if !PTHREADPOOL_USE_GCD - /** - * The last command submitted to the thread pool. - */ - pthreadpool_atomic_uint32_t command; + /** + * The last command submitted to the thread pool. + */ + pthreadpool_atomic_uint32_t command; #endif - /** - * The entry point function to call for each thread in the thread pool for parallelization tasks. - */ - pthreadpool_atomic_void_p thread_function; - /** - * The function to call for each item. - */ - pthreadpool_atomic_void_p task; - /** - * The first argument to the item processing function. - */ - pthreadpool_atomic_void_p argument; - /** - * Additional parallelization parameters. - * These parameters are specific for each thread_function. 
- */ - union { - struct pthreadpool_1d_with_uarch_params parallelize_1d_with_uarch; - struct pthreadpool_1d_tile_1d_params parallelize_1d_tile_1d; - struct pthreadpool_2d_params parallelize_2d; - struct pthreadpool_2d_tile_1d_params parallelize_2d_tile_1d; - struct pthreadpool_2d_tile_1d_with_uarch_params parallelize_2d_tile_1d_with_uarch; - struct pthreadpool_2d_tile_2d_params parallelize_2d_tile_2d; - struct pthreadpool_2d_tile_2d_with_uarch_params parallelize_2d_tile_2d_with_uarch; - struct pthreadpool_3d_params parallelize_3d; - struct pthreadpool_3d_tile_1d_params parallelize_3d_tile_1d; - struct pthreadpool_3d_tile_1d_with_uarch_params parallelize_3d_tile_1d_with_uarch; - struct pthreadpool_3d_tile_2d_params parallelize_3d_tile_2d; - struct pthreadpool_3d_tile_2d_with_uarch_params parallelize_3d_tile_2d_with_uarch; - struct pthreadpool_4d_params parallelize_4d; - struct pthreadpool_4d_tile_1d_params parallelize_4d_tile_1d; - struct pthreadpool_4d_tile_2d_params parallelize_4d_tile_2d; - struct pthreadpool_4d_tile_2d_with_uarch_params parallelize_4d_tile_2d_with_uarch; - struct pthreadpool_5d_params parallelize_5d; - struct pthreadpool_5d_tile_1d_params parallelize_5d_tile_1d; - struct pthreadpool_5d_tile_2d_params parallelize_5d_tile_2d; - struct pthreadpool_6d_params parallelize_6d; - struct pthreadpool_6d_tile_1d_params parallelize_6d_tile_1d; - struct pthreadpool_6d_tile_2d_params parallelize_6d_tile_2d; - } params; - /** - * Copy of the flags passed to a parallelization function. - */ - pthreadpool_atomic_uint32_t flags; + /** + * The entry point function to call for each thread in the thread pool for + * parallelization tasks. + */ + pthreadpool_atomic_void_p thread_function; + /** + * The function to call for each item. + */ + pthreadpool_atomic_void_p task; + /** + * The first argument to the item processing function. + */ + pthreadpool_atomic_void_p argument; + /** + * Additional parallelization parameters. 
+ * These parameters are specific for each thread_function. + */ + union { + struct pthreadpool_1d_with_uarch_params parallelize_1d_with_uarch; + struct pthreadpool_1d_tile_1d_params parallelize_1d_tile_1d; + struct pthreadpool_1d_tile_1d_dynamic_params parallelize_1d_tile_1d_dynamic; + struct pthreadpool_2d_params parallelize_2d; + struct pthreadpool_2d_tile_1d_params parallelize_2d_tile_1d; + struct pthreadpool_2d_tile_1d_with_uarch_params + parallelize_2d_tile_1d_with_uarch; + struct pthreadpool_2d_tile_1d_dynamic_params parallelize_2d_tile_1d_dynamic; + struct pthreadpool_2d_tile_2d_params parallelize_2d_tile_2d; + struct pthreadpool_2d_tile_2d_with_uarch_params + parallelize_2d_tile_2d_with_uarch; + struct pthreadpool_2d_tile_2d_dynamic_params parallelize_2d_tile_2d_dynamic; + struct pthreadpool_2d_tile_2d_dynamic_with_uarch_params + parallelize_2d_tile_2d_dynamic_with_uarch; + struct pthreadpool_3d_params parallelize_3d; + struct pthreadpool_3d_tile_1d_params parallelize_3d_tile_1d; + struct pthreadpool_3d_tile_1d_with_uarch_params + parallelize_3d_tile_1d_with_uarch; + struct pthreadpool_3d_tile_2d_params parallelize_3d_tile_2d; + struct pthreadpool_3d_tile_2d_with_uarch_params + parallelize_3d_tile_2d_with_uarch; + struct pthreadpool_3d_tile_2d_dynamic_params parallelize_3d_tile_2d_dynamic; + struct pthreadpool_3d_tile_2d_dynamic_with_uarch_params + parallelize_3d_tile_2d_dynamic_with_uarch; + struct pthreadpool_4d_params parallelize_4d; + struct pthreadpool_4d_tile_1d_params parallelize_4d_tile_1d; + struct pthreadpool_4d_tile_2d_params parallelize_4d_tile_2d; + struct pthreadpool_4d_tile_2d_with_uarch_params + parallelize_4d_tile_2d_with_uarch; + struct pthreadpool_4d_tile_2d_dynamic_params parallelize_4d_tile_2d_dynamic; + struct pthreadpool_4d_tile_2d_dynamic_with_uarch_params + parallelize_4d_tile_2d_dynamic_with_uarch; + struct pthreadpool_5d_params parallelize_5d; + struct pthreadpool_5d_tile_1d_params parallelize_5d_tile_1d; + struct 
pthreadpool_5d_tile_2d_params parallelize_5d_tile_2d; + struct pthreadpool_6d_params parallelize_6d; + struct pthreadpool_6d_tile_1d_params parallelize_6d_tile_1d; + struct pthreadpool_6d_tile_2d_params parallelize_6d_tile_2d; + } params; + /** + * Copy of the flags passed to a parallelization function. + */ + pthreadpool_atomic_uint32_t flags; #if PTHREADPOOL_USE_CONDVAR || PTHREADPOOL_USE_FUTEX - /** - * Serializes concurrent calls to @a pthreadpool_parallelize_* from different threads. - */ - pthread_mutex_t execution_mutex; + /** + * Serializes concurrent calls to @a pthreadpool_parallelize_* from different + * threads. + */ + pthread_mutex_t execution_mutex; #endif #if PTHREADPOOL_USE_GCD - /** - * Serializes concurrent calls to @a pthreadpool_parallelize_* from different threads. - */ - dispatch_semaphore_t execution_semaphore; + /** + * Serializes concurrent calls to @a pthreadpool_parallelize_* from different + * threads. + */ + dispatch_semaphore_t execution_semaphore; #endif #if PTHREADPOOL_USE_EVENT - /** - * Serializes concurrent calls to @a pthreadpool_parallelize_* from different threads. - */ - HANDLE execution_mutex; + /** + * Serializes concurrent calls to @a pthreadpool_parallelize_* from different + * threads. + */ + HANDLE execution_mutex; #endif #if PTHREADPOOL_USE_CONDVAR - /** - * Guards access to the @a active_threads variable. - */ - pthread_mutex_t completion_mutex; - /** - * Condition variable to wait until all threads complete an operation (until @a active_threads is zero). - */ - pthread_cond_t completion_condvar; - /** - * Guards access to the @a command variable. - */ - pthread_mutex_t command_mutex; - /** - * Condition variable to wait for change of the @a command variable. - */ - pthread_cond_t command_condvar; + /** + * Guards access to the @a active_threads variable. + */ + pthread_mutex_t completion_mutex; + /** + * Condition variable to wait until all threads complete an operation (until + * @a active_threads is zero). 
+ */ + pthread_cond_t completion_condvar; + /** + * Guards access to the @a command variable. + */ + pthread_mutex_t command_mutex; + /** + * Condition variable to wait for change of the @a command variable. + */ + pthread_cond_t command_condvar; #endif #if PTHREADPOOL_USE_EVENT - /** - * Events to wait on until all threads complete an operation (until @a active_threads is zero). - * To avoid race conditions due to spin-lock synchronization, we use two events and switch event in use after every - * submitted command according to the high bit of the command word. - */ - HANDLE completion_event[2]; - /** - * Events to wait on for change of the @a command variable. - * To avoid race conditions due to spin-lock synchronization, we use two events and switch event in use after every - * submitted command according to the high bit of the command word. - */ - HANDLE command_event[2]; + /** + * Events to wait on until all threads complete an operation (until @a + * active_threads is zero). To avoid race conditions due to spin-lock + * synchronization, we use two events and switch event in use after every + * submitted command according to the high bit of the command word. + */ + HANDLE completion_event[2]; + /** + * Events to wait on for change of the @a command variable. + * To avoid race conditions due to spin-lock synchronization, we use two + * events and switch event in use after every submitted command according to + * the high bit of the command word. + */ + HANDLE command_event[2]; #endif - /** - * FXdiv divisor for the number of threads in the thread pool. - * This struct never change after pthreadpool_create. - */ - struct fxdiv_divisor_size_t threads_count; - /** - * Thread information structures that immediately follow this structure. - */ - struct thread_info threads[]; + /** + * FXdiv divisor for the number of threads in the thread pool. + * This struct never changes after pthreadpool_create. 
+ */ + struct fxdiv_divisor_size_t threads_count; + /** + * Thread information structures that immediately follow this structure. + */ + struct thread_info threads[]; }; -PTHREADPOOL_STATIC_ASSERT(sizeof(struct pthreadpool) % PTHREADPOOL_CACHELINE_SIZE == 0, - "pthreadpool structure must occupy an integer number of cache lines (64 bytes)"); +PTHREADPOOL_STATIC_ASSERT(sizeof(struct pthreadpool) % + PTHREADPOOL_CACHELINE_SIZE == + 0, + "pthreadpool structure must occupy an integer number " + "of cache lines (64 bytes)"); PTHREADPOOL_INTERNAL struct pthreadpool* pthreadpool_allocate( - size_t threads_count); + size_t threads_count); PTHREADPOOL_INTERNAL void pthreadpool_deallocate( - struct pthreadpool* threadpool); + struct pthreadpool* threadpool); -typedef void (*thread_function_t)(struct pthreadpool* threadpool, struct thread_info* thread); +typedef void (*thread_function_t)(struct pthreadpool* threadpool, + struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_parallelize( - struct pthreadpool* threadpool, - thread_function_t thread_function, - const void* params, - size_t params_size, - void* task, - void* context, - size_t linear_range, - uint32_t flags); + struct pthreadpool* threadpool, thread_function_t thread_function, + const void* params, size_t params_size, void* task, void* context, + size_t linear_range, uint32_t flags); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_with_thread_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_1d_with_thread_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_with_uarch_fastpath( - struct pthreadpool* threadpool, - struct thread_info* 
thread); + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_tile_1d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_with_thread_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_2d_with_thread_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_1d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_1d_with_uarch_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_2d_tile_1d_with_uarch_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread); -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_1d_with_uarch_with_thread_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_2d_tile_1d_with_uarch_with_thread_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); 
+PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_1d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_1d_with_thread_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_3d_tile_1d_with_thread_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread); -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_1d_with_uarch_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_3d_tile_1d_with_uarch_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread); -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_1d_with_uarch_with_thread_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_3d_tile_1d_with_uarch_with_thread_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath( + struct pthreadpool* threadpool, 
struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_1d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); -PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); +PTHREADPOOL_INTERNAL void +pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath( + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_1d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_2d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_tile_1d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); PTHREADPOOL_INTERNAL void 
pthreadpool_thread_parallelize_6d_tile_2d_fastpath( - struct pthreadpool* threadpool, - struct thread_info* thread); + struct pthreadpool* threadpool, struct thread_info* thread); + +#endif // __PTHREADPOOL_SRC_THREADPOOL_OBJECT_H_ diff --git a/src/threadpool-utils.h b/src/threadpool-utils.h index 970210b..2e33d90 100644 --- a/src/threadpool-utils.h +++ b/src/threadpool-utils.h @@ -1,124 +1,149 @@ -#pragma once +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#ifndef __PTHREADPOOL_SRC_THREADPOOL_UTILS_H_ +#define __PTHREADPOOL_SRC_THREADPOOL_UTILS_H_ #include -#include #include +#include /* SSE-specific headers */ -#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) && !defined(_M_ARM64EC) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) - #include +#if defined(__SSE__) || defined(__x86_64__) || \ + defined(_M_X64) && !defined(_M_ARM64EC) || \ + (defined(_M_IX86_FP) && _M_IX86_FP >= 1) +#include #endif /* MSVC-specific headers */ #if defined(_MSC_VER) - #include +#include #endif - struct fpu_state { -#if defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) || defined(_MSC_VER) && defined(_M_ARM) - uint32_t fpscr; -#elif defined(__GNUC__) && defined(__aarch64__) || defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC)) - uint64_t fpcr; -#elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) - uint32_t mxcsr; +#if defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && \ + (__ARM_FP != 0) || \ + defined(_MSC_VER) && defined(_M_ARM) + uint32_t fpscr; +#elif defined(__GNUC__) && defined(__aarch64__) || \ + defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC)) + uint64_t fpcr; +#elif defined(__SSE__) || 
defined(__x86_64__) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && _M_IX86_FP >= 1) + uint32_t mxcsr; #else - char unused; + char unused; #endif }; static inline struct fpu_state get_fpu_state() { - struct fpu_state state = { 0 }; + struct fpu_state state = {0}; #if defined(_MSC_VER) && defined(_M_ARM) - state.fpscr = (uint32_t) _MoveFromCoprocessor(10, 7, 1, 0, 0); + state.fpscr = (uint32_t)_MoveFromCoprocessor(10, 7, 1, 0, 0); #elif defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC)) - state.fpcr = (uint64_t) _ReadStatusReg(0x5A20); -#elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) - state.mxcsr = (uint32_t) _mm_getcsr(); -#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) - __asm__ __volatile__("VMRS %[fpscr], fpscr" : [fpscr] "=r" (state.fpscr)); + state.fpcr = (uint64_t)_ReadStatusReg(0x5A20); +#elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && _M_IX86_FP >= 1) + state.mxcsr = (uint32_t)_mm_getcsr(); +#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && \ + (__ARM_FP != 0) + __asm__ __volatile__("VMRS %[fpscr], fpscr" : [fpscr] "=r"(state.fpscr)); #elif defined(__GNUC__) && defined(__aarch64__) - __asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (state.fpcr)); + __asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r"(state.fpcr)); #endif - return state; + return state; } static inline void set_fpu_state(const struct fpu_state state) { #if defined(_MSC_VER) && defined(_M_ARM) - _MoveToCoprocessor((int) state.fpscr, 10, 7, 1, 0, 0); + _MoveToCoprocessor((int)state.fpscr, 10, 7, 1, 0, 0); #elif defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC)) - _WriteStatusReg(0x5A20, (__int64) state.fpcr); -#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) - __asm__ __volatile__("VMSR fpscr, %[fpscr]" : : [fpscr] "r" (state.fpscr)); + _WriteStatusReg(0x5A20, 
(__int64)state.fpcr); +#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && \ + (__ARM_FP != 0) + __asm__ __volatile__("VMSR fpscr, %[fpscr]" : : [fpscr] "r"(state.fpscr)); #elif defined(__GNUC__) && defined(__aarch64__) - __asm__ __volatile__("MSR fpcr, %[fpcr]" : : [fpcr] "r" (state.fpcr)); -#elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) - _mm_setcsr((unsigned int) state.mxcsr); + __asm__ __volatile__("MSR fpcr, %[fpcr]" : : [fpcr] "r"(state.fpcr)); +#elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && _M_IX86_FP >= 1) + _mm_setcsr((unsigned int)state.mxcsr); #endif } static inline void disable_fpu_denormals() { #if defined(_MSC_VER) && defined(_M_ARM) - int fpscr = _MoveFromCoprocessor(10, 7, 1, 0, 0); - fpscr |= 0x1000000; - _MoveToCoprocessor(fpscr, 10, 7, 1, 0, 0); + int fpscr = _MoveFromCoprocessor(10, 7, 1, 0, 0); + fpscr |= 0x1000000; + _MoveToCoprocessor(fpscr, 10, 7, 1, 0, 0); #elif defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC)) - __int64 fpcr = _ReadStatusReg(0x5A20); - fpcr |= 0x1080000; - _WriteStatusReg(0x5A20, fpcr); -#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) - uint32_t fpscr; - #if defined(__thumb__) && !defined(__thumb2__) - __asm__ __volatile__( - "VMRS %[fpscr], fpscr\n" - "ORRS %[fpscr], %[bitmask]\n" - "VMSR fpscr, %[fpscr]\n" - : [fpscr] "=l" (fpscr) - : [bitmask] "l" (0x1000000) - : "cc"); - #else - __asm__ __volatile__( - "VMRS %[fpscr], fpscr\n" - "ORR %[fpscr], #0x1000000\n" - "VMSR fpscr, %[fpscr]\n" - : [fpscr] "=r" (fpscr)); - #endif + __int64 fpcr = _ReadStatusReg(0x5A20); + fpcr |= 0x1080000; + _WriteStatusReg(0x5A20, fpcr); +#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && \ + (__ARM_FP != 0) + uint32_t fpscr; +#if defined(__thumb__) && !defined(__thumb2__) + __asm__ __volatile__( + "VMRS %[fpscr], fpscr\n" + "ORRS %[fpscr], %[bitmask]\n" 
+ "VMSR fpscr, %[fpscr]\n" + : [fpscr] "=l"(fpscr) + : [bitmask] "l"(0x1000000) + : "cc"); +#else + __asm__ __volatile__( + "VMRS %[fpscr], fpscr\n" + "ORR %[fpscr], #0x1000000\n" + "VMSR fpscr, %[fpscr]\n" + : [fpscr] "=r"(fpscr)); +#endif #elif defined(__GNUC__) && defined(__aarch64__) - uint64_t fpcr; - __asm__ __volatile__( - "MRS %[fpcr], fpcr\n" - "ORR %w[fpcr], %w[fpcr], 0x1000000\n" - "ORR %w[fpcr], %w[fpcr], 0x80000\n" - "MSR fpcr, %[fpcr]\n" - : [fpcr] "=r" (fpcr)); -#elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) - _mm_setcsr(_mm_getcsr() | 0x8040); + uint64_t fpcr; + __asm__ __volatile__( + "MRS %[fpcr], fpcr\n" + "ORR %w[fpcr], %w[fpcr], 0x1000000\n" + "ORR %w[fpcr], %w[fpcr], 0x80000\n" + "MSR fpcr, %[fpcr]\n" + : [fpcr] "=r"(fpcr)); +#elif defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && _M_IX86_FP >= 1) + _mm_setcsr(_mm_getcsr() | 0x8040); #endif } static inline size_t modulo_decrement(size_t i, size_t n) { - /* Wrap modulo n, if needed */ - if (i == 0) { - i = n; - } - /* Decrement input variable */ - return i - 1; + /* Wrap modulo n, if needed */ + if (i == 0) { + i = n; + } + /* Decrement input variable */ + return i - 1; } static inline size_t divide_round_up(size_t dividend, size_t divisor) { - assert(divisor != 0); - if (dividend % divisor == 0) { - return dividend / divisor; - } else { - return dividend / divisor + 1; - } + assert(divisor != 0); + if (dividend % divisor == 0) { + return dividend / divisor; + } else { + return dividend / divisor + 1; + } } /* Windows headers define min and max macros; undefine it here */ #ifdef min - #undef min +#undef min +#endif +#ifdef max +#undef max #endif -static inline size_t min(size_t a, size_t b) { - return a < b ? a : b; -} +static inline size_t min(size_t a, size_t b) { return a < b ? a : b; } +static inline size_t max(size_t a, size_t b) { return a > b ? 
a : b; } + +#endif // __PTHREADPOOL_SRC_THREADPOOL_UTILS_H_ diff --git a/src/windows.c b/src/windows.c index 9c141b6..d3e9575 100644 --- a/src/windows.c +++ b/src/windows.c @@ -1,3 +1,12 @@ +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + /* Standard C headers */ #include #include @@ -6,6 +15,7 @@ #include /* Configuration header */ +#include #include "threadpool-common.h" /* Windows headers */ @@ -22,346 +32,361 @@ #include "threadpool-object.h" #include "threadpool-utils.h" - -static void checkin_worker_thread(struct pthreadpool* threadpool, uint32_t event_index) { - if (pthreadpool_decrement_fetch_acquire_release_size_t(&threadpool->active_threads) == 0) { - SetEvent(threadpool->completion_event[event_index]); - } +static void checkin_worker_thread(struct pthreadpool* threadpool, + uint32_t event_index) { + if (pthreadpool_decrement_fetch_acquire_release_size_t( + &threadpool->active_threads) == 0) { + SetEvent(threadpool->completion_event[event_index]); + } } -static void wait_worker_threads(struct pthreadpool* threadpool, uint32_t event_index) { - /* Initial check */ - size_t active_threads = pthreadpool_load_acquire_size_t(&threadpool->active_threads); - if (active_threads == 0) { - return; - } - - /* Spin-wait */ - for (uint32_t i = PTHREADPOOL_SPIN_WAIT_ITERATIONS; i != 0; i--) { - pthreadpool_yield(); - - active_threads = pthreadpool_load_acquire_size_t(&threadpool->active_threads); - if (active_threads == 0) { - return; - } - } - - /* Fall-back to event wait */ - const DWORD wait_status = WaitForSingleObject(threadpool->completion_event[event_index], INFINITE); - assert(wait_status == WAIT_OBJECT_0); - assert(pthreadpool_load_relaxed_size_t(&threadpool->active_threads) == 0); +static void 
wait_worker_threads(struct pthreadpool* threadpool, + uint32_t event_index) { + /* Initial check */ + size_t active_threads = + pthreadpool_load_acquire_size_t(&threadpool->active_threads); + if (active_threads == 0) { + return; + } + + /* Spin-wait */ + for (uint32_t i = 0; i < PTHREADPOOL_SPIN_WAIT_ITERATIONS; i++) { + pthreadpool_yield(i); + + active_threads = + pthreadpool_load_acquire_size_t(&threadpool->active_threads); + if (active_threads == 0) { + return; + } + } + + /* Fall-back to event wait */ + const DWORD wait_status = + WaitForSingleObject(threadpool->completion_event[event_index], INFINITE); + assert(wait_status == WAIT_OBJECT_0); + assert(pthreadpool_load_relaxed_size_t(&threadpool->active_threads) == 0); } -static uint32_t wait_for_new_command( - struct pthreadpool* threadpool, - uint32_t last_command, - uint32_t last_flags) -{ - uint32_t command = pthreadpool_load_acquire_uint32_t(&threadpool->command); - if (command != last_command) { - return command; - } - - if ((last_flags & PTHREADPOOL_FLAG_YIELD_WORKERS) == 0) { - /* Spin-wait loop */ - for (uint32_t i = PTHREADPOOL_SPIN_WAIT_ITERATIONS; i != 0; i--) { - pthreadpool_yield(); - - command = pthreadpool_load_acquire_uint32_t(&threadpool->command); - if (command != last_command) { - return command; - } - } - } - - /* Spin-wait disabled or timed out, fall back to event wait */ - const uint32_t event_index = (last_command >> 31); - const DWORD wait_status = WaitForSingleObject(threadpool->command_event[event_index], INFINITE); - assert(wait_status == WAIT_OBJECT_0); - - command = pthreadpool_load_relaxed_uint32_t(&threadpool->command); - assert(command != last_command); - return command; +static uint32_t wait_for_new_command(struct pthreadpool* threadpool, + uint32_t last_command, + uint32_t last_flags) { + uint32_t command = pthreadpool_load_acquire_uint32_t(&threadpool->command); + if (command != last_command) { + return command; + } + + if ((last_flags & PTHREADPOOL_FLAG_YIELD_WORKERS) == 0) { 
+ /* Spin-wait loop */ + for (uint32_t i = 0; i < PTHREADPOOL_SPIN_WAIT_ITERATIONS; i++) { + pthreadpool_yield(i); + + command = pthreadpool_load_acquire_uint32_t(&threadpool->command); + if (command != last_command) { + return command; + } + } + } + + /* Spin-wait disabled or timed out, fall back to event wait */ + const uint32_t event_index = (last_command >> 31); + const DWORD wait_status = + WaitForSingleObject(threadpool->command_event[event_index], INFINITE); + assert(wait_status == WAIT_OBJECT_0); + + command = pthreadpool_load_relaxed_uint32_t(&threadpool->command); + assert(command != last_command); + return command; } static DWORD WINAPI thread_main(LPVOID arg) { - struct thread_info* thread = (struct thread_info*) arg; - struct pthreadpool* threadpool = thread->threadpool; - uint32_t last_command = threadpool_command_init; - struct fpu_state saved_fpu_state = { 0 }; - uint32_t flags = 0; - - /* Check in */ - checkin_worker_thread(threadpool, 0); - - /* Monitor new commands and act accordingly */ - for (;;) { - uint32_t command = wait_for_new_command(threadpool, last_command, flags); - pthreadpool_fence_acquire(); - - flags = pthreadpool_load_relaxed_uint32_t(&threadpool->flags); - - /* Process command */ - switch (command & THREADPOOL_COMMAND_MASK) { - case threadpool_command_parallelize: - { - const thread_function_t thread_function = - (thread_function_t) pthreadpool_load_relaxed_void_p(&threadpool->thread_function); - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - - thread_function(threadpool, thread); - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - break; - } - case threadpool_command_shutdown: - /* Exit immediately: the master thread is waiting on pthread_join */ - return 0; - case threadpool_command_init: - /* To inhibit compiler warning */ - break; - } - /* Notify the master thread that we finished processing */ - const uint32_t 
event_index = command >> 31; - checkin_worker_thread(threadpool, event_index); - /* Update last command */ - last_command = command; - }; - return 0; + struct thread_info* thread = (struct thread_info*)arg; + struct pthreadpool* threadpool = thread->threadpool; + uint32_t last_command = threadpool_command_init; + struct fpu_state saved_fpu_state = {0}; + uint32_t flags = 0; + + /* Check in */ + checkin_worker_thread(threadpool, 0); + + /* Monitor new commands and act accordingly */ + for (;;) { + uint32_t command = wait_for_new_command(threadpool, last_command, flags); + pthreadpool_fence_acquire(); + + flags = pthreadpool_load_relaxed_uint32_t(&threadpool->flags); + + /* Process command */ + switch (command & THREADPOOL_COMMAND_MASK) { + case threadpool_command_parallelize: { + const thread_function_t thread_function = + (thread_function_t)pthreadpool_load_relaxed_void_p( + &threadpool->thread_function); + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + + thread_function(threadpool, thread); + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + break; + } + case threadpool_command_shutdown: + /* Exit immediately: the master thread is waiting on pthread_join */ + return 0; + case threadpool_command_init: + /* To inhibit compiler warning */ + break; + } + /* Notify the master thread that we finished processing */ + const uint32_t event_index = command >> 31; + checkin_worker_thread(threadpool, event_index); + /* Update last command */ + last_command = command; + }; + return 0; } struct pthreadpool* pthreadpool_create(size_t threads_count) { - if (threads_count == 0) { - SYSTEM_INFO system_info; - ZeroMemory(&system_info, sizeof(system_info)); - GetSystemInfo(&system_info); - threads_count = (size_t) system_info.dwNumberOfProcessors; - } - - struct pthreadpool* threadpool = pthreadpool_allocate(threads_count); - if (threadpool == NULL) { - return NULL; - } - 
threadpool->threads_count = fxdiv_init_size_t(threads_count); - for (size_t tid = 0; tid < threads_count; tid++) { - threadpool->threads[tid].thread_number = tid; - threadpool->threads[tid].threadpool = threadpool; - } - - /* Thread pool with a single thread computes everything on the caller thread. */ - if (threads_count > 1) { - threadpool->execution_mutex = CreateMutexW( - NULL /* mutex attributes */, - FALSE /* initially owned */, - NULL /* name */); - for (size_t i = 0; i < 2; i++) { - threadpool->completion_event[i] = CreateEventW( - NULL /* event attributes */, - TRUE /* manual-reset event: yes */, - FALSE /* initial state: nonsignaled */, - NULL /* name */); - threadpool->command_event[i] = CreateEventW( - NULL /* event attributes */, - TRUE /* manual-reset event: yes */, - FALSE /* initial state: nonsignaled */, - NULL /* name */); - } - - pthreadpool_store_relaxed_size_t(&threadpool->active_threads, threads_count - 1 /* caller thread */); - - /* Caller thread serves as worker #0. Thus, we create system threads starting with worker #1. 
*/ - for (size_t tid = 1; tid < threads_count; tid++) { - threadpool->threads[tid].thread_handle = CreateThread( - NULL /* thread attributes */, - 0 /* stack size: default */, - &thread_main, - &threadpool->threads[tid], - 0 /* creation flags */, - NULL /* thread id */); - } - - /* Wait until all threads initialize */ - wait_worker_threads(threadpool, 0); - } - return threadpool; + if (threads_count == 0) { + SYSTEM_INFO system_info; + ZeroMemory(&system_info, sizeof(system_info)); + GetSystemInfo(&system_info); + threads_count = (size_t)system_info.dwNumberOfProcessors; + } + + struct pthreadpool* threadpool = pthreadpool_allocate(threads_count); + if (threadpool == NULL) { + return NULL; + } + threadpool->threads_count = fxdiv_init_size_t(threads_count); + for (size_t tid = 0; tid < threads_count; tid++) { + threadpool->threads[tid].thread_number = tid; + threadpool->threads[tid].threadpool = threadpool; + } + + /* Thread pool with a single thread computes everything on the caller thread. + */ + if (threads_count > 1) { + threadpool->execution_mutex = + CreateMutexW(NULL /* mutex attributes */, FALSE /* initially owned */, + NULL /* name */); + for (size_t i = 0; i < 2; i++) { + threadpool->completion_event[i] = CreateEventW( + NULL /* event attributes */, TRUE /* manual-reset event: yes */, + FALSE /* initial state: non-signaled */, NULL /* name */); + threadpool->command_event[i] = CreateEventW( + NULL /* event attributes */, TRUE /* manual-reset event: yes */, + FALSE /* initial state: non-signaled */, NULL /* name */); + } + + pthreadpool_store_relaxed_size_t(&threadpool->active_threads, + threads_count - 1 /* caller thread */); + + /* Caller thread serves as worker #0. Thus, we create system threads + * starting with worker #1. 
*/ + for (size_t tid = 1; tid < threads_count; tid++) { + threadpool->threads[tid].thread_handle = CreateThread( + NULL /* thread attributes */, 0 /* stack size: default */, + &thread_main, &threadpool->threads[tid], 0 /* creation flags */, + NULL /* thread id */); + } + + /* Wait until all threads initialize */ + wait_worker_threads(threadpool, 0); + } + return threadpool; } PTHREADPOOL_INTERNAL void pthreadpool_parallelize( - struct pthreadpool* threadpool, - thread_function_t thread_function, - const void* params, - size_t params_size, - void* task, - void* context, - size_t linear_range, - uint32_t flags) -{ - assert(threadpool != NULL); - assert(thread_function != NULL); - assert(task != NULL); - assert(linear_range > 1); - - /* Protect the global threadpool structures */ - const DWORD wait_status = WaitForSingleObject(threadpool->execution_mutex, INFINITE); - assert(wait_status == WAIT_OBJECT_0); - - /* Setup global arguments */ - pthreadpool_store_relaxed_void_p(&threadpool->thread_function, (void*) thread_function); - pthreadpool_store_relaxed_void_p(&threadpool->task, task); - pthreadpool_store_relaxed_void_p(&threadpool->argument, context); - pthreadpool_store_relaxed_uint32_t(&threadpool->flags, flags); - - const struct fxdiv_divisor_size_t threads_count = threadpool->threads_count; - pthreadpool_store_relaxed_size_t(&threadpool->active_threads, threads_count.value - 1 /* caller thread */); - - if (params_size != 0) { - CopyMemory(&threadpool->params, params, params_size); - pthreadpool_fence_release(); - } - - /* Spread the work between threads */ - const struct fxdiv_result_size_t range_params = fxdiv_divide_size_t(linear_range, threads_count); - size_t range_start = 0; - for (size_t tid = 0; tid < threads_count.value; tid++) { - struct thread_info* thread = &threadpool->threads[tid]; - const size_t range_length = range_params.quotient + (size_t) (tid < range_params.remainder); - const size_t range_end = range_start + range_length; - 
pthreadpool_store_relaxed_size_t(&thread->range_start, range_start); - pthreadpool_store_relaxed_size_t(&thread->range_end, range_end); - pthreadpool_store_relaxed_size_t(&thread->range_length, range_length); - - /* The next subrange starts where the previous ended */ - range_start = range_end; - } - - /* - * Update the threadpool command. - * Imporantly, do it after initializing command parameters (range, task, argument, flags) - * ~(threadpool->command | THREADPOOL_COMMAND_MASK) flips the bits not in command mask - * to ensure the unmasked command is different then the last command, because worker threads - * monitor for change in the unmasked command. - */ - const uint32_t old_command = pthreadpool_load_relaxed_uint32_t(&threadpool->command); - const uint32_t new_command = ~(old_command | THREADPOOL_COMMAND_MASK) | threadpool_command_parallelize; - - /* - * Reset the command event for the next command. - * It is important to reset the event before writing out the new command, because as soon as the worker threads - * observe the new command, they may process it and switch to waiting on the next command event. - * - * Note: the event is different from the command event signalled in this update. - */ - const uint32_t event_index = (old_command >> 31); - BOOL reset_event_status = ResetEvent(threadpool->command_event[event_index ^ 1]); - assert(reset_event_status != FALSE); - - /* - * Store the command with release semantics to guarantee that if a worker thread observes - * the new command value, it also observes the updated command parameters. - * - * Note: release semantics is necessary, because the workers might be waiting in a spin-loop - * rather than on the event object. - */ - pthreadpool_store_release_uint32_t(&threadpool->command, new_command); - - /* - * Signal the event to wake up the threads. - * Event in use must be switched after every submitted command to avoid race conditions. 
- * Choose the event based on the high bit of the command, which is flipped on every update. - */ - const BOOL set_event_status = SetEvent(threadpool->command_event[event_index]); - assert(set_event_status != FALSE); - - /* Save and modify FPU denormals control, if needed */ - struct fpu_state saved_fpu_state = { 0 }; - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - saved_fpu_state = get_fpu_state(); - disable_fpu_denormals(); - } - - /* Do computations as worker #0 */ - thread_function(threadpool, &threadpool->threads[0]); - - /* Restore FPU denormals control, if needed */ - if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { - set_fpu_state(saved_fpu_state); - } - - /* - * Wait until the threads finish computation - * Use the complementary event because it corresponds to the new command. - */ - wait_worker_threads(threadpool, event_index ^ 1); - - /* - * Reset the completion event for the next command. - * Note: the event is different from the one used for waiting in this update. - */ - reset_event_status = ResetEvent(threadpool->completion_event[event_index]); - assert(reset_event_status != FALSE); - - /* Make changes by other threads visible to this thread */ - pthreadpool_fence_acquire(); - - /* Unprotect the global threadpool structures */ - const BOOL release_mutex_status = ReleaseMutex(threadpool->execution_mutex); - assert(release_mutex_status != FALSE); + struct pthreadpool* threadpool, thread_function_t thread_function, + const void* params, size_t params_size, void* task, void* context, + size_t linear_range, uint32_t flags) { + assert(threadpool != NULL); + assert(thread_function != NULL); + assert(task != NULL); + assert(linear_range > 1); + + /* Protect the global threadpool structures */ + const DWORD wait_status = + WaitForSingleObject(threadpool->execution_mutex, INFINITE); + assert(wait_status == WAIT_OBJECT_0); + + /* Setup global arguments */ + pthreadpool_store_relaxed_void_p(&threadpool->thread_function, + (void*)thread_function); + 
pthreadpool_store_relaxed_void_p(&threadpool->task, task); + pthreadpool_store_relaxed_void_p(&threadpool->argument, context); + pthreadpool_store_relaxed_uint32_t(&threadpool->flags, flags); + + const struct fxdiv_divisor_size_t threads_count = threadpool->threads_count; + pthreadpool_store_relaxed_size_t(&threadpool->active_threads, + threads_count.value - 1 /* caller thread */); + + if (params_size != 0) { + CopyMemory(&threadpool->params, params, params_size); + pthreadpool_fence_release(); + } + + /* Spread the work between threads */ + const struct fxdiv_result_size_t range_params = + fxdiv_divide_size_t(linear_range, threads_count); + size_t range_start = 0; + for (size_t tid = 0; tid < threads_count.value; tid++) { + struct thread_info* thread = &threadpool->threads[tid]; + const size_t range_length = + range_params.quotient + (size_t)(tid < range_params.remainder); + const size_t range_end = range_start + range_length; + pthreadpool_store_relaxed_size_t(&thread->range_start, range_start); + pthreadpool_store_relaxed_size_t(&thread->range_end, range_end); + pthreadpool_store_relaxed_size_t(&thread->range_length, range_length); + + /* The next subrange starts where the previous ended */ + range_start = range_end; + } + + /* + * Update the threadpool command. + * Importantly, do it after initializing command parameters (range, task, + * argument, flags) + * ~(threadpool->command | THREADPOOL_COMMAND_MASK) flips the bits not in + * command mask to ensure the unmasked command is different than the last + * command, because worker threads monitor for change in the unmasked command. + */ + const uint32_t old_command = + pthreadpool_load_relaxed_uint32_t(&threadpool->command); + const uint32_t new_command = + ~(old_command | THREADPOOL_COMMAND_MASK) | threadpool_command_parallelize; + + /* + * Reset the command event for the next command. 
+ * It is important to reset the event before writing out the new command, + * because as soon as the worker threads observe the new command, they may + * process it and switch to waiting on the next command event. + * + * Note: the event is different from the command event signalled in this + * update. + */ + const uint32_t event_index = (old_command >> 31); + BOOL reset_event_status = + ResetEvent(threadpool->command_event[event_index ^ 1]); + assert(reset_event_status != FALSE); + + /* + * Store the command with release semantics to guarantee that if a worker + * thread observes the new command value, it also observes the updated command + * parameters. + * + * Note: release semantics is necessary, because the workers might be waiting + * in a spin-loop rather than on the event object. + */ + pthreadpool_store_release_uint32_t(&threadpool->command, new_command); + + /* + * Signal the event to wake up the threads. + * Event in use must be switched after every submitted command to avoid race + * conditions. Choose the event based on the high bit of the command, which is + * flipped on every update. + */ + const BOOL set_event_status = + SetEvent(threadpool->command_event[event_index]); + assert(set_event_status != FALSE); + + /* Save and modify FPU denormals control, if needed */ + struct fpu_state saved_fpu_state = {0}; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + + /* Do computations as worker #0 */ + thread_function(threadpool, &threadpool->threads[0]); + + /* Restore FPU denormals control, if needed */ + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + + /* + * Wait until the threads finish computation + * Use the complementary event because it corresponds to the new command. + */ + wait_worker_threads(threadpool, event_index ^ 1); + + /* + * Reset the completion event for the next command. 
+ * Note: the event is different from the one used for waiting in this update. + */ + reset_event_status = ResetEvent(threadpool->completion_event[event_index]); + assert(reset_event_status != FALSE); + + /* Make changes by other threads visible to this thread */ + pthreadpool_fence_acquire(); + + /* Unprotect the global threadpool structures */ + const BOOL release_mutex_status = ReleaseMutex(threadpool->execution_mutex); + assert(release_mutex_status != FALSE); } void pthreadpool_destroy(struct pthreadpool* threadpool) { - if (threadpool != NULL) { - const size_t threads_count = threadpool->threads_count.value; - if (threads_count > 1) { - pthreadpool_store_relaxed_size_t(&threadpool->active_threads, threads_count - 1 /* caller thread */); - - /* - * Store the command with release semantics to guarantee that if a worker thread observes - * the new command value, it also observes the updated active_threads values. - */ - const uint32_t old_command = pthreadpool_load_relaxed_uint32_t(&threadpool->command); - pthreadpool_store_release_uint32_t(&threadpool->command, threadpool_command_shutdown); - - /* - * Signal the event to wake up the threads. - * Event in use must be switched after every submitted command to avoid race conditions. - * Choose the event based on the high bit of the command, which is flipped on every update. 
- */ - const uint32_t event_index = (old_command >> 31); - const BOOL set_event_status = SetEvent(threadpool->command_event[event_index]); - assert(set_event_status != FALSE); - - /* Wait until all threads return */ - for (size_t tid = 1; tid < threads_count; tid++) { - const HANDLE thread_handle = threadpool->threads[tid].thread_handle; - if (thread_handle != NULL) { - const DWORD wait_status = WaitForSingleObject(thread_handle, INFINITE); - assert(wait_status == WAIT_OBJECT_0); - - const BOOL close_status = CloseHandle(thread_handle); - assert(close_status != FALSE); - } - } - - /* Release resources */ - if (threadpool->execution_mutex != NULL) { - const BOOL close_status = CloseHandle(threadpool->execution_mutex); - assert(close_status != FALSE); - } - for (size_t i = 0; i < 2; i++) { - if (threadpool->command_event[i] != NULL) { - const BOOL close_status = CloseHandle(threadpool->command_event[i]); - assert(close_status != FALSE); - } - if (threadpool->completion_event[i] != NULL) { - const BOOL close_status = CloseHandle(threadpool->completion_event[i]); - assert(close_status != FALSE); - } - } - } - pthreadpool_deallocate(threadpool); - } + if (threadpool != NULL) { + const size_t threads_count = threadpool->threads_count.value; + if (threads_count > 1) { + pthreadpool_store_relaxed_size_t(&threadpool->active_threads, + threads_count - 1 /* caller thread */); + + /* + * Store the command with release semantics to guarantee that if a worker + * thread observes the new command value, it also observes the updated + * active_threads values. + */ + const uint32_t old_command = + pthreadpool_load_relaxed_uint32_t(&threadpool->command); + pthreadpool_store_release_uint32_t(&threadpool->command, + threadpool_command_shutdown); + + /* + * Signal the event to wake up the threads. + * Event in use must be switched after every submitted command to avoid + * race conditions. Choose the event based on the high bit of the command, + * which is flipped on every update. 
+ */ + const uint32_t event_index = (old_command >> 31); + const BOOL set_event_status = + SetEvent(threadpool->command_event[event_index]); + assert(set_event_status != FALSE); + + /* Wait until all threads return */ + for (size_t tid = 1; tid < threads_count; tid++) { + const HANDLE thread_handle = threadpool->threads[tid].thread_handle; + if (thread_handle != NULL) { + const DWORD wait_status = + WaitForSingleObject(thread_handle, INFINITE); + assert(wait_status == WAIT_OBJECT_0); + + const BOOL close_status = CloseHandle(thread_handle); + assert(close_status != FALSE); + } + } + + /* Release resources */ + if (threadpool->execution_mutex != NULL) { + const BOOL close_status = CloseHandle(threadpool->execution_mutex); + assert(close_status != FALSE); + } + for (size_t i = 0; i < 2; i++) { + if (threadpool->command_event[i] != NULL) { + const BOOL close_status = CloseHandle(threadpool->command_event[i]); + assert(close_status != FALSE); + } + if (threadpool->completion_event[i] != NULL) { + const BOOL close_status = + CloseHandle(threadpool->completion_event[i]); + assert(close_status != FALSE); + } + } + } + pthreadpool_deallocate(threadpool); + } } diff --git a/test/pthreadpool-cxx.cc b/test/pthreadpool-cxx.cc index 2939fad..4943ca0 100644 --- a/test/pthreadpool-cxx.cc +++ b/test/pthreadpool-cxx.cc @@ -1,15 +1,23 @@ -#include - -#include +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
#include #include #include #include +#include +#include +#include -typedef std::unique_ptr auto_pthreadpool_t; - +typedef std::unique_ptr + auto_pthreadpool_t; const size_t kParallelize1DRange = 1223; const size_t kParallelize1DTile1DRange = 1303; @@ -90,1636 +98,2574 @@ const size_t kParallelize6DTile2DRangeN = 23; const size_t kParallelize6DTile2DTileM = 3; const size_t kParallelize6DTile2DTileN = 2; - TEST(Parallelize1D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d( - threadpool.get(), - [](size_t) { }, - kParallelize1DRange); + pthreadpool_parallelize_1d( + threadpool.get(), [](size_t) {}, kParallelize1DRange); } TEST(Parallelize1D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d( - threadpool.get(), - [](size_t i) { - EXPECT_LT(i, kParallelize1DRange); - }, - kParallelize1DRange); + pthreadpool_parallelize_1d( + threadpool.get(), [](size_t i) { EXPECT_LT(i, kParallelize1DRange); }, + kParallelize1DRange); } TEST(Parallelize1D, AllItemsProcessed) { - std::vector indicators(kParallelize1DRange); + std::vector indicators(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d( - threadpool.get(), - [&indicators](size_t i) { - indicators[i].store(true, std::memory_order_relaxed); - }, - kParallelize1DRange); + pthreadpool_parallelize_1d( + threadpool.get(), + [&indicators](size_t i) { + indicators[i].store(true, 
std::memory_order_relaxed); + }, + kParallelize1DRange); - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) - << "Element " << i << " not processed"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } } - TEST(Parallelize1D, EachItemProcessedOnce) { - std::vector counters(kParallelize1DRange); + std::vector counters(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d( - threadpool.get(), - [&counters](size_t i) { - counters[i].fetch_add(1, std::memory_order_relaxed); - }, - kParallelize1DRange); + pthreadpool_parallelize_1d( + threadpool.get(), + [&counters](size_t i) { + counters[i].fetch_add(1, std::memory_order_relaxed); + }, + kParallelize1DRange); - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } } TEST(Parallelize1DTile1D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - [](size_t, size_t) { }, - kParallelize1DTile1DRange, kParallelize1DTile1DTile); + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), 
[](size_t, size_t) {}, kParallelize1DTile1DRange, + kParallelize1DTile1DTile); } TEST(Parallelize1DTile1D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - [](size_t start_i, size_t tile_i) { - EXPECT_LT(start_i, kParallelize1DTile1DRange); - EXPECT_LE(start_i + tile_i, kParallelize1DTile1DRange); - }, - kParallelize1DTile1DRange, kParallelize1DTile1DTile); + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + [](size_t start_i, size_t tile_i) { + EXPECT_LT(start_i, kParallelize1DTile1DRange); + EXPECT_LE(start_i + tile_i, kParallelize1DTile1DRange); + }, + kParallelize1DTile1DRange, kParallelize1DTile1DTile); } TEST(Parallelize1DTile1D, UniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - [](size_t start_i, size_t tile_i) { - EXPECT_GT(tile_i, 0); - EXPECT_LE(tile_i, kParallelize1DTile1DTile); - EXPECT_EQ(start_i % kParallelize1DTile1DTile, 0); - EXPECT_EQ(tile_i, std::min(kParallelize1DTile1DTile, kParallelize1DTile1DRange - start_i)); - }, - kParallelize1DTile1DRange, kParallelize1DTile1DTile); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + [](size_t start_i, size_t tile_i) { + EXPECT_GT(tile_i, 0); + EXPECT_LE(tile_i, kParallelize1DTile1DTile); + EXPECT_EQ(start_i % kParallelize1DTile1DTile, 0); + EXPECT_EQ(tile_i, + std::min(kParallelize1DTile1DTile, + kParallelize1DTile1DRange - start_i)); + }, + kParallelize1DTile1DRange, kParallelize1DTile1DTile); } TEST(Parallelize1DTile1D, AllItemsProcessed) { - std::vector indicators(kParallelize1DTile1DRange); + 
std::vector indicators(kParallelize1DTile1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - [&indicators](size_t start_i, size_t tile_i) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - indicators[i].store(true, std::memory_order_relaxed); - } - }, - kParallelize1DTile1DRange, kParallelize1DTile1DTile); + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + [&indicators](size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + indicators[i].store(true, std::memory_order_relaxed); + } + }, + kParallelize1DTile1DRange, kParallelize1DTile1DTile); - for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { - EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) - << "Element " << i << " not processed"; - } + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } } TEST(Parallelize1DTile1D, EachItemProcessedOnce) { - std::vector counters(kParallelize1DTile1DRange); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - [&counters](size_t start_i, size_t tile_i) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - counters[i].fetch_add(1, std::memory_order_relaxed); - } - }, - kParallelize1DTile1DRange, kParallelize1DTile1DTile); - - for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)"; - } + std::vector counters(kParallelize1DTile1DRange); + + auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + [&counters](size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + counters[i].fetch_add(1, std::memory_order_relaxed); + } + }, + kParallelize1DTile1DRange, kParallelize1DTile1DTile); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } +} + +TEST(Parallelize1DTile1DDynamic, ThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), [](size_t, size_t) {}, kParallelize1DTile1DRange, + kParallelize1DTile1DTile); +} + +TEST(Parallelize1DTile1DDynamic, AllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), + [](size_t start_i, size_t tile_i) { + EXPECT_GE(start_i, 0); + EXPECT_GT(tile_i, 0); + EXPECT_LT(start_i, kParallelize1DTile1DRange); + EXPECT_EQ(start_i % kParallelize1DTile1DTile, 0); + EXPECT_LE(start_i + tile_i, kParallelize1DTile1DRange); + }, + kParallelize1DTile1DRange, kParallelize1DTile1DTile); +} + +TEST(Parallelize1DTile1DDynamic, AllItemsProcessed) { + std::vector indicators(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), + [&indicators](size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + indicators[i].store(true, std::memory_order_relaxed); + } + }, + kParallelize1DTile1DRange, kParallelize1DTile1DTile); 
+ + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } +} + +TEST(Parallelize1DTile1DDynamic, EachItemProcessedOnce) { + std::vector counters(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), + [&counters](size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + counters[i].fetch_add(1, std::memory_order_relaxed); + } + }, + kParallelize1DTile1DRange, kParallelize1DTile1DTile); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } } TEST(Parallelize2D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d( - threadpool.get(), - [](size_t, size_t) { }, - kParallelize2DRangeI, kParallelize2DRangeJ); + pthreadpool_parallelize_2d( + threadpool.get(), [](size_t, size_t) {}, kParallelize2DRangeI, + kParallelize2DRangeJ); } TEST(Parallelize2D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d( - threadpool.get(), - [](size_t i, size_t j) { - EXPECT_LT(i, kParallelize2DRangeI); - EXPECT_LT(j, kParallelize2DRangeJ); - }, - kParallelize2DRangeI, kParallelize2DRangeJ); + pthreadpool_parallelize_2d( + threadpool.get(), + [](size_t i, size_t j) { + EXPECT_LT(i, 
kParallelize2DRangeI); + EXPECT_LT(j, kParallelize2DRangeJ); + }, + kParallelize2DRangeI, kParallelize2DRangeJ); } TEST(Parallelize2D, AllItemsProcessed) { - std::vector indicators(kParallelize2DRangeI * kParallelize2DRangeJ); + std::vector indicators(kParallelize2DRangeI * + kParallelize2DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d( - threadpool.get(), - [&indicators](size_t i, size_t j) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - indicators[linear_idx].store(true, std::memory_order_relaxed); - }, - kParallelize2DRangeI, kParallelize2DRangeJ); + pthreadpool_parallelize_2d( + threadpool.get(), + [&indicators](size_t i, size_t j) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + indicators[linear_idx].store(true, std::memory_order_relaxed); + }, + kParallelize2DRangeI, kParallelize2DRangeJ); - for (size_t i = 0; i < kParallelize2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } } TEST(Parallelize2D, EachItemProcessedOnce) { - std::vector counters(kParallelize2DRangeI * kParallelize2DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_2d( - threadpool.get(), - [&counters](size_t i, size_t j) { - const size_t linear_idx = i * 
kParallelize2DRangeJ + j; - counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - }, - kParallelize2DRangeI, kParallelize2DRangeJ); - - for (size_t i = 0; i < kParallelize2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } + std::vector counters(kParallelize2DRangeI * + kParallelize2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d( + threadpool.get(), + [&counters](size_t i, size_t j) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + }, + kParallelize2DRangeI, kParallelize2DRangeJ); + + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } } TEST(Parallelize2DTile1D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - [](size_t, size_t, size_t) { }, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ); + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), [](size_t, size_t, size_t) {}, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ); } 
TEST(Parallelize2DTile1D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - [](size_t i, size_t start_j, size_t tile_j) { - EXPECT_LT(i, kParallelize2DTile1DRangeI); - EXPECT_LT(start_j, kParallelize2DTile1DRangeJ); - EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ); - }, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ); + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + [](size_t i, size_t start_j, size_t tile_j) { + EXPECT_LT(i, kParallelize2DTile1DRangeI); + EXPECT_LT(start_j, kParallelize2DTile1DRangeJ); + EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ); + }, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ); } TEST(Parallelize2DTile1D, UniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - [](size_t i, size_t start_j, size_t tile_j) { - EXPECT_GT(tile_j, 0); - EXPECT_LE(tile_j, kParallelize2DTile1DTileJ); - EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0); - EXPECT_EQ(tile_j, std::min(kParallelize2DTile1DTileJ, kParallelize2DTile1DRangeJ - start_j)); - }, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + [](size_t i, size_t start_j, size_t tile_j) { + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize2DTile1DTileJ); + EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0); + EXPECT_EQ(tile_j, + std::min(kParallelize2DTile1DTileJ, + kParallelize2DTile1DRangeJ - start_j)); + }, + 
kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ); } TEST(Parallelize2DTile1D, AllItemsProcessed) { - std::vector indicators(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - [&indicators](size_t i, size_t start_j, size_t tile_j) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - indicators[linear_idx].store(true, std::memory_order_relaxed); - } - }, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ); - - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } + std::vector indicators(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + [&indicators](size_t i, size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + }, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } } TEST(Parallelize2DTile1D, 
EachItemProcessedOnce) { - std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - [&counters](size_t i, size_t start_j, size_t tile_j) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - }, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ); - - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + [&counters](size_t i, size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + }, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times 
(expected: 1)"; + } + } +} + +TEST(Parallelize2DTile1DDynamic, ThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d_dynamic( + threadpool.get(), [](size_t, size_t, size_t) {}, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ); +} + +TEST(Parallelize2DTile1DDynamic, AllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d_dynamic( + threadpool.get(), + [](size_t i, size_t start_j, size_t tile_j) { + EXPECT_GT(tile_j, 0); + EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0); + EXPECT_LT(i, kParallelize2DTile1DRangeI); + EXPECT_LT(start_j, kParallelize2DTile1DRangeJ); + EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ); + }, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ); +} + +TEST(Parallelize2DTile1DDynamic, AllItemsProcessed) { + std::vector indicators(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d_dynamic( + threadpool.get(), + [&indicators](size_t i, size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + }, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + 
+TEST(Parallelize2DTile1DDynamic, EachItemProcessedOnce) { + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d_dynamic( + threadpool.get(), + [&counters](size_t i, size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + }, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } } TEST(Parallelize2DTile2D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - [](size_t, size_t, size_t, size_t) { }, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), [](size_t, size_t, size_t, size_t) {}, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); } TEST(Parallelize2DTile2D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); 
+ ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - [](size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - EXPECT_LT(start_i, kParallelize2DTile2DRangeI); - EXPECT_LT(start_j, kParallelize2DTile2DRangeJ); - EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI); - EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ); - }, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + [](size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { + EXPECT_LT(start_i, kParallelize2DTile2DRangeI); + EXPECT_LT(start_j, kParallelize2DTile2DRangeJ); + EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI); + EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ); + }, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); } TEST(Parallelize2DTile2D, UniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - [](size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - EXPECT_GT(tile_i, 0); - EXPECT_LE(tile_i, kParallelize2DTile2DTileI); - EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0); - EXPECT_EQ(tile_i, std::min(kParallelize2DTile2DTileI, kParallelize2DTile2DRangeI - start_i)); - - EXPECT_GT(tile_j, 0); - EXPECT_LE(tile_j, kParallelize2DTile2DTileJ); - EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0); - EXPECT_EQ(tile_j, std::min(kParallelize2DTile2DTileJ, kParallelize2DTile2DRangeJ - start_j)); - }, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + [](size_t 
start_i, size_t start_j, size_t tile_i, size_t tile_j) { + EXPECT_GT(tile_i, 0); + EXPECT_LE(tile_i, kParallelize2DTile2DTileI); + EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0); + EXPECT_EQ(tile_i, + std::min(kParallelize2DTile2DTileI, + kParallelize2DTile2DRangeI - start_i)); + + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize2DTile2DTileJ); + EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0); + EXPECT_EQ(tile_j, + std::min(kParallelize2DTile2DTileJ, + kParallelize2DTile2DRangeJ - start_j)); + }, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); } TEST(Parallelize2DTile2D, AllItemsProcessed) { - std::vector indicators(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - [&indicators](size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - indicators[linear_idx].store(true, std::memory_order_relaxed); - } - } - }, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); - - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } + std::vector indicators(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + [&indicators](size_t start_i, size_t 
start_j, size_t tile_i, + size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } + }, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } } TEST(Parallelize2DTile2D, EachItemProcessedOnce) { - std::vector counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - [&counters](size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - } - }, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); - - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + [&counters](size_t start_i, size_t start_j, size_t tile_i, + size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } + }, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile2DDynamic, ThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), [](size_t, size_t, size_t, size_t) {}, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); +} + +TEST(Parallelize2DTile2DDynamic, AllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + [](size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { + EXPECT_GT(tile_i, 0); + EXPECT_GT(tile_j, 0); + EXPECT_LT(start_i, kParallelize2DTile2DRangeI); + EXPECT_LT(start_j, kParallelize2DTile2DRangeJ); + EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0); + EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0); + EXPECT_LE(start_i + tile_i, 
kParallelize2DTile2DRangeI); + EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ); + }, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); +} + +TEST(Parallelize2DTile2DDynamic, AllItemsProcessed) { + std::vector indicators(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + [&indicators](size_t start_i, size_t start_j, size_t tile_i, + size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } + }, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +TEST(Parallelize2DTile2DDynamic, EachItemProcessedOnce) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + [&counters](size_t start_i, size_t start_j, size_t tile_i, + size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } + }, + kParallelize2DTile2DRangeI, 
kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } } TEST(Parallelize3D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d( - threadpool.get(), - [](size_t, size_t, size_t) { }, - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK); + pthreadpool_parallelize_3d( + threadpool.get(), [](size_t, size_t, size_t) {}, kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK); } TEST(Parallelize3D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d( - threadpool.get(), - [](size_t i, size_t j, size_t k) { - EXPECT_LT(i, kParallelize3DRangeI); - EXPECT_LT(j, kParallelize3DRangeJ); - EXPECT_LT(k, kParallelize3DRangeK); - }, - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK); + pthreadpool_parallelize_3d( + threadpool.get(), + [](size_t i, size_t j, size_t k) { + EXPECT_LT(i, kParallelize3DRangeI); + EXPECT_LT(j, kParallelize3DRangeJ); + EXPECT_LT(k, kParallelize3DRangeK); + }, + kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK); } TEST(Parallelize3D, AllItemsProcessed) { - std::vector indicators(kParallelize3DRangeI * kParallelize3DRangeJ * 
kParallelize3DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d( - threadpool.get(), - [&indicators](size_t i, size_t j, size_t k) { - const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; - indicators[linear_idx].store(true, std::memory_order_relaxed); - }, - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK); - - for (size_t i = 0; i < kParallelize3DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } + std::vector indicators( + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d( + threadpool.get(), + [&indicators](size_t i, size_t j, size_t k) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + indicators[linear_idx].store(true, std::memory_order_relaxed); + }, + kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK); + + for (size_t i = 0; i < kParallelize3DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } } TEST(Parallelize3D, EachItemProcessedOnce) { - std::vector counters(kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); - - auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d( - threadpool.get(), - [&counters](size_t i, size_t j, size_t k) { - const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; - counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - }, - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK); - - for (size_t i = 0; i < kParallelize3DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } + std::vector counters( + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d( + threadpool.get(), + [&counters](size_t i, size_t j, size_t k) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + }, + kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK); + + for (size_t i = 0; i < kParallelize3DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } } TEST(Parallelize3DTile1D, ThreadPoolCompletes) { - auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - [](size_t, size_t, size_t, size_t) { }, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK); + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), [](size_t, size_t, size_t, size_t) {}, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK); } TEST(Parallelize3DTile1D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - [](size_t i, size_t j, size_t start_k, size_t tile_k) { - EXPECT_LT(i, kParallelize3DTile1DRangeI); - EXPECT_LT(j, kParallelize3DTile1DRangeJ); - EXPECT_LT(start_k, kParallelize3DTile1DRangeK); - EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK); - }, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK); + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + [](size_t i, size_t j, size_t start_k, size_t tile_k) { + EXPECT_LT(i, kParallelize3DTile1DRangeI); + EXPECT_LT(j, kParallelize3DTile1DRangeJ); + EXPECT_LT(start_k, kParallelize3DTile1DRangeK); + EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK); + }, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK); } TEST(Parallelize3DTile1D, UniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - [](size_t i, 
size_t j, size_t start_k, size_t tile_k) { - EXPECT_GT(tile_k, 0); - EXPECT_LE(tile_k, kParallelize3DTile1DTileK); - EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0); - EXPECT_EQ(tile_k, std::min(kParallelize3DTile1DTileK, kParallelize3DTile1DRangeK - start_k)); - }, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + [](size_t i, size_t j, size_t start_k, size_t tile_k) { + EXPECT_GT(tile_k, 0); + EXPECT_LE(tile_k, kParallelize3DTile1DTileK); + EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0); + EXPECT_EQ(tile_k, + std::min(kParallelize3DTile1DTileK, + kParallelize3DTile1DRangeK - start_k)); + }, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK); } TEST(Parallelize3DTile1D, AllItemsProcessed) { - std::vector indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - [&indicators](size_t i, size_t j, size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - indicators[linear_idx].store(true, std::memory_order_relaxed); - } - }, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - 
EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } + std::vector indicators(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + [&indicators](size_t i, size_t j, size_t start_k, size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * + kParallelize3DTile1DRangeK + + k; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + }, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } } TEST(Parallelize3DTile1D, EachItemProcessedOnce) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - [&counters](size_t i, size_t j, size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - }, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, 
kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + [&counters](size_t i, size_t j, size_t start_k, size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * + kParallelize3DTile1DRangeK + + k; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + }, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } } TEST(Parallelize3DTile2D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - [](size_t, size_t, size_t, size_t, size_t) { }, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK); + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), [](size_t, size_t, size_t, size_t, size_t) {}, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK); } TEST(Parallelize3DTile2D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - [](size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { - EXPECT_LT(i, kParallelize3DTile2DRangeI); - EXPECT_LT(start_j, kParallelize3DTile2DRangeJ); - EXPECT_LT(start_k, kParallelize3DTile2DRangeK); - EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ); - EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK); - }, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + [](size_t i, size_t start_j, size_t start_k, size_t tile_j, + size_t tile_k) { + EXPECT_LT(i, kParallelize3DTile2DRangeI); + EXPECT_LT(start_j, kParallelize3DTile2DRangeJ); + EXPECT_LT(start_k, kParallelize3DTile2DRangeK); + EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ); + EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK); + }, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK); } TEST(Parallelize3DTile2D, 
UniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - [](size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { - EXPECT_GT(tile_j, 0); - EXPECT_LE(tile_j, kParallelize3DTile2DTileJ); - EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0); - EXPECT_EQ(tile_j, std::min(kParallelize3DTile2DTileJ, kParallelize3DTile2DRangeJ - start_j)); - - EXPECT_GT(tile_k, 0); - EXPECT_LE(tile_k, kParallelize3DTile2DTileK); - EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0); - EXPECT_EQ(tile_k, std::min(kParallelize3DTile2DTileK, kParallelize3DTile2DRangeK - start_k)); - }, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + [](size_t i, size_t start_j, size_t start_k, size_t tile_j, + size_t tile_k) { + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize3DTile2DTileJ); + EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0); + EXPECT_EQ(tile_j, + std::min(kParallelize3DTile2DTileJ, + kParallelize3DTile2DRangeJ - start_j)); + + EXPECT_GT(tile_k, 0); + EXPECT_LE(tile_k, kParallelize3DTile2DTileK); + EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0); + EXPECT_EQ(tile_k, + std::min(kParallelize3DTile2DTileK, + kParallelize3DTile2DRangeK - start_k)); + }, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK); } TEST(Parallelize3DTile2D, AllItemsProcessed) { - std::vector indicators(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - 
pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - [&indicators](size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - indicators[linear_idx].store(true, std::memory_order_relaxed); - } - } - }, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK); - - for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } + std::vector indicators(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + [&indicators](size_t i, size_t start_j, size_t start_k, size_t tile_j, + size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * + kParallelize3DTile2DRangeK + + k; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } + }, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < 
kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } } TEST(Parallelize3DTile2D, EachItemProcessedOnce) { - std::vector counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - [&counters](size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - } - }, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK); - - for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + [&counters](size_t i, size_t start_j, size_t 
start_k, size_t tile_j, + size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * + kParallelize3DTile2DRangeK + + k; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } + }, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } +} + +TEST(Parallelize3DTile2DDynamic, ThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), [](size_t, size_t, size_t, size_t, size_t) {}, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK); +} + +TEST(Parallelize3DTile2DDynamic, AllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), + [](size_t i, size_t start_j, size_t start_k, size_t tile_j, + size_t tile_k) { + EXPECT_GT(tile_j, 0); + EXPECT_GT(tile_k, 0); + EXPECT_LT(i, kParallelize3DTile2DRangeI); + EXPECT_LT(start_j, kParallelize3DTile2DRangeJ); + EXPECT_LT(start_k, kParallelize3DTile2DRangeK); + EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0); + 
EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0); + EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ); + EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK); + }, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK); +} + +TEST(Parallelize3DTile2DDynamic, AllItemsProcessed) { + std::vector indicators(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), + [&indicators](size_t i, size_t start_j, size_t start_k, size_t tile_j, + size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * + kParallelize3DTile2DRangeK + + k; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } + }, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +TEST(Parallelize3DTile2DDynamic, EachItemProcessedOnce) { + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(2), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), + 
[&counters](size_t i, size_t start_j, size_t start_k, size_t tile_j, + size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * + kParallelize3DTile2DRangeK + + k; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } + }, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } } TEST(Parallelize4D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_4d( - threadpool.get(), - [](size_t, size_t, size_t, size_t) { }, - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL); + pthreadpool_parallelize_4d( + threadpool.get(), [](size_t, size_t, size_t, size_t) {}, + kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, + kParallelize4DRangeL); } TEST(Parallelize4D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_4d( - threadpool.get(), - 
[](size_t i, size_t j, size_t k, size_t l) { - EXPECT_LT(i, kParallelize4DRangeI); - EXPECT_LT(j, kParallelize4DRangeJ); - EXPECT_LT(k, kParallelize4DRangeK); - EXPECT_LT(l, kParallelize4DRangeL); - }, - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL); + pthreadpool_parallelize_4d( + threadpool.get(), + [](size_t i, size_t j, size_t k, size_t l) { + EXPECT_LT(i, kParallelize4DRangeI); + EXPECT_LT(j, kParallelize4DRangeJ); + EXPECT_LT(k, kParallelize4DRangeK); + EXPECT_LT(l, kParallelize4DRangeL); + }, + kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, + kParallelize4DRangeL); } TEST(Parallelize4D, AllItemsProcessed) { - std::vector indicators(kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d( - threadpool.get(), - [&indicators](size_t i, size_t j, size_t k, size_t l) { - const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l; - indicators[linear_idx].store(true, std::memory_order_relaxed); - }, - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL); - - for (size_t i = 0; i < kParallelize4DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed"; - } - } - } - } + std::vector indicators( + kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * + kParallelize4DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), 
pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d( + threadpool.get(), + [&indicators](size_t i, size_t j, size_t k, size_t l) { + const size_t linear_idx = + ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * + kParallelize4DRangeL + + l; + indicators[linear_idx].store(true, std::memory_order_relaxed); + }, + kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, + kParallelize4DRangeL); + + for (size_t i = 0; i < kParallelize4DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DRangeL; l++) { + const size_t linear_idx = + ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * + kParallelize4DRangeL + + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } } TEST(Parallelize4D, EachItemProcessedOnce) { - std::vector counters(kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d( - threadpool.get(), - [&counters](size_t i, size_t j, size_t k, size_t l) { - const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l; - counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - }, - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL); - - for (size_t i = 0; i < kParallelize4DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l; - 
EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } + std::vector counters( + kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * + kParallelize4DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d( + threadpool.get(), + [&counters](size_t i, size_t j, size_t k, size_t l) { + const size_t linear_idx = + ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * + kParallelize4DRangeL + + l; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + }, + kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, + kParallelize4DRangeL); + + for (size_t i = 0; i < kParallelize4DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DRangeL; l++) { + const size_t linear_idx = + ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * + kParallelize4DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } } TEST(Parallelize4DTile1D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - [](size_t, size_t, size_t, size_t, size_t) { }, - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL); + 
pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), [](size_t, size_t, size_t, size_t, size_t) {}, + kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, + kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, + kParallelize4DTile1DTileL); } TEST(Parallelize4DTile1D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - [](size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) { - EXPECT_LT(i, kParallelize4DTile1DRangeI); - EXPECT_LT(j, kParallelize4DTile1DRangeJ); - EXPECT_LT(k, kParallelize4DTile1DRangeK); - EXPECT_LT(start_l, kParallelize4DTile1DRangeL); - EXPECT_LE(start_l + tile_l, kParallelize4DTile1DRangeL); - }, - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), + [](size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) { + EXPECT_LT(i, kParallelize4DTile1DRangeI); + EXPECT_LT(j, kParallelize4DTile1DRangeJ); + EXPECT_LT(k, kParallelize4DTile1DRangeK); + EXPECT_LT(start_l, kParallelize4DTile1DRangeL); + EXPECT_LE(start_l + tile_l, kParallelize4DTile1DRangeL); + }, + kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, + kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, + kParallelize4DTile1DTileL); } TEST(Parallelize4DTile1D, UniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - [](size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) { - EXPECT_GT(tile_l, 0); - EXPECT_LE(tile_l, kParallelize4DTile1DTileL); - EXPECT_EQ(start_l % kParallelize4DTile1DTileL, 0); - EXPECT_EQ(tile_l, 
std::min(kParallelize4DTile1DTileL, kParallelize4DTile1DRangeL - start_l)); - }, - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), + [](size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) { + EXPECT_GT(tile_l, 0); + EXPECT_LE(tile_l, kParallelize4DTile1DTileL); + EXPECT_EQ(start_l % kParallelize4DTile1DTileL, 0); + EXPECT_EQ(tile_l, + std::min(kParallelize4DTile1DTileL, + kParallelize4DTile1DRangeL - start_l)); + }, + kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, + kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, + kParallelize4DTile1DTileL); } TEST(Parallelize4DTile1D, AllItemsProcessed) { - std::vector indicators(kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - [&indicators](size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l; - indicators[linear_idx].store(true, std::memory_order_relaxed); - } - }, - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL); - - for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * 
kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed"; - } - } - } - } + std::vector indicators( + kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * + kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), + [&indicators](size_t i, size_t j, size_t k, size_t start_l, + size_t tile_l) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * + kParallelize4DTile1DRangeK + + k) * + kParallelize4DTile1DRangeL + + l; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + }, + kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, + kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, + kParallelize4DTile1DTileL); + + for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * + kParallelize4DTile1DRangeK + + k) * + kParallelize4DTile1DRangeL + + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } } TEST(Parallelize4DTile1D, EachItemProcessedOnce) { - std::vector counters(kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - [&counters](size_t i, 
size_t j, size_t k, size_t start_l, size_t tile_l) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l; - counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - }, - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL); - - for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } + std::vector counters( + kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * + kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), + [&counters](size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * + kParallelize4DTile1DRangeK + + k) * + kParallelize4DTile1DRangeL + + l; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + }, + kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, + kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, + kParallelize4DTile1DTileL); + + for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { + for (size_t j = 0; j < 
kParallelize4DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * + kParallelize4DTile1DRangeK + + k) * + kParallelize4DTile1DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } } TEST(Parallelize4DTile2D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - [](size_t, size_t, size_t, size_t, size_t, size_t) { }, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL); + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), [](size_t, size_t, size_t, size_t, size_t, size_t) {}, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL); } TEST(Parallelize4DTile2D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - [](size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - EXPECT_LT(i, kParallelize4DTile2DRangeI); - EXPECT_LT(j, kParallelize4DTile2DRangeJ); - EXPECT_LT(start_k, kParallelize4DTile2DRangeK); - EXPECT_LT(start_l, kParallelize4DTile2DRangeL); - EXPECT_LE(start_k + tile_k, kParallelize4DTile2DRangeK); - EXPECT_LE(start_l + tile_l, 
kParallelize4DTile2DRangeL); - }, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + [](size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, + size_t tile_l) { + EXPECT_LT(i, kParallelize4DTile2DRangeI); + EXPECT_LT(j, kParallelize4DTile2DRangeJ); + EXPECT_LT(start_k, kParallelize4DTile2DRangeK); + EXPECT_LT(start_l, kParallelize4DTile2DRangeL); + EXPECT_LE(start_k + tile_k, kParallelize4DTile2DRangeK); + EXPECT_LE(start_l + tile_l, kParallelize4DTile2DRangeL); + }, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL); } TEST(Parallelize4DTile2D, UniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - [](size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - EXPECT_GT(tile_k, 0); - EXPECT_LE(tile_k, kParallelize4DTile2DTileK); - EXPECT_EQ(start_k % kParallelize4DTile2DTileK, 0); - EXPECT_EQ(tile_k, std::min(kParallelize4DTile2DTileK, kParallelize4DTile2DRangeK - start_k)); - - EXPECT_GT(tile_l, 0); - EXPECT_LE(tile_l, kParallelize4DTile2DTileL); - EXPECT_EQ(start_l % kParallelize4DTile2DTileL, 0); - EXPECT_EQ(tile_l, std::min(kParallelize4DTile2DTileL, kParallelize4DTile2DRangeL - start_l)); - }, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + 
pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + [](size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, + size_t tile_l) { + EXPECT_GT(tile_k, 0); + EXPECT_LE(tile_k, kParallelize4DTile2DTileK); + EXPECT_EQ(start_k % kParallelize4DTile2DTileK, 0); + EXPECT_EQ(tile_k, + std::min(kParallelize4DTile2DTileK, + kParallelize4DTile2DRangeK - start_k)); + + EXPECT_GT(tile_l, 0); + EXPECT_LE(tile_l, kParallelize4DTile2DTileL); + EXPECT_EQ(start_l % kParallelize4DTile2DTileL, 0); + EXPECT_EQ(tile_l, + std::min(kParallelize4DTile2DTileL, + kParallelize4DTile2DRangeL - start_l)); + }, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL); } TEST(Parallelize4DTile2D, AllItemsProcessed) { - std::vector indicators(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - [&indicators](size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - indicators[linear_idx].store(true, std::memory_order_relaxed); - } - } - }, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL); - - for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { - const size_t 
linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed"; - } - } - } - } + std::vector indicators( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + [&indicators](size_t i, size_t j, size_t start_k, size_t start_l, + size_t tile_k, size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } + }, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } } TEST(Parallelize4DTile2D, EachItemProcessedOnce) { - std::vector counters(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); - - auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - [&counters](size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - } - }, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL); - - for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + [&counters](size_t i, size_t j, size_t start_k, size_t start_l, + size_t tile_k, size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + 
k) * + kParallelize4DTile2DRangeL + + l; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } + }, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } +} + +TEST(Parallelize4DTile2DDynamic, ThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), [](size_t, size_t, size_t, size_t, size_t, size_t) {}, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL); +} + +TEST(Parallelize4DTile2DDynamic, AllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + [](size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, + size_t tile_l) { + EXPECT_LT(i, kParallelize4DTile2DRangeI); + EXPECT_LT(j, kParallelize4DTile2DRangeJ); + EXPECT_LT(start_k, kParallelize4DTile2DRangeK); + EXPECT_LT(start_l, kParallelize4DTile2DRangeL); + EXPECT_LE(start_k + tile_k, kParallelize4DTile2DRangeK); + EXPECT_LE(start_l + tile_l, 
kParallelize4DTile2DRangeL); + }, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL); +} + +TEST(Parallelize4DTile2DDynamic, UniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + [](size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, + size_t tile_l) { + EXPECT_GT(tile_k, 0); + EXPECT_EQ(start_k % kParallelize4DTile2DTileK, 0); + + EXPECT_GT(tile_l, 0); + EXPECT_EQ(start_l % kParallelize4DTile2DTileL, 0); + }, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL); +} + +TEST(Parallelize4DTile2DDynamic, AllItemsProcessed) { + std::vector indicators( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + [&indicators](size_t i, size_t j, size_t start_k, size_t start_l, + size_t tile_k, size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } + }, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < 
kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } +} + +TEST(Parallelize4DTile2DDynamic, EachItemProcessedOnce) { + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + [&counters](size_t i, size_t j, size_t start_k, size_t start_l, + size_t tile_k, size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } + }, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " 
times (expected: 1)"; + } + } + } + } } TEST(Parallelize5D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_5d( - threadpool.get(), - [](size_t, size_t, size_t, size_t, size_t) { }, - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM); + pthreadpool_parallelize_5d( + threadpool.get(), [](size_t, size_t, size_t, size_t, size_t) {}, + kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, + kParallelize5DRangeL, kParallelize5DRangeM); } TEST(Parallelize5D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d( - threadpool.get(), - [](size_t i, size_t j, size_t k, size_t l, size_t m) { - EXPECT_LT(i, kParallelize5DRangeI); - EXPECT_LT(j, kParallelize5DRangeJ); - EXPECT_LT(k, kParallelize5DRangeK); - EXPECT_LT(l, kParallelize5DRangeL); - EXPECT_LT(m, kParallelize5DRangeM); - }, - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d( + threadpool.get(), + [](size_t i, size_t j, size_t k, size_t l, size_t m) { + EXPECT_LT(i, kParallelize5DRangeI); + EXPECT_LT(j, kParallelize5DRangeJ); + EXPECT_LT(k, kParallelize5DRangeK); + EXPECT_LT(l, kParallelize5DRangeL); + EXPECT_LT(m, kParallelize5DRangeM); + }, + kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, + kParallelize5DRangeL, kParallelize5DRangeM); } TEST(Parallelize5D, AllItemsProcessed) { - std::vector indicators(kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * 
kParallelize5DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d( - threadpool.get(), - [&indicators](size_t i, size_t j, size_t k, size_t l, size_t m) { - const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m; - indicators[linear_idx].store(true, std::memory_order_relaxed); - }, - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM); - - for (size_t i = 0; i < kParallelize5DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed"; - } - } - } - } - } + std::vector indicators( + kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * + kParallelize5DRangeL * kParallelize5DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d( + threadpool.get(), + [&indicators](size_t i, size_t j, size_t k, size_t l, size_t m) { + const size_t linear_idx = + (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * + kParallelize5DRangeL + + l) * + kParallelize5DRangeM + + m; + indicators[linear_idx].store(true, std::memory_order_relaxed); + }, + kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, + kParallelize5DRangeL, kParallelize5DRangeM); + + for (size_t i = 0; i < kParallelize5DRangeI; i++) { + for (size_t j = 0; j < 
kParallelize5DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DRangeM; m++) { + const size_t linear_idx = + (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * + kParallelize5DRangeL + + l) * + kParallelize5DRangeM + + m; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") not processed"; + } + } + } + } + } } TEST(Parallelize5D, EachItemProcessedOnce) { - std::vector counters(kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d( - threadpool.get(), - [&counters](size_t i, size_t j, size_t k, size_t l, size_t m) { - const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m; - counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - }, - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM); - - for (size_t i = 0; i < kParallelize5DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } + std::vector counters( + 
kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * + kParallelize5DRangeL * kParallelize5DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d( + threadpool.get(), + [&counters](size_t i, size_t j, size_t k, size_t l, size_t m) { + const size_t linear_idx = + (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * + kParallelize5DRangeL + + l) * + kParallelize5DRangeM + + m; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + }, + kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, + kParallelize5DRangeL, kParallelize5DRangeM); + + for (size_t i = 0; i < kParallelize5DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DRangeM; m++) { + const size_t linear_idx = + (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * + kParallelize5DRangeL + + l) * + kParallelize5DRangeM + + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } } TEST(Parallelize5DTile1D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - [](size_t, size_t, size_t, size_t, size_t, size_t) { }, - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM); + pthreadpool_parallelize_5d_tile_1d( + 
threadpool.get(), [](size_t, size_t, size_t, size_t, size_t, size_t) {}, + kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, + kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, + kParallelize5DTile1DRangeM, kParallelize5DTile1DTileM); } TEST(Parallelize5DTile1D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - [](size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) { - EXPECT_LT(i, kParallelize5DTile1DRangeI); - EXPECT_LT(j, kParallelize5DTile1DRangeJ); - EXPECT_LT(k, kParallelize5DTile1DRangeK); - EXPECT_LT(l, kParallelize5DTile1DRangeL); - EXPECT_LT(start_m, kParallelize5DTile1DRangeM); - EXPECT_LE(start_m + tile_m, kParallelize5DTile1DRangeM); - }, - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), + [](size_t i, size_t j, size_t k, size_t l, size_t start_m, + size_t tile_m) { + EXPECT_LT(i, kParallelize5DTile1DRangeI); + EXPECT_LT(j, kParallelize5DTile1DRangeJ); + EXPECT_LT(k, kParallelize5DTile1DRangeK); + EXPECT_LT(l, kParallelize5DTile1DRangeL); + EXPECT_LT(start_m, kParallelize5DTile1DRangeM); + EXPECT_LE(start_m + tile_m, kParallelize5DTile1DRangeM); + }, + kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, + kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, + kParallelize5DTile1DRangeM, kParallelize5DTile1DTileM); } TEST(Parallelize5DTile1D, UniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - [](size_t i, size_t j, size_t k, size_t l, size_t start_m, 
size_t tile_m) { - EXPECT_GT(tile_m, 0); - EXPECT_LE(tile_m, kParallelize5DTile1DTileM); - EXPECT_EQ(start_m % kParallelize5DTile1DTileM, 0); - EXPECT_EQ(tile_m, std::min(kParallelize5DTile1DTileM, kParallelize5DTile1DRangeM - start_m)); - }, - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), + [](size_t i, size_t j, size_t k, size_t l, size_t start_m, + size_t tile_m) { + EXPECT_GT(tile_m, 0); + EXPECT_LE(tile_m, kParallelize5DTile1DTileM); + EXPECT_EQ(start_m % kParallelize5DTile1DTileM, 0); + EXPECT_EQ(tile_m, + std::min(kParallelize5DTile1DTileM, + kParallelize5DTile1DRangeM - start_m)); + }, + kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, + kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, + kParallelize5DTile1DRangeM, kParallelize5DTile1DTileM); } TEST(Parallelize5DTile1D, AllItemsProcessed) { - std::vector indicators(kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - [&indicators](size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) { - for (size_t m = start_m; m < start_m + tile_m; m++) { - const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m; - indicators[linear_idx].store(true, std::memory_order_relaxed); - } - }, - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - 
kParallelize5DTile1DTileM); - - for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed"; - } - } - } - } - } + std::vector indicators( + kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * + kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * + kParallelize5DTile1DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), + [&indicators](size_t i, size_t j, size_t k, size_t l, size_t start_m, + size_t tile_m) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * + kParallelize5DTile1DRangeK + + k) * + kParallelize5DTile1DRangeL + + l) * + kParallelize5DTile1DRangeM + + m; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + }, + kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, + kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, + kParallelize5DTile1DRangeM, kParallelize5DTile1DTileM); + + for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * + 
kParallelize5DTile1DRangeK + + k) * + kParallelize5DTile1DRangeL + + l) * + kParallelize5DTile1DRangeM + + m; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") not processed"; + } + } + } + } + } } TEST(Parallelize5DTile1D, EachItemProcessedOnce) { - std::vector counters(kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - [&counters](size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) { - for (size_t m = start_m; m < start_m + tile_m; m++) { - const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m; - counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - }, - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM); - - for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } + 
std::vector counters( + kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * + kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * + kParallelize5DTile1DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), + [&counters](size_t i, size_t j, size_t k, size_t l, size_t start_m, + size_t tile_m) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * + kParallelize5DTile1DRangeK + + k) * + kParallelize5DTile1DRangeL + + l) * + kParallelize5DTile1DRangeM + + m; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + }, + kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, + kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, + kParallelize5DTile1DRangeM, kParallelize5DTile1DTileM); + + for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * + kParallelize5DTile1DRangeK + + k) * + kParallelize5DTile1DRangeL + + l) * + kParallelize5DTile1DRangeM + + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } } TEST(Parallelize5DTile2D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_5d_tile_2d( - 
threadpool.get(), - [](size_t, size_t, size_t, size_t, size_t, size_t, size_t) { }, - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM); + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + [](size_t, size_t, size_t, size_t, size_t, size_t, size_t) {}, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, + kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, + kParallelize5DTile2DRangeM, kParallelize5DTile2DTileL, + kParallelize5DTile2DTileM); } TEST(Parallelize5DTile2D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - [](size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) { - EXPECT_LT(i, kParallelize5DTile2DRangeI); - EXPECT_LT(j, kParallelize5DTile2DRangeJ); - EXPECT_LT(k, kParallelize5DTile2DRangeK); - EXPECT_LT(start_l, kParallelize5DTile2DRangeL); - EXPECT_LT(start_m, kParallelize5DTile2DRangeM); - EXPECT_LE(start_l + tile_l, kParallelize5DTile2DRangeL); - EXPECT_LE(start_m + tile_m, kParallelize5DTile2DRangeM); - }, - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + [](size_t i, size_t j, size_t k, size_t start_l, size_t start_m, + size_t tile_l, size_t tile_m) { + EXPECT_LT(i, kParallelize5DTile2DRangeI); + EXPECT_LT(j, kParallelize5DTile2DRangeJ); + EXPECT_LT(k, kParallelize5DTile2DRangeK); + EXPECT_LT(start_l, kParallelize5DTile2DRangeL); + EXPECT_LT(start_m, kParallelize5DTile2DRangeM); + 
EXPECT_LE(start_l + tile_l, kParallelize5DTile2DRangeL); + EXPECT_LE(start_m + tile_m, kParallelize5DTile2DRangeM); + }, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, + kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, + kParallelize5DTile2DRangeM, kParallelize5DTile2DTileL, + kParallelize5DTile2DTileM); } TEST(Parallelize5DTile2D, UniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - [](size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) { - EXPECT_GT(tile_l, 0); - EXPECT_LE(tile_l, kParallelize5DTile2DTileL); - EXPECT_EQ(start_l % kParallelize5DTile2DTileL, 0); - EXPECT_EQ(tile_l, std::min(kParallelize5DTile2DTileL, kParallelize5DTile2DRangeL - start_l)); - - EXPECT_GT(tile_m, 0); - EXPECT_LE(tile_m, kParallelize5DTile2DTileM); - EXPECT_EQ(start_m % kParallelize5DTile2DTileM, 0); - EXPECT_EQ(tile_m, std::min(kParallelize5DTile2DTileM, kParallelize5DTile2DRangeM - start_m)); - }, - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + [](size_t i, size_t j, size_t k, size_t start_l, size_t start_m, + size_t tile_l, size_t tile_m) { + EXPECT_GT(tile_l, 0); + EXPECT_LE(tile_l, kParallelize5DTile2DTileL); + EXPECT_EQ(start_l % kParallelize5DTile2DTileL, 0); + EXPECT_EQ(tile_l, + std::min(kParallelize5DTile2DTileL, + kParallelize5DTile2DRangeL - start_l)); + + EXPECT_GT(tile_m, 0); + EXPECT_LE(tile_m, kParallelize5DTile2DTileM); + EXPECT_EQ(start_m % kParallelize5DTile2DTileM, 0); + EXPECT_EQ(tile_m, + std::min(kParallelize5DTile2DTileM, + kParallelize5DTile2DRangeM - 
start_m)); + }, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, + kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, + kParallelize5DTile2DRangeM, kParallelize5DTile2DTileL, + kParallelize5DTile2DTileM); } TEST(Parallelize5DTile2D, AllItemsProcessed) { - std::vector indicators(kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - [&indicators](size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - for (size_t m = start_m; m < start_m + tile_m; m++) { - const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; - indicators[linear_idx].store(true, std::memory_order_relaxed); - } - } - }, - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM); - - for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed"; - } - } - } - } - } + std::vector indicators( + kParallelize5DTile2DRangeI * 
kParallelize5DTile2DRangeJ * + kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * + kParallelize5DTile2DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + [&indicators](size_t i, size_t j, size_t k, size_t start_l, + size_t start_m, size_t tile_l, size_t tile_m) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * + kParallelize5DTile2DRangeK + + k) * + kParallelize5DTile2DRangeL + + l) * + kParallelize5DTile2DRangeM + + m; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } + }, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, + kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, + kParallelize5DTile2DRangeM, kParallelize5DTile2DTileL, + kParallelize5DTile2DTileM); + + for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * + kParallelize5DTile2DRangeK + + k) * + kParallelize5DTile2DRangeL + + l) * + kParallelize5DTile2DRangeM + + m; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") not processed"; + } + } + } + } + } } TEST(Parallelize5DTile2D, EachItemProcessedOnce) { - std::vector counters(kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - 
pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - [&counters](size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - for (size_t m = start_m; m < start_m + tile_m; m++) { - const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; - counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - } - }, - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM); - - for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } + std::vector counters( + kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * + kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * + kParallelize5DTile2DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + [&counters](size_t i, size_t j, size_t k, size_t start_l, size_t start_m, + size_t tile_l, size_t tile_m) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + for (size_t m = 
start_m; m < start_m + tile_m; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * + kParallelize5DTile2DRangeK + + k) * + kParallelize5DTile2DRangeL + + l) * + kParallelize5DTile2DRangeM + + m; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } + }, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, + kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, + kParallelize5DTile2DRangeM, kParallelize5DTile2DTileL, + kParallelize5DTile2DTileM); + + for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * + kParallelize5DTile2DRangeK + + k) * + kParallelize5DTile2DRangeL + + l) * + kParallelize5DTile2DRangeM + + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } } TEST(Parallelize6D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_6d(threadpool.get(), - [](size_t, size_t, size_t, size_t, size_t, size_t) { }, - kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN); + pthreadpool_parallelize_6d( + threadpool.get(), [](size_t, size_t, size_t, size_t, size_t, size_t) {}, + kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, + kParallelize6DRangeL, kParallelize6DRangeM, 
kParallelize6DRangeN); } TEST(Parallelize6D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d( - threadpool.get(), - [](size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) { - EXPECT_LT(i, kParallelize6DRangeI); - EXPECT_LT(j, kParallelize6DRangeJ); - EXPECT_LT(k, kParallelize6DRangeK); - EXPECT_LT(l, kParallelize6DRangeL); - EXPECT_LT(m, kParallelize6DRangeM); - EXPECT_LT(n, kParallelize6DRangeN); - }, - kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d( + threadpool.get(), + [](size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) { + EXPECT_LT(i, kParallelize6DRangeI); + EXPECT_LT(j, kParallelize6DRangeJ); + EXPECT_LT(k, kParallelize6DRangeK); + EXPECT_LT(l, kParallelize6DRangeL); + EXPECT_LT(m, kParallelize6DRangeM); + EXPECT_LT(n, kParallelize6DRangeN); + }, + kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, + kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN); } TEST(Parallelize6D, AllItemsProcessed) { - std::vector indicators(kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d( - threadpool.get(), - [&indicators](size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) { - const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n; - indicators[linear_idx].store(true, std::memory_order_relaxed); - }, - kParallelize6DRangeI, kParallelize6DRangeJ, 
kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN); - - for (size_t i = 0; i < kParallelize6DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed"; - } - } - } - } - } - } + std::vector indicators( + kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * + kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d( + threadpool.get(), + [&indicators](size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n) { + const size_t linear_idx = + ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * + kParallelize6DRangeL + + l) * + kParallelize6DRangeM + + m) * + kParallelize6DRangeN + + n; + indicators[linear_idx].store(true, std::memory_order_relaxed); + }, + kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, + kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN); + + for (size_t i = 0; i < kParallelize6DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DRangeJ + j) * 
kParallelize6DRangeK + + k) * + kParallelize6DRangeL + + l) * + kParallelize6DRangeM + + m) * + kParallelize6DRangeN + + n; + EXPECT_TRUE( + indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") not processed"; + } + } + } + } + } + } } TEST(Parallelize6D, EachItemProcessedOnce) { - std::vector counters(kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d( - threadpool.get(), - [&counters](size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) { - const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n; - counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - }, - kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN); - - for (size_t i = 0; i < kParallelize6DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } - } + std::vector counters( + kParallelize6DRangeI * 
kParallelize6DRangeJ * kParallelize6DRangeK * + kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d( + threadpool.get(), + [&counters](size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) { + const size_t linear_idx = + ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * + kParallelize6DRangeL + + l) * + kParallelize6DRangeM + + m) * + kParallelize6DRangeN + + n; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + }, + kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, + kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN); + + for (size_t i = 0; i < kParallelize6DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + + k) * + kParallelize6DRangeL + + l) * + kParallelize6DRangeM + + m) * + kParallelize6DRangeN + + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } + } } TEST(Parallelize6DTile1D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_6d_tile_1d(threadpool.get(), - [](size_t, size_t, size_t, size_t, size_t, size_t, size_t) { }, - kParallelize6DTile1DRangeI, 
kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN); + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), + [](size_t, size_t, size_t, size_t, size_t, size_t, size_t) {}, + kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, + kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, + kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, + kParallelize6DTile1DTileN); } TEST(Parallelize6DTile1D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - [](size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) { - EXPECT_LT(i, kParallelize6DTile1DRangeI); - EXPECT_LT(j, kParallelize6DTile1DRangeJ); - EXPECT_LT(k, kParallelize6DTile1DRangeK); - EXPECT_LT(l, kParallelize6DTile1DRangeL); - EXPECT_LT(m, kParallelize6DTile1DRangeM); - EXPECT_LT(start_n, kParallelize6DTile1DRangeN); - EXPECT_LE(start_n + tile_n, kParallelize6DTile1DRangeN); - }, - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), + [](size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, + size_t tile_n) { + EXPECT_LT(i, kParallelize6DTile1DRangeI); + EXPECT_LT(j, kParallelize6DTile1DRangeJ); + EXPECT_LT(k, kParallelize6DTile1DRangeK); + EXPECT_LT(l, kParallelize6DTile1DRangeL); + EXPECT_LT(m, kParallelize6DTile1DRangeM); + EXPECT_LT(start_n, kParallelize6DTile1DRangeN); + EXPECT_LE(start_n + tile_n, kParallelize6DTile1DRangeN); + }, + kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, + 
kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, + kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, + kParallelize6DTile1DTileN); } TEST(Parallelize6DTile1D, UniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - [](size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) { - EXPECT_GT(tile_n, 0); - EXPECT_LE(tile_n, kParallelize6DTile1DTileN); - EXPECT_EQ(start_n % kParallelize6DTile1DTileN, 0); - EXPECT_EQ(tile_n, std::min(kParallelize6DTile1DTileN, kParallelize6DTile1DRangeN - start_n)); - }, - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), + [](size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, + size_t tile_n) { + EXPECT_GT(tile_n, 0); + EXPECT_LE(tile_n, kParallelize6DTile1DTileN); + EXPECT_EQ(start_n % kParallelize6DTile1DTileN, 0); + EXPECT_EQ(tile_n, + std::min(kParallelize6DTile1DTileN, + kParallelize6DTile1DRangeN - start_n)); + }, + kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, + kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, + kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, + kParallelize6DTile1DTileN); } TEST(Parallelize6DTile1D, AllItemsProcessed) { - std::vector indicators(kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - 
[&indicators](size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) { - for (size_t n = start_n; n < start_n + tile_n; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN + n; - indicators[linear_idx].store(true, std::memory_order_relaxed); - } - }, - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN); - - for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN + n; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed"; - } - } - } - } - } - } + std::vector indicators( + kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * + kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * + kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), + [&indicators](size_t i, size_t j, size_t k, size_t l, size_t m, + size_t start_n, size_t tile_n) { + for (size_t n = start_n; n < start_n + tile_n; n++) { + const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + 
j) * + kParallelize6DTile1DRangeK + + k) * + kParallelize6DTile1DRangeL + + l) * + kParallelize6DTile1DRangeM + + m) * + kParallelize6DTile1DRangeN + + n; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + }, + kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, + kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, + kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, + kParallelize6DTile1DTileN); + + for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile1DRangeJ + j) * + kParallelize6DTile1DRangeK + + k) * + kParallelize6DTile1DRangeL + + l) * + kParallelize6DTile1DRangeM + + m) * + kParallelize6DTile1DRangeN + + n; + EXPECT_TRUE( + indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") not processed"; + } + } + } + } + } + } } TEST(Parallelize6DTile1D, EachItemProcessedOnce) { - std::vector counters(kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - [&counters](size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) { - for (size_t n = start_n; n < start_n + tile_n; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN 
+ n; - counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - }, - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN); - - for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN + n; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } - } + std::vector counters( + kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * + kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * + kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), + [&counters](size_t i, size_t j, size_t k, size_t l, size_t m, + size_t start_n, size_t tile_n) { + for (size_t n = start_n; n < start_n + tile_n; n++) { + const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * + kParallelize6DTile1DRangeK + + k) * + kParallelize6DTile1DRangeL + + l) * + kParallelize6DTile1DRangeM + + m) * + kParallelize6DTile1DRangeN + + n; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + }, + kParallelize6DTile1DRangeI, 
kParallelize6DTile1DRangeJ, + kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, + kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, + kParallelize6DTile1DTileN); + + for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile1DRangeJ + j) * + kParallelize6DTile1DRangeK + + k) * + kParallelize6DTile1DRangeL + + l) * + kParallelize6DTile1DRangeM + + m) * + kParallelize6DTile1DRangeN + + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } + } } TEST(Parallelize6DTile2D, ThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_6d_tile_2d(threadpool.get(), - [](size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t) { }, - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN); + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + [](size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t) {}, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, + kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, + kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + 
kParallelize6DTile2DTileM, kParallelize6DTile2DTileN); } TEST(Parallelize6DTile2D, AllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - [](size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) { - EXPECT_LT(i, kParallelize6DTile2DRangeI); - EXPECT_LT(j, kParallelize6DTile2DRangeJ); - EXPECT_LT(k, kParallelize6DTile2DRangeK); - EXPECT_LT(l, kParallelize6DTile2DRangeL); - EXPECT_LT(start_m, kParallelize6DTile2DRangeM); - EXPECT_LT(start_n, kParallelize6DTile2DRangeN); - EXPECT_LE(start_m + tile_m, kParallelize6DTile2DRangeM); - EXPECT_LE(start_n + tile_n, kParallelize6DTile2DRangeN); - }, - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + [](size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, + size_t tile_m, size_t tile_n) { + EXPECT_LT(i, kParallelize6DTile2DRangeI); + EXPECT_LT(j, kParallelize6DTile2DRangeJ); + EXPECT_LT(k, kParallelize6DTile2DRangeK); + EXPECT_LT(l, kParallelize6DTile2DRangeL); + EXPECT_LT(start_m, kParallelize6DTile2DRangeM); + EXPECT_LT(start_n, kParallelize6DTile2DRangeN); + EXPECT_LE(start_m + tile_m, kParallelize6DTile2DRangeM); + EXPECT_LE(start_n + tile_n, kParallelize6DTile2DRangeN); + }, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, + kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, + kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN); } TEST(Parallelize6DTile2D, UniformTiling) { - auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - [](size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) { - EXPECT_GT(tile_m, 0); - EXPECT_LE(tile_m, kParallelize6DTile2DTileM); - EXPECT_EQ(start_m % kParallelize6DTile2DTileM, 0); - EXPECT_EQ(tile_m, std::min(kParallelize6DTile2DTileM, kParallelize6DTile2DRangeM - start_m)); - - EXPECT_GT(tile_n, 0); - EXPECT_LE(tile_n, kParallelize6DTile2DTileN); - EXPECT_EQ(start_n % kParallelize6DTile2DTileN, 0); - EXPECT_EQ(tile_n, std::min(kParallelize6DTile2DTileN, kParallelize6DTile2DRangeN - start_n)); - }, - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + [](size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, + size_t tile_m, size_t tile_n) { + EXPECT_GT(tile_m, 0); + EXPECT_LE(tile_m, kParallelize6DTile2DTileM); + EXPECT_EQ(start_m % kParallelize6DTile2DTileM, 0); + EXPECT_EQ(tile_m, + std::min(kParallelize6DTile2DTileM, + kParallelize6DTile2DRangeM - start_m)); + + EXPECT_GT(tile_n, 0); + EXPECT_LE(tile_n, kParallelize6DTile2DTileN); + EXPECT_EQ(start_n % kParallelize6DTile2DTileN, 0); + EXPECT_EQ(tile_n, + std::min(kParallelize6DTile2DTileN, + kParallelize6DTile2DRangeN - start_n)); + }, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, + kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, + kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN); } TEST(Parallelize6DTile2D, AllItemsProcessed) { - std::vector 
indicators(kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - [&indicators](size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) { - for (size_t m = start_m; m < start_m + tile_m; m++) { - for (size_t n = start_n; n < start_n + tile_n; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; - indicators[linear_idx].store(true, std::memory_order_relaxed); - } - } - }, - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN); - - for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed"; - } - } - } - } - } - } + std::vector indicators( + kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * + kParallelize6DTile2DRangeK * 
kParallelize6DTile2DRangeL * + kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + [&indicators](size_t i, size_t j, size_t k, size_t l, size_t start_m, + size_t start_n, size_t tile_m, size_t tile_n) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + for (size_t n = start_n; n < start_n + tile_n; n++) { + const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * + kParallelize6DTile2DRangeK + + k) * + kParallelize6DTile2DRangeL + + l) * + kParallelize6DTile2DRangeM + + m) * + kParallelize6DTile2DRangeN + + n; + indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } + }, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, + kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, + kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN); + + for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile2DRangeJ + j) * + kParallelize6DTile2DRangeK + + k) * + kParallelize6DTile2DRangeL + + l) * + kParallelize6DTile2DRangeM + + m) * + kParallelize6DTile2DRangeN + + n; + EXPECT_TRUE( + indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") not processed"; + } + } + } + } + } + } } TEST(Parallelize6DTile2D, EachItemProcessedOnce) { - std::vector counters(kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * 
kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - [&counters](size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) { - for (size_t m = start_m; m < start_m + tile_m; m++) { - for (size_t n = start_n; n < start_n + tile_n; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; - counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - } - }, - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN); - - for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } - } + std::vector counters( + kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * + kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * + 
kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + [&counters](size_t i, size_t j, size_t k, size_t l, size_t start_m, + size_t start_n, size_t tile_m, size_t tile_n) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + for (size_t n = start_n; n < start_n + tile_n; n++) { + const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * + kParallelize6DTile2DRangeK + + k) * + kParallelize6DTile2DRangeL + + l) * + kParallelize6DTile2DRangeM + + m) * + kParallelize6DTile2DRangeN + + n; + counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } + }, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, + kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, + kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN); + + for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile2DRangeJ + j) * + kParallelize6DTile2DRangeK + + k) * + kParallelize6DTile2DRangeL + + l) * + kParallelize6DTile2DRangeM + + m) * + kParallelize6DTile2DRangeN + + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } + } } diff --git a/test/pthreadpool.cc b/test/pthreadpool.cc index 2fc67e8..687c9c9 100644 --- a/test/pthreadpool.cc +++ 
b/test/pthreadpool.cc @@ -1,15 +1,27 @@ -#include +// Copyright (c) 2017 Facebook Inc. +// Copyright (c) 2015-2017 Georgia Institute of Technology +// All rights reserved. +// +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. #include #include #include +#include // NOLINT #include +#include #include +#include // NOLINT +#include +#include -typedef std::unique_ptr auto_pthreadpool_t; - +typedef std::unique_ptr + auto_pthreadpool_t; const size_t kParallelize1DRange = 1223; const size_t kParallelize1DTile1DRange = 1303; @@ -97,9825 +109,13870 @@ const size_t kIncrementIterations6D = 3; const uint32_t kMaxUArchIndex = 0; const uint32_t kDefaultUArchIndex = 42; - TEST(CreateAndDestroy, NullThreadPool) { - pthreadpool* threadpool = nullptr; - pthreadpool_destroy(threadpool); + pthreadpool* threadpool = nullptr; + pthreadpool_destroy(threadpool); } TEST(CreateAndDestroy, SingleThreadPool) { - pthreadpool* threadpool = pthreadpool_create(1); - ASSERT_TRUE(threadpool); - pthreadpool_destroy(threadpool); + pthreadpool* threadpool = pthreadpool_create(1); + ASSERT_TRUE(threadpool); + pthreadpool_destroy(threadpool); } TEST(CreateAndDestroy, MultiThreadPool) { - pthreadpool* threadpool = pthreadpool_create(0); - ASSERT_TRUE(threadpool); - pthreadpool_destroy(threadpool); + pthreadpool* threadpool = pthreadpool_create(0); + ASSERT_TRUE(threadpool); + pthreadpool_destroy(threadpool); } -static void ComputeNothing1D(void*, size_t) { -} +static void ComputeNothing1D(void*, size_t) {} TEST(Parallelize1D, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d(threadpool.get(), - ComputeNothing1D, - nullptr, - kParallelize1DRange, - 0 /* flags */); 
+ pthreadpool_parallelize_1d(threadpool.get(), ComputeNothing1D, nullptr, + kParallelize1DRange, /*flags=*/0); } TEST(Parallelize1D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d( - threadpool.get(), - ComputeNothing1D, - nullptr, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d(threadpool.get(), ComputeNothing1D, nullptr, + kParallelize1DRange, /*flags=*/0); } static void CheckBounds1D(void*, size_t i) { - EXPECT_LT(i, kParallelize1DRange); + EXPECT_LT(i, kParallelize1DRange); } TEST(Parallelize1D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d( - threadpool.get(), - CheckBounds1D, - nullptr, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d(threadpool.get(), CheckBounds1D, nullptr, + kParallelize1DRange, /*flags=*/0); } TEST(Parallelize1D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d( - threadpool.get(), - CheckBounds1D, - nullptr, - kParallelize1DRange, - 0 /* flags */); + 
pthreadpool_parallelize_1d(threadpool.get(), CheckBounds1D, nullptr, + kParallelize1DRange, /*flags=*/0); } static void SetTrue1D(std::atomic_bool* processed_indicators, size_t i) { - processed_indicators[i].store(true, std::memory_order_relaxed); + processed_indicators[i].store(true, std::memory_order_relaxed); } TEST(Parallelize1D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize1DRange); + std::vector indicators(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d( - threadpool.get(), - reinterpret_cast(SetTrue1D), - static_cast(indicators.data()), - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d( + threadpool.get(), reinterpret_cast(SetTrue1D), + static_cast(indicators.data()), kParallelize1DRange, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) - << "Element " << i << " not processed"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } } TEST(Parallelize1D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize1DRange); + std::vector indicators(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d( - threadpool.get(), - reinterpret_cast(SetTrue1D), - static_cast(indicators.data()), - kParallelize1DRange, - 0 /* flags 
*/); + pthreadpool_parallelize_1d( + threadpool.get(), reinterpret_cast(SetTrue1D), + static_cast(indicators.data()), kParallelize1DRange, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) - << "Element " << i << " not processed"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } } static void Increment1D(std::atomic_int* processed_counters, size_t i) { - processed_counters[i].fetch_add(1, std::memory_order_relaxed); + processed_counters[i].fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize1D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize1DRange); + std::vector counters(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d( - threadpool.get(), - reinterpret_cast(Increment1D), - static_cast(counters.data()), - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d( + threadpool.get(), reinterpret_cast(Increment1D), + static_cast(counters.data()), kParallelize1DRange, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } } TEST(Parallelize1D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize1DRange); + std::vector counters(kParallelize1DRange); - auto_pthreadpool_t 
threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d( - threadpool.get(), - reinterpret_cast(Increment1D), - static_cast(counters.data()), - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d( + threadpool.get(), reinterpret_cast(Increment1D), + static_cast(counters.data()), kParallelize1DRange, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } } TEST(Parallelize1D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize1DRange); + std::vector counters(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_1d( - threadpool.get(), - reinterpret_cast(Increment1D), - static_cast(counters.data()), - kParallelize1DRange, - 0 /* flags */); - } + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d( + threadpool.get(), reinterpret_cast(Increment1D), + static_cast(counters.data()), kParallelize1DRange, + /*flags=*/0); + } - for 
(size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } } TEST(Parallelize1D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize1DRange); + std::vector counters(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_1d( - threadpool.get(), - reinterpret_cast(Increment1D), - static_cast(counters.data()), - kParallelize1DRange, - 0 /* flags */); - } + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d( + threadpool.get(), reinterpret_cast(Increment1D), + static_cast(counters.data()), kParallelize1DRange, + /*flags=*/0); + } - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 
kIncrementIterations) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } } static void IncrementSame1D(std::atomic_int* num_processed_items, size_t i) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); + num_processed_items->fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize1D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d( - threadpool.get(), - reinterpret_cast(IncrementSame1D), - static_cast(&num_processed_items), - kParallelize1DRange, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize1DRange); + pthreadpool_parallelize_1d( + threadpool.get(), + reinterpret_cast(IncrementSame1D), + static_cast(&num_processed_items), kParallelize1DRange, + /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize1DRange); } static void WorkImbalance1D(std::atomic_int* num_processed_items, size_t i) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - if (i == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize1DRange) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + num_processed_items->fetch_add(1, std::memory_order_relaxed); + if (i == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != 
+ kParallelize1DRange) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize1D, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d( - threadpool.get(), - reinterpret_cast(WorkImbalance1D), - static_cast(&num_processed_items), - kParallelize1DRange, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize1DRange); + pthreadpool_parallelize_1d( + threadpool.get(), + reinterpret_cast(WorkImbalance1D), + static_cast(&num_processed_items), kParallelize1DRange, + /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize1DRange); } -static void ComputeNothing1DWithThread(void*, size_t, size_t) { -} +static void ComputeNothing1DWithThread(void*, size_t, size_t) {} TEST(Parallelize1DWithThread, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_with_thread(threadpool.get(), - ComputeNothing1DWithThread, - nullptr, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_thread(threadpool.get(), + ComputeNothing1DWithThread, nullptr, + kParallelize1DRange, /*flags=*/0); } TEST(Parallelize1DWithThread, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - 
ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_with_thread( - threadpool.get(), - ComputeNothing1DWithThread, - nullptr, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_thread(threadpool.get(), + ComputeNothing1DWithThread, nullptr, + kParallelize1DRange, /*flags=*/0); } static void CheckBounds1DWithThread(void*, size_t, size_t i) { - EXPECT_LT(i, kParallelize1DRange); + EXPECT_LT(i, kParallelize1DRange); } TEST(Parallelize1DWithThread, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_with_thread( - threadpool.get(), - CheckBounds1DWithThread, - nullptr, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_thread(threadpool.get(), + CheckBounds1DWithThread, nullptr, + kParallelize1DRange, /*flags=*/0); } TEST(Parallelize1DWithThread, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_with_thread( - threadpool.get(), - CheckBounds1DWithThread, - nullptr, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_thread(threadpool.get(), + CheckBounds1DWithThread, nullptr, + kParallelize1DRange, 
/*flags=*/0); } -static void SetTrue1DWithThread(std::atomic_bool* processed_indicators, size_t, size_t i) { - processed_indicators[i].store(true, std::memory_order_relaxed); +static void SetTrue1DWithThread(std::atomic_bool* processed_indicators, size_t, + size_t i) { + processed_indicators[i].store(true, std::memory_order_relaxed); } TEST(Parallelize1DWithThread, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize1DRange); + std::vector indicators(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_with_thread( - threadpool.get(), - reinterpret_cast(SetTrue1DWithThread), - static_cast(indicators.data()), - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_thread( + threadpool.get(), + reinterpret_cast(SetTrue1DWithThread), + static_cast(indicators.data()), kParallelize1DRange, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) - << "Element " << i << " not processed"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } } TEST(Parallelize1DWithThread, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize1DRange); + std::vector indicators(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_with_thread( - threadpool.get(), - 
reinterpret_cast(SetTrue1DWithThread), - static_cast(indicators.data()), - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_thread( + threadpool.get(), + reinterpret_cast(SetTrue1DWithThread), + static_cast(indicators.data()), kParallelize1DRange, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) - << "Element " << i << " not processed"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } } -static void Increment1DWithThread(std::atomic_int* processed_counters, size_t, size_t i) { - processed_counters[i].fetch_add(1, std::memory_order_relaxed); +static void Increment1DWithThread(std::atomic_int* processed_counters, size_t, + size_t i) { + processed_counters[i].fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize1DWithThread, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize1DRange); + std::vector counters(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_with_thread( - threadpool.get(), - reinterpret_cast(Increment1DWithThread), - static_cast(counters.data()), - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_thread( + threadpool.get(), + reinterpret_cast( + Increment1DWithThread), + static_cast(counters.data()), kParallelize1DRange, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + 
EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } } TEST(Parallelize1DWithThread, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize1DRange); + std::vector counters(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_with_thread( - threadpool.get(), - reinterpret_cast(Increment1DWithThread), - static_cast(counters.data()), - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_thread( + threadpool.get(), + reinterpret_cast( + Increment1DWithThread), + static_cast(counters.data()), kParallelize1DRange, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } } TEST(Parallelize1DWithThread, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize1DRange); + std::vector counters(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - for (size_t iteration = 0; iteration < 
kIncrementIterations; iteration++) { - pthreadpool_parallelize_1d_with_thread( - threadpool.get(), - reinterpret_cast(Increment1DWithThread), - static_cast(counters.data()), - kParallelize1DRange, - 0 /* flags */); - } + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d_with_thread( + threadpool.get(), + reinterpret_cast( + Increment1DWithThread), + static_cast(counters.data()), kParallelize1DRange, + /*flags=*/0); + } - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } } TEST(Parallelize1DWithThread, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize1DRange); + std::vector counters(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_1d_with_thread( - threadpool.get(), - reinterpret_cast(Increment1DWithThread), - static_cast(counters.data()), - kParallelize1DRange, - 0 /* flags */); - } + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d_with_thread( + 
threadpool.get(), + reinterpret_cast( + Increment1DWithThread), + static_cast(counters.data()), kParallelize1DRange, + /*flags=*/0); + } - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } } -static void IncrementSame1DWithThread(std::atomic_int* num_processed_items, size_t, size_t i) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); +static void IncrementSame1DWithThread(std::atomic_int* num_processed_items, + size_t, size_t i) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize1DWithThread, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_with_thread( - threadpool.get(), - reinterpret_cast(IncrementSame1DWithThread), - static_cast(&num_processed_items), - kParallelize1DRange, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize1DRange); + pthreadpool_parallelize_1d_with_thread( + threadpool.get(), + reinterpret_cast( + IncrementSame1DWithThread), + 
static_cast(&num_processed_items), kParallelize1DRange, + /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize1DRange); } -static void WorkImbalance1DWithThread(std::atomic_int* num_processed_items, size_t, size_t i) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - if (i == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize1DRange) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } +static void WorkImbalance1DWithThread(std::atomic_int* num_processed_items, + size_t, size_t i) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + if (i == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize1DRange) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize1DWithThread, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_with_thread( - threadpool.get(), - reinterpret_cast(WorkImbalance1DWithThread), - static_cast(&num_processed_items), - kParallelize1DRange, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize1DRange); + pthreadpool_parallelize_1d_with_thread( + threadpool.get(), + reinterpret_cast( + WorkImbalance1DWithThread), + static_cast(&num_processed_items), kParallelize1DRange, + /*flags=*/0); + 
EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize1DRange); } -static void CheckThreadIndexValid1DWithThread(const size_t* num_threads, size_t thread_index, size_t) { - EXPECT_LE(thread_index, *num_threads); +static void CheckThreadIndexValid1DWithThread(const size_t* num_threads, + size_t thread_index, size_t) { + EXPECT_LE(thread_index, *num_threads); } TEST(Parallelize1DWithThread, MultiThreadPoolThreadIndexValid) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - size_t num_threads = pthreadpool_get_threads_count(threadpool.get()); - if (num_threads <= 1) { - GTEST_SKIP(); - } + size_t num_threads = pthreadpool_get_threads_count(threadpool.get()); + if (num_threads <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_with_thread( - threadpool.get(), - reinterpret_cast(CheckThreadIndexValid1DWithThread), - static_cast(&num_threads), - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_thread( + threadpool.get(), + reinterpret_cast( + CheckThreadIndexValid1DWithThread), + static_cast(&num_threads), kParallelize1DRange, /*flags=*/0); } -static void ComputeNothing1DWithUArch(void*, uint32_t, size_t) { -} +static void ComputeNothing1DWithUArch(void*, uint32_t, size_t) {} TEST(Parallelize1DWithUArch, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_with_uarch(threadpool.get(), - ComputeNothing1DWithUArch, - nullptr, - kDefaultUArchIndex, - kMaxUArchIndex, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_uarch( + threadpool.get(), ComputeNothing1DWithUArch, nullptr, kDefaultUArchIndex, + 
kMaxUArchIndex, kParallelize1DRange, /*flags=*/0); } TEST(Parallelize1DWithUArch, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_with_uarch( - threadpool.get(), - ComputeNothing1DWithUArch, - nullptr, - kDefaultUArchIndex, - kMaxUArchIndex, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_uarch( + threadpool.get(), ComputeNothing1DWithUArch, nullptr, kDefaultUArchIndex, + kMaxUArchIndex, kParallelize1DRange, /*flags=*/0); } static void CheckUArch1DWithUArch(void*, uint32_t uarch_index, size_t) { - if (uarch_index != kDefaultUArchIndex) { - EXPECT_LE(uarch_index, kMaxUArchIndex); - } + if (uarch_index != kDefaultUArchIndex) { + EXPECT_LE(uarch_index, kMaxUArchIndex); + } } TEST(Parallelize1DWithUArch, SingleThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_with_uarch(threadpool.get(), - CheckUArch1DWithUArch, - nullptr, - kDefaultUArchIndex, - kMaxUArchIndex, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_uarch( + threadpool.get(), CheckUArch1DWithUArch, nullptr, kDefaultUArchIndex, + kMaxUArchIndex, kParallelize1DRange, /*flags=*/0); } TEST(Parallelize1DWithUArch, MultiThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + 
ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_with_uarch( - threadpool.get(), - CheckUArch1DWithUArch, - nullptr, - kDefaultUArchIndex, - kMaxUArchIndex, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_uarch( + threadpool.get(), CheckUArch1DWithUArch, nullptr, kDefaultUArchIndex, + kMaxUArchIndex, kParallelize1DRange, /*flags=*/0); } static void CheckBounds1DWithUArch(void*, uint32_t, size_t i) { - EXPECT_LT(i, kParallelize1DRange); + EXPECT_LT(i, kParallelize1DRange); } TEST(Parallelize1DWithUArch, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_with_uarch( - threadpool.get(), - CheckBounds1DWithUArch, - nullptr, - kDefaultUArchIndex, - kMaxUArchIndex, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_uarch( + threadpool.get(), CheckBounds1DWithUArch, nullptr, kDefaultUArchIndex, + kMaxUArchIndex, kParallelize1DRange, /*flags=*/0); } TEST(Parallelize1DWithUArch, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_with_uarch( - threadpool.get(), - CheckBounds1DWithUArch, - nullptr, - kDefaultUArchIndex, - kMaxUArchIndex, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_uarch( + 
threadpool.get(), CheckBounds1DWithUArch, nullptr, kDefaultUArchIndex, + kMaxUArchIndex, kParallelize1DRange, /*flags=*/0); } -static void SetTrue1DWithUArch(std::atomic_bool* processed_indicators, uint32_t, size_t i) { - processed_indicators[i].store(true, std::memory_order_relaxed); +static void SetTrue1DWithUArch(std::atomic_bool* processed_indicators, uint32_t, + size_t i) { + processed_indicators[i].store(true, std::memory_order_relaxed); } TEST(Parallelize1DWithUArch, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize1DRange); + std::vector indicators(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_with_uarch( - threadpool.get(), - reinterpret_cast(SetTrue1DWithUArch), - static_cast(indicators.data()), - kDefaultUArchIndex, - kMaxUArchIndex, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_uarch( + threadpool.get(), + reinterpret_cast(SetTrue1DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize1DRange, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) - << "Element " << i << " not processed"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } } TEST(Parallelize1DWithUArch, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize1DRange); + std::vector indicators(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if 
(pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_with_uarch( - threadpool.get(), - reinterpret_cast(SetTrue1DWithUArch), - static_cast(indicators.data()), - kDefaultUArchIndex, - kMaxUArchIndex, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_uarch( + threadpool.get(), + reinterpret_cast(SetTrue1DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize1DRange, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) - << "Element " << i << " not processed"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } } -static void Increment1DWithUArch(std::atomic_int* processed_counters, uint32_t, size_t i) { - processed_counters[i].fetch_add(1, std::memory_order_relaxed); +static void Increment1DWithUArch(std::atomic_int* processed_counters, uint32_t, + size_t i) { + processed_counters[i].fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize1DWithUArch, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize1DRange); + std::vector counters(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment1DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, - kMaxUArchIndex, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_uarch( + threadpool.get(), + reinterpret_cast(Increment1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + 
kParallelize1DRange, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } } TEST(Parallelize1DWithUArch, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize1DRange); + std::vector counters(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment1DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, - kMaxUArchIndex, - kParallelize1DRange, - 0 /* flags */); + pthreadpool_parallelize_1d_with_uarch( + threadpool.get(), + reinterpret_cast(Increment1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize1DRange, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } } TEST(Parallelize1DWithUArch, 
SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize1DRange); + std::vector counters(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_1d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment1DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, - kMaxUArchIndex, - kParallelize1DRange, - 0 /* flags */); - } + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d_with_uarch( + threadpool.get(), + reinterpret_cast(Increment1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize1DRange, /*flags=*/0); + } - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } } TEST(Parallelize1DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize1DRange); + std::vector counters(kParallelize1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } 
+ if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_1d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment1DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, - kMaxUArchIndex, - kParallelize1DRange, - 0 /* flags */); - } + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d_with_uarch( + threadpool.get(), + reinterpret_cast(Increment1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize1DRange, /*flags=*/0); + } - for (size_t i = 0; i < kParallelize1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } } -static void IncrementSame1DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); +static void IncrementSame1DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t i) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize1DWithUArch, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if 
(pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_with_uarch( - threadpool.get(), - reinterpret_cast(IncrementSame1DWithUArch), - static_cast(&num_processed_items), - kDefaultUArchIndex, - kMaxUArchIndex, - kParallelize1DRange, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize1DRange); + pthreadpool_parallelize_1d_with_uarch( + threadpool.get(), + reinterpret_cast(IncrementSame1DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize1DRange, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize1DRange); } -static void WorkImbalance1DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - if (i == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize1DRange) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } +static void WorkImbalance1DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t i) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + if (i == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize1DRange) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize1DWithUArch, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { 
- GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_with_uarch( - threadpool.get(), - reinterpret_cast(WorkImbalance1DWithUArch), - static_cast(&num_processed_items), - kDefaultUArchIndex, - kMaxUArchIndex, - kParallelize1DRange, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize1DRange); + pthreadpool_parallelize_1d_with_uarch( + threadpool.get(), + reinterpret_cast(WorkImbalance1DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize1DRange, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize1DRange); } -static void ComputeNothing1DTile1D(void*, size_t, size_t) { -} +static void ComputeNothing1DTile1D(void*, size_t, size_t) {} TEST(Parallelize1DTile1D, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_tile_1d(threadpool.get(), - ComputeNothing1DTile1D, - nullptr, - kParallelize1DTile1DRange, kParallelize1DTile1DTile, - 0 /* flags */); + pthreadpool_parallelize_1d_tile_1d(threadpool.get(), ComputeNothing1DTile1D, + nullptr, kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); } TEST(Parallelize1DTile1D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - ComputeNothing1DTile1D, - nullptr, - 
kParallelize1DTile1DRange, kParallelize1DTile1DTile, - 0 /* flags */); + pthreadpool_parallelize_1d_tile_1d(threadpool.get(), ComputeNothing1DTile1D, + nullptr, kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); } static void CheckBounds1DTile1D(void*, size_t start_i, size_t tile_i) { - EXPECT_LT(start_i, kParallelize1DTile1DRange); - EXPECT_LE(start_i + tile_i, kParallelize1DTile1DRange); + EXPECT_LT(start_i, kParallelize1DTile1DRange); + EXPECT_LE(start_i + tile_i, kParallelize1DTile1DRange); } TEST(Parallelize1DTile1D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - CheckBounds1DTile1D, - nullptr, - kParallelize1DTile1DRange, kParallelize1DTile1DTile, - 0 /* flags */); + pthreadpool_parallelize_1d_tile_1d(threadpool.get(), CheckBounds1DTile1D, + nullptr, kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); } TEST(Parallelize1DTile1D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - CheckBounds1DTile1D, - nullptr, - kParallelize1DTile1DRange, kParallelize1DTile1DTile, - 0 /* flags */); + pthreadpool_parallelize_1d_tile_1d(threadpool.get(), CheckBounds1DTile1D, + nullptr, kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); } static void CheckTiling1DTile1D(void*, size_t start_i, size_t tile_i) { - EXPECT_GT(tile_i, 0); - 
EXPECT_LE(tile_i, kParallelize1DTile1DTile); - EXPECT_EQ(start_i % kParallelize1DTile1DTile, 0); - EXPECT_EQ(tile_i, std::min(kParallelize1DTile1DTile, kParallelize1DTile1DRange - start_i)); + EXPECT_GT(tile_i, 0); + EXPECT_LE(tile_i, kParallelize1DTile1DTile); + EXPECT_EQ(start_i % kParallelize1DTile1DTile, 0); + EXPECT_EQ(tile_i, std::min(kParallelize1DTile1DTile, + kParallelize1DTile1DRange - start_i)); } TEST(Parallelize1DTile1D, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - CheckTiling1DTile1D, - nullptr, - kParallelize1DTile1DRange, kParallelize1DTile1DTile, - 0 /* flags */); + pthreadpool_parallelize_1d_tile_1d(threadpool.get(), CheckTiling1DTile1D, + nullptr, kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); } TEST(Parallelize1DTile1D, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - CheckTiling1DTile1D, - nullptr, - kParallelize1DTile1DRange, kParallelize1DTile1DTile, - 0 /* flags */); + pthreadpool_parallelize_1d_tile_1d(threadpool.get(), CheckTiling1DTile1D, + nullptr, kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); } -static void SetTrue1DTile1D(std::atomic_bool* processed_indicators, size_t start_i, size_t tile_i) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - processed_indicators[i].store(true, 
std::memory_order_relaxed); - } +static void SetTrue1DTile1D(std::atomic_bool* processed_indicators, + size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + processed_indicators[i].store(true, std::memory_order_relaxed); + } } TEST(Parallelize1DTile1D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize1DTile1DRange); + std::vector indicators(kParallelize1DTile1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - reinterpret_cast(SetTrue1DTile1D), - static_cast(indicators.data()), - kParallelize1DTile1DRange, kParallelize1DTile1DTile, - 0 /* flags */); + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue1DTile1D), + static_cast(indicators.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { - EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) - << "Element " << i << " not processed"; - } + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } } TEST(Parallelize1DTile1D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize1DTile1DRange); + std::vector indicators(kParallelize1DTile1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - 
reinterpret_cast(SetTrue1DTile1D), - static_cast(indicators.data()), - kParallelize1DTile1DRange, kParallelize1DTile1DTile, - 0 /* flags */); + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue1DTile1D), + static_cast(indicators.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { - EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) - << "Element " << i << " not processed"; - } + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } } -static void Increment1DTile1D(std::atomic_int* processed_counters, size_t start_i, size_t tile_i) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - processed_counters[i].fetch_add(1, std::memory_order_relaxed); - } +static void Increment1DTile1D(std::atomic_int* processed_counters, + size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + processed_counters[i].fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize1DTile1D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize1DTile1DRange); + std::vector counters(kParallelize1DTile1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment1DTile1D), - static_cast(counters.data()), - kParallelize1DTile1DRange, kParallelize1DTile1DTile, - 0 /* flags */); + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment1DTile1D), + static_cast(counters.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { - 
EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)"; - } + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } } TEST(Parallelize1DTile1D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize1DTile1DRange); + std::vector counters(kParallelize1DTile1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment1DTile1D), - static_cast(counters.data()), - kParallelize1DTile1DRange, kParallelize1DTile1DTile, - 0 /* flags */); + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment1DTile1D), + static_cast(counters.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); - for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)"; - } + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } } TEST(Parallelize1DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize1DTile1DRange); + std::vector 
counters(kParallelize1DTile1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment1DTile1D), - static_cast(counters.data()), - kParallelize1DTile1DRange, kParallelize1DTile1DTile, - 0 /* flags */); - } + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment1DTile1D), + static_cast(counters.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); + } - for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } } TEST(Parallelize1DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize1DTile1DRange); + std::vector counters(kParallelize1DTile1DRange); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - for (size_t iteration 
= 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment1DTile1D), - static_cast(counters.data()), - kParallelize1DTile1DRange, kParallelize1DTile1DTile, - 0 /* flags */); - } + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment1DTile1D), + static_cast(counters.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); + } - for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { - EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) - << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } } -static void IncrementSame1DTile1D(std::atomic_int* num_processed_items, size_t start_i, size_t tile_i) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } +static void IncrementSame1DTile1D(std::atomic_int* num_processed_items, + size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize1DTile1D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - 
if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - reinterpret_cast(IncrementSame1DTile1D), - static_cast(&num_processed_items), - kParallelize1DTile1DRange, kParallelize1DTile1DTile, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize1DTile1DRange); + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + reinterpret_cast(IncrementSame1DTile1D), + static_cast(&num_processed_items), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize1DTile1DRange); } -static void WorkImbalance1DTile1D(std::atomic_int* num_processed_items, size_t start_i, size_t tile_i) { - num_processed_items->fetch_add(tile_i, std::memory_order_relaxed); - if (start_i == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize1DTile1DRange) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } +static void WorkImbalance1DTile1D(std::atomic_int* num_processed_items, + size_t start_i, size_t tile_i) { + num_processed_items->fetch_add(tile_i, std::memory_order_relaxed); + if (start_i == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize1DTile1DRange) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize1DTile1D, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - auto_pthreadpool_t 
threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + reinterpret_cast(WorkImbalance1DTile1D), + static_cast(&num_processed_items), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize1DTile1DRange); +} + +static void ComputeNothing1DDynamic(void*, size_t, size_t) {} + +TEST(Parallelize1DTile1DDynamic, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), ComputeNothing1DDynamic, nullptr, + kParallelize1DTile1DRange, kParallelize1DTile1DTile, /*flags=*/0); +} - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } +TEST(Parallelize1DTile1DDynamic, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_1d_tile_1d( - threadpool.get(), - reinterpret_cast(WorkImbalance1DTile1D), - static_cast(&num_processed_items), - kParallelize1DTile1DRange, kParallelize1DTile1DTile, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize1DTile1DRange); + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), ComputeNothing1DDynamic, nullptr, + kParallelize1DTile1DRange, kParallelize1DTile1DTile, /*flags=*/0); +} + +static void CheckBounds1DDynamic(void*, size_t start_i, size_t tile_i) { + EXPECT_LT(start_i, kParallelize1DTile1DRange); + EXPECT_LE(start_i + tile_i, kParallelize1DTile1DRange); } -static void ComputeNothing2D(void*, size_t, size_t) { +TEST(Parallelize1DTile1DDynamic, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), 
pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), CheckBounds1DDynamic, nullptr, + kParallelize1DTile1DRange, kParallelize1DTile1DTile, /*flags=*/0); +} + +TEST(Parallelize1DTile1DDynamic, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), CheckBounds1DDynamic, nullptr, + kParallelize1DTile1DRange, kParallelize1DTile1DTile, /*flags=*/0); +} + +static void CheckTiling1DDynamic(void*, size_t start_i, size_t tile_i) { + EXPECT_GT(tile_i, 0); + EXPECT_LE(tile_i, kParallelize1DTile1DRange); + EXPECT_EQ(start_i % kParallelize1DTile1DTile, 0); +} + +TEST(Parallelize1DTile1DDynamic, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), CheckTiling1DDynamic, nullptr, + kParallelize1DTile1DRange, kParallelize1DTile1DTile, /*flags=*/0); +} + +TEST(Parallelize1DTile1DDynamic, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), CheckTiling1DDynamic, nullptr, + kParallelize1DTile1DRange, kParallelize1DTile1DTile, /*flags=*/0); +} + +static void SetTrue1DDynamic(std::atomic_bool* processed_indicators, + size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + processed_indicators[i].store(true, std::memory_order_relaxed); + } +} + +TEST(Parallelize1DTile1DDynamic, SingleThreadPoolAllItemsProcessed) { + std::vector 
indicators(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), + reinterpret_cast(SetTrue1DDynamic), + static_cast(indicators.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } } +TEST(Parallelize1DTile1DDynamic, MultiThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), + reinterpret_cast(SetTrue1DDynamic), + static_cast(indicators.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } +} + +static void Increment1DDynamic(std::atomic_int* processed_counters, + size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + processed_counters[i].fetch_add(1, std::memory_order_relaxed); + } +} + +TEST(Parallelize1DTile1DDynamic, SingleThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), + reinterpret_cast(Increment1DDynamic), + static_cast(counters.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + 
EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } +} + +TEST(Parallelize1DTile1DDynamic, MultiThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), + reinterpret_cast(Increment1DDynamic), + static_cast(counters.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } +} + +TEST(Parallelize1DTile1DDynamic, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), + reinterpret_cast(Increment1DDynamic), + static_cast(counters.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, + /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } +} + +TEST(Parallelize1DTile1DDynamic, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize1DTile1DRange); + + auto_pthreadpool_t 
threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), + reinterpret_cast(Increment1DDynamic), + static_cast(counters.data()), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, + /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) + << "Element " << i << " was processed " + << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } +} + +static void IncrementSame1DDynamic(std::atomic_int* num_processed_items, + size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } +} + +TEST(Parallelize1DTile1DDynamic, MultiThreadPoolHighContention) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), + reinterpret_cast(IncrementSame1DDynamic), + static_cast(&num_processed_items), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize1DTile1DRange); +} + +static void WorkImbalance1DDynamic(std::atomic_int* num_processed_items, + size_t start_i, size_t tile_i) { + num_processed_items->fetch_add(tile_i, std::memory_order_relaxed); + if (start_i == 0) { + /* Sleep for a second. 
This differs from the non-dynamic `WorkImbalance*` + * strategies in that a thread may reserve more elements than fit into a + * single work function call. Blocking a single work function call will also + * block any potentially remaining elements allocated to that thread. We + * therefore just sleep for a second instead.*/ + std::this_thread::sleep_for(std::chrono::seconds(1)); + } +} + +TEST(Parallelize1DTile1DDynamic, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_tile_1d_dynamic( + threadpool.get(), + reinterpret_cast(WorkImbalance1DDynamic), + static_cast(&num_processed_items), kParallelize1DTile1DRange, + kParallelize1DTile1DTile, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize1DTile1DRange); +} + +static void ComputeNothing2D(void*, size_t, size_t) {} + TEST(Parallelize2D, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d(threadpool.get(), - ComputeNothing2D, - nullptr, - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); + pthreadpool_parallelize_2d(threadpool.get(), ComputeNothing2D, nullptr, + kParallelize2DRangeI, kParallelize2DRangeJ, + /*flags=*/0); } TEST(Parallelize2D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if 
(pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d( - threadpool.get(), - ComputeNothing2D, - nullptr, - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); + pthreadpool_parallelize_2d(threadpool.get(), ComputeNothing2D, nullptr, + kParallelize2DRangeI, kParallelize2DRangeJ, + /*flags=*/0); } static void CheckBounds2D(void*, size_t i, size_t j) { - EXPECT_LT(i, kParallelize2DRangeI); - EXPECT_LT(j, kParallelize2DRangeJ); + EXPECT_LT(i, kParallelize2DRangeI); + EXPECT_LT(j, kParallelize2DRangeJ); } TEST(Parallelize2D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d( - threadpool.get(), - CheckBounds2D, - nullptr, - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); + pthreadpool_parallelize_2d(threadpool.get(), CheckBounds2D, nullptr, + kParallelize2DRangeI, kParallelize2DRangeJ, + /*flags=*/0); } TEST(Parallelize2D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d( - threadpool.get(), - CheckBounds2D, - nullptr, - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); + pthreadpool_parallelize_2d(threadpool.get(), CheckBounds2D, nullptr, + kParallelize2DRangeI, kParallelize2DRangeJ, + /*flags=*/0); } -static void SetTrue2D(std::atomic_bool* processed_indicators, size_t i, size_t j) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - 
processed_indicators[linear_idx].store(true, std::memory_order_relaxed); +static void SetTrue2D(std::atomic_bool* processed_indicators, size_t i, + size_t j) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); } TEST(Parallelize2D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DRangeI * kParallelize2DRangeJ); + std::vector indicators(kParallelize2DRangeI * + kParallelize2DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d( - threadpool.get(), - reinterpret_cast(SetTrue2D), - static_cast(indicators.data()), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); + pthreadpool_parallelize_2d( + threadpool.get(), reinterpret_cast(SetTrue2D), + static_cast(indicators.data()), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); - for (size_t i = 0; i < kParallelize2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } } TEST(Parallelize2D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DRangeI * kParallelize2DRangeJ); + std::vector indicators(kParallelize2DRangeI * + kParallelize2DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + 
auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d( - threadpool.get(), - reinterpret_cast(SetTrue2D), - static_cast(indicators.data()), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); + pthreadpool_parallelize_2d( + threadpool.get(), reinterpret_cast(SetTrue2D), + static_cast(indicators.data()), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); - for (size_t i = 0; i < kParallelize2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } } -static void Increment2D(std::atomic_int* processed_counters, size_t i, size_t j) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); +static void Increment2D(std::atomic_int* processed_counters, size_t i, + size_t j) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize2D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DRangeI * kParallelize2DRangeJ); + std::vector counters(kParallelize2DRangeI * + kParallelize2DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d( - threadpool.get(), - reinterpret_cast(Increment2D), - static_cast(counters.data()), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); + pthreadpool_parallelize_2d( + threadpool.get(), reinterpret_cast(Increment2D), + static_cast(counters.data()), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); - for (size_t i = 0; i < kParallelize2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } } TEST(Parallelize2D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DRangeI * kParallelize2DRangeJ); + std::vector counters(kParallelize2DRangeI * + kParallelize2DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d( - threadpool.get(), - reinterpret_cast(Increment2D), - static_cast(counters.data()), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); + 
pthreadpool_parallelize_2d( + threadpool.get(), reinterpret_cast(Increment2D), + static_cast(counters.data()), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); - for (size_t i = 0; i < kParallelize2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } } TEST(Parallelize2D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DRangeI * kParallelize2DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d( - threadpool.get(), - reinterpret_cast(Increment2D), - static_cast(counters.data()), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } + std::vector counters(kParallelize2DRangeI * + kParallelize2DRangeJ); + + auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d( + threadpool.get(), reinterpret_cast(Increment2D), + static_cast(counters.data()), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } } TEST(Parallelize2D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DRangeI * kParallelize2DRangeJ); + std::vector counters(kParallelize2DRangeI * + kParallelize2DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d( - threadpool.get(), - reinterpret_cast(Increment2D), - static_cast(counters.data()), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); - } + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d( + threadpool.get(), reinterpret_cast(Increment2D), + static_cast(counters.data()), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); + } - for (size_t i = 0; i < kParallelize2DRangeI; i++) { - for (size_t j = 0; j < 
kParallelize2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } } -static void IncrementSame2D(std::atomic_int* num_processed_items, size_t i, size_t j) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); +static void IncrementSame2D(std::atomic_int* num_processed_items, size_t i, + size_t j) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize2D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d( - threadpool.get(), - reinterpret_cast(IncrementSame2D), - static_cast(&num_processed_items), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DRangeI * kParallelize2DRangeJ); + 
pthreadpool_parallelize_2d( + threadpool.get(), + reinterpret_cast(IncrementSame2D), + static_cast(&num_processed_items), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DRangeI * kParallelize2DRangeJ); } -static void WorkImbalance2D(std::atomic_int* num_processed_items, size_t i, size_t j) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - if (i == 0 && j == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DRangeI * kParallelize2DRangeJ) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } +static void WorkImbalance2D(std::atomic_int* num_processed_items, size_t i, + size_t j) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + if (i == 0 && j == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize2DRangeI * kParallelize2DRangeJ) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize2D, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d( - threadpool.get(), - reinterpret_cast(WorkImbalance2D), - static_cast(&num_processed_items), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DRangeI * kParallelize2DRangeJ); + pthreadpool_parallelize_2d( + 
threadpool.get(), + reinterpret_cast(WorkImbalance2D), + static_cast(&num_processed_items), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DRangeI * kParallelize2DRangeJ); } -static void ComputeNothing2DWithThread(void*, size_t, size_t, size_t) { -} +static void ComputeNothing2DWithThread(void*, size_t, size_t, size_t) {} TEST(Parallelize2DWithThread, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_with_thread(threadpool.get(), - ComputeNothing2DWithThread, - nullptr, - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); + pthreadpool_parallelize_2d_with_thread( + threadpool.get(), ComputeNothing2DWithThread, nullptr, + kParallelize2DRangeI, kParallelize2DRangeJ, /*flags=*/0); } TEST(Parallelize2DWithThread, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_with_thread( - threadpool.get(), - ComputeNothing2DWithThread, - nullptr, - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); + pthreadpool_parallelize_2d_with_thread( + threadpool.get(), ComputeNothing2DWithThread, nullptr, + kParallelize2DRangeI, kParallelize2DRangeJ, /*flags=*/0); } static void CheckBounds2DWithThread(void*, size_t, size_t i, size_t j) { - EXPECT_LT(i, kParallelize2DRangeI); - EXPECT_LT(j, kParallelize2DRangeJ); + EXPECT_LT(i, kParallelize2DRangeI); + EXPECT_LT(j, 
kParallelize2DRangeJ); } TEST(Parallelize2DWithThread, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_with_thread( - threadpool.get(), - CheckBounds2DWithThread, - nullptr, - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); + pthreadpool_parallelize_2d_with_thread( + threadpool.get(), CheckBounds2DWithThread, nullptr, kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); } TEST(Parallelize2DWithThread, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_with_thread( - threadpool.get(), - CheckBounds2DWithThread, - nullptr, - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); + pthreadpool_parallelize_2d_with_thread( + threadpool.get(), CheckBounds2DWithThread, nullptr, kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); } -static void SetTrue2DWithThread(std::atomic_bool* processed_indicators, size_t, size_t i, size_t j) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); +static void SetTrue2DWithThread(std::atomic_bool* processed_indicators, size_t, + size_t i, size_t j) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); } TEST(Parallelize2DWithThread, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DRangeI * 
kParallelize2DRangeJ); + std::vector indicators(kParallelize2DRangeI * + kParallelize2DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_with_thread( - threadpool.get(), - reinterpret_cast(SetTrue2DWithThread), - static_cast(indicators.data()), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); + pthreadpool_parallelize_2d_with_thread( + threadpool.get(), + reinterpret_cast(SetTrue2DWithThread), + static_cast(indicators.data()), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); - for (size_t i = 0; i < kParallelize2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } } TEST(Parallelize2DWithThread, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DRangeI * kParallelize2DRangeJ); + std::vector indicators(kParallelize2DRangeI * + kParallelize2DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_with_thread( - threadpool.get(), - 
reinterpret_cast(SetTrue2DWithThread), - static_cast(indicators.data()), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); + pthreadpool_parallelize_2d_with_thread( + threadpool.get(), + reinterpret_cast(SetTrue2DWithThread), + static_cast(indicators.data()), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); - for (size_t i = 0; i < kParallelize2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } } -static void Increment2DWithThread(std::atomic_int* processed_counters, size_t, size_t i, size_t j) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); +static void Increment2DWithThread(std::atomic_int* processed_counters, size_t, + size_t i, size_t j) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize2DWithThread, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DRangeI * kParallelize2DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_2d_with_thread( - threadpool.get(), - reinterpret_cast(Increment2DWithThread), - static_cast(counters.data()), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DRangeJ; j++) { - const size_t 
linear_idx = i * kParallelize2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } + std::vector counters(kParallelize2DRangeI * + kParallelize2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_with_thread( + threadpool.get(), + reinterpret_cast( + Increment2DWithThread), + static_cast(counters.data()), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } } TEST(Parallelize2DWithThread, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DRangeI * kParallelize2DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_2d_with_thread( - threadpool.get(), - reinterpret_cast(Increment2DWithThread), - static_cast(counters.data()), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } + std::vector 
counters(kParallelize2DRangeI * + kParallelize2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_with_thread( + threadpool.get(), + reinterpret_cast( + Increment2DWithThread), + static_cast(counters.data()), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } } TEST(Parallelize2DWithThread, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DRangeI * kParallelize2DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_with_thread( - threadpool.get(), - reinterpret_cast(Increment2DWithThread), - static_cast(counters.data()), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } + std::vector counters(kParallelize2DRangeI * + kParallelize2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + 
ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_with_thread( + threadpool.get(), + reinterpret_cast( + Increment2DWithThread), + static_cast(counters.data()), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } } TEST(Parallelize2DWithThread, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DRangeI * kParallelize2DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_with_thread( - threadpool.get(), - reinterpret_cast(Increment2DWithThread), - static_cast(counters.data()), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } -} - -static void IncrementSame2DWithThread(std::atomic_int* num_processed_items, size_t, size_t i, size_t j) { - num_processed_items->fetch_add(1, 
std::memory_order_relaxed); + std::vector counters(kParallelize2DRangeI * + kParallelize2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_with_thread( + threadpool.get(), + reinterpret_cast( + Increment2DWithThread), + static_cast(counters.data()), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +static void IncrementSame2DWithThread(std::atomic_int* num_processed_items, + size_t, size_t i, size_t j) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize2DWithThread, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_with_thread( - threadpool.get(), - reinterpret_cast(IncrementSame2DWithThread), - static_cast(&num_processed_items), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); - 
EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DRangeI * kParallelize2DRangeJ); + pthreadpool_parallelize_2d_with_thread( + threadpool.get(), + reinterpret_cast( + IncrementSame2DWithThread), + static_cast(&num_processed_items), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DRangeI * kParallelize2DRangeJ); } -static void WorkImbalance2DWithThread(std::atomic_int* num_processed_items, size_t, size_t i, size_t j) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - if (i == 0 && j == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DRangeI * kParallelize2DRangeJ) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } +static void WorkImbalance2DWithThread(std::atomic_int* num_processed_items, + size_t, size_t i, size_t j) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + if (i == 0 && j == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize2DRangeI * kParallelize2DRangeJ) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize2DWithThread, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_with_thread( - threadpool.get(), - reinterpret_cast(WorkImbalance2DWithThread), - static_cast(&num_processed_items), - 
kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DRangeI * kParallelize2DRangeJ); + pthreadpool_parallelize_2d_with_thread( + threadpool.get(), + reinterpret_cast( + WorkImbalance2DWithThread), + static_cast(&num_processed_items), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DRangeI * kParallelize2DRangeJ); } -static void CheckThreadIndexValid2DWithThread(const size_t* num_threads, size_t thread_index, size_t, size_t) { - EXPECT_LE(thread_index, *num_threads); +static void CheckThreadIndexValid2DWithThread(const size_t* num_threads, + size_t thread_index, size_t, + size_t) { + EXPECT_LE(thread_index, *num_threads); } TEST(Parallelize2DWithThread, MultiThreadPoolThreadIndexValid) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - size_t num_threads = pthreadpool_get_threads_count(threadpool.get()); - if (num_threads <= 1) { - GTEST_SKIP(); - } + size_t num_threads = pthreadpool_get_threads_count(threadpool.get()); + if (num_threads <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_with_thread( - threadpool.get(), - reinterpret_cast(CheckThreadIndexValid2DWithThread), - static_cast(&num_threads), - kParallelize2DRangeI, kParallelize2DRangeJ, - 0 /* flags */); + pthreadpool_parallelize_2d_with_thread( + threadpool.get(), + reinterpret_cast( + CheckThreadIndexValid2DWithThread), + static_cast(&num_threads), kParallelize2DRangeI, + kParallelize2DRangeJ, /*flags=*/0); } -static void ComputeNothing2DTile1D(void*, size_t, size_t, size_t) { -} +static void ComputeNothing2DTile1D(void*, size_t, size_t, size_t) {} TEST(Parallelize2DTile1D, SingleThreadPoolCompletes) { - auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d(threadpool.get(), - ComputeNothing2DTile1D, - nullptr, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d(threadpool.get(), ComputeNothing2DTile1D, + nullptr, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); } TEST(Parallelize2DTile1D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - ComputeNothing2DTile1D, - nullptr, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d(threadpool.get(), ComputeNothing2DTile1D, + nullptr, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); } -static void CheckBounds2DTile1D(void*, size_t i, size_t start_j, size_t tile_j) { - EXPECT_LT(i, kParallelize2DTile1DRangeI); - EXPECT_LT(start_j, kParallelize2DTile1DRangeJ); - EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ); +static void CheckBounds2DTile1D(void*, size_t i, size_t start_j, + size_t tile_j) { + EXPECT_LT(i, kParallelize2DTile1DRangeI); + EXPECT_LT(start_j, kParallelize2DTile1DRangeJ); + EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ); } TEST(Parallelize2DTile1D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - CheckBounds2DTile1D, - nullptr, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d(threadpool.get(), CheckBounds2DTile1D, + nullptr, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); } TEST(Parallelize2DTile1D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - CheckBounds2DTile1D, - nullptr, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d(threadpool.get(), CheckBounds2DTile1D, + nullptr, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); } -static void CheckTiling2DTile1D(void*, size_t i, size_t start_j, size_t tile_j) { - EXPECT_GT(tile_j, 0); - EXPECT_LE(tile_j, kParallelize2DTile1DTileJ); - EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0); - EXPECT_EQ(tile_j, std::min(kParallelize2DTile1DTileJ, kParallelize2DTile1DRangeJ - start_j)); +static void CheckTiling2DTile1D(void*, size_t i, size_t start_j, + size_t tile_j) { + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize2DTile1DTileJ); + EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0); + EXPECT_EQ(tile_j, std::min(kParallelize2DTile1DTileJ, + 
kParallelize2DTile1DRangeJ - start_j)); } TEST(Parallelize2DTile1D, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - CheckTiling2DTile1D, - nullptr, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d(threadpool.get(), CheckTiling2DTile1D, + nullptr, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); } TEST(Parallelize2DTile1D, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - CheckTiling2DTile1D, - nullptr, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d(threadpool.get(), CheckTiling2DTile1D, + nullptr, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); } -static void SetTrue2DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t start_j, size_t tile_j) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } +static void SetTrue2DTile1D(std::atomic_bool* processed_indicators, size_t i, + size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + 
tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } } TEST(Parallelize2DTile1D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); + std::vector indicators(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - reinterpret_cast(SetTrue2DTile1D), - static_cast(indicators.data()), - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue2DTile1D), + static_cast(indicators.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } } TEST(Parallelize2DTile1D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); + std::vector indicators(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(0), 
pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - reinterpret_cast(SetTrue2DTile1D), - static_cast(indicators.data()), - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue2DTile1D), + static_cast(indicators.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } } -static void Increment2DTile1D(std::atomic_int* processed_counters, size_t i, size_t start_j, size_t tile_j) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } +static void Increment2DTile1D(std::atomic_int* processed_counters, size_t i, + size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + 
processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize2DTile1D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment2DTile1D), - static_cast(counters.data()), - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment2DTile1D), + static_cast(counters.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } } TEST(Parallelize2DTile1D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); + std::vector counters(kParallelize2DTile1DRangeI * + 
kParallelize2DTile1DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment2DTile1D), - static_cast(counters.data()), - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment2DTile1D), + static_cast(counters.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } } TEST(Parallelize2DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < 
kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment2DTile1D), - static_cast(counters.data()), - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment2DTile1D), + static_cast(counters.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } } TEST(Parallelize2DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t 
threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment2DTile1D), + static_cast(counters.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +static void IncrementSame2DTile1D(std::atomic_int* num_processed_items, + size_t i, size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } +} - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); +TEST(Parallelize2DTile1D, MultiThreadPoolHighContention) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment2DTile1D), - static_cast(counters.data()), - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + 
GTEST_SKIP(); + } - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast(IncrementSame2DTile1D), + static_cast(&num_processed_items), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); } -static void IncrementSame2DTile1D(std::atomic_int* num_processed_items, size_t i, size_t start_j, size_t tile_j) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } +static void WorkImbalance2DTile1D(std::atomic_int* num_processed_items, + size_t i, size_t start_j, size_t tile_j) { + num_processed_items->fetch_add(tile_j, std::memory_order_relaxed); + if (i == 0 && start_j == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } -TEST(Parallelize2DTile1D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); +TEST(Parallelize2DTile1D, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - auto_pthreadpool_t 
threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast(WorkImbalance2DTile1D), + static_cast(&num_processed_items), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); +} + +static void ComputeNothing2DTile1DDynamic(void*, size_t, size_t, size_t) {} - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } +TEST(Parallelize2DTile1DDynamic, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - reinterpret_cast(IncrementSame2DTile1D), - static_cast(&num_processed_items), - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), ComputeNothing2DTile1DDynamic, nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); } -static void WorkImbalance2DTile1D(std::atomic_int* num_processed_items, size_t i, size_t start_j, size_t tile_j) { - num_processed_items->fetch_add(tile_j, std::memory_order_relaxed); - if (i == 0 && start_j == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } +TEST(Parallelize2DTile1DDynamic, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if 
(pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), ComputeNothing2DTile1DDynamic, nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); } -TEST(Parallelize2DTile1D, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); +static void CheckBounds2DTile1DDynamic(void*, size_t i, size_t start_j, + size_t tile_j) { + EXPECT_LT(i, kParallelize2DTile1DRangeI); + EXPECT_LT(start_j, kParallelize2DTile1DRangeJ); + EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ); +} + +TEST(Parallelize2DTile1DDynamic, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), CheckBounds2DTile1DDynamic, nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); +} + +TEST(Parallelize2DTile1DDynamic, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), CheckBounds2DTile1DDynamic, nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); +} - pthreadpool_parallelize_2d_tile_1d( - threadpool.get(), - reinterpret_cast(WorkImbalance2DTile1D), - static_cast(&num_processed_items), - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), 
kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); +static void CheckTiling2DTile1DDynamic(void*, size_t i, size_t start_j, + size_t tile_j) { + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize2DTile1DRangeJ); + EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0); } -static void ComputeNothing2DTile1DWithUArch(void*, uint32_t, size_t, size_t, size_t) { +TEST(Parallelize2DTile1DDynamic, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), CheckTiling2DTile1DDynamic, nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); } +TEST(Parallelize2DTile1DDynamic, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), CheckTiling2DTile1DDynamic, nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); +} + +static void SetTrue2DTile1DDynamic(std::atomic_bool* processed_indicators, + size_t i, size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } +} + +TEST(Parallelize2DTile1DDynamic, SingleThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue2DTile1DDynamic), + static_cast(indicators.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, 
kParallelize2DTile1DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +TEST(Parallelize2DTile1DDynamic, MultiThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue2DTile1DDynamic), + static_cast(indicators.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +static void Increment2DTile1DDynamic(std::atomic_int* processed_counters, + size_t i, size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } +} + +TEST(Parallelize2DTile1DDynamic, SingleThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment2DTile1DDynamic), + 
static_cast(counters.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile1DDynamic, MultiThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment2DTile1DDynamic), + static_cast(counters.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile1DDynamic, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast( + 
Increment2DTile1DDynamic), + static_cast(counters.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +TEST(Parallelize2DTile1DDynamic, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast( + Increment2DTile1DDynamic), + static_cast(counters.data()), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +static void IncrementSame2DTile1DDynamic(std::atomic_int* num_processed_items, + size_t i, size_t start_j, + size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + 
num_processed_items->fetch_add(1, std::memory_order_relaxed); + } +} + +TEST(Parallelize2DTile1DDynamic, MultiThreadPoolHighContention) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast( + IncrementSame2DTile1DDynamic), + static_cast(&num_processed_items), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); +} + +static void WorkImbalance2DTile1DDynamic(std::atomic_int* num_processed_items, + size_t i, size_t start_j, + size_t tile_j) { + num_processed_items->fetch_add(tile_j, std::memory_order_relaxed); + if (i == 0 && start_j == 0) { + /* Sleep for a second. This differs from the non-dynamic `WorkImbalance*` + * strategies in that a thread may reserve more elements than fit into a + * single work function call. Blocking a single work function call will also + * block any potentially remaining elements allocated to that thread. 
We + * therefore just sleep for a second instead.*/ + std::this_thread::sleep_for(std::chrono::seconds(1)); + } +} + +TEST(Parallelize2DTile1DDynamic, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast( + WorkImbalance2DTile1DDynamic), + static_cast(&num_processed_items), kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); +} + +static void ComputeNothing2DTile1DWithUArch(void*, uint32_t, size_t, size_t, + size_t) {} + TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch(threadpool.get(), - ComputeNothing2DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), ComputeNothing2DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if 
(pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), - ComputeNothing2DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), ComputeNothing2DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } -static void CheckUArch2DTile1DWithUArch(void*, uint32_t uarch_index, size_t, size_t, size_t) { - if (uarch_index != kDefaultUArchIndex) { - EXPECT_LE(uarch_index, kMaxUArchIndex); - } +static void CheckUArch2DTile1DWithUArch(void*, uint32_t uarch_index, size_t, + size_t, size_t) { + if (uarch_index != kDefaultUArchIndex) { + EXPECT_LE(uarch_index, kMaxUArchIndex); + } } TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), - CheckUArch2DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), CheckUArch2DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - 
ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), - CheckUArch2DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), CheckUArch2DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } -static void CheckBounds2DTile1DWithUArch(void*, uint32_t, size_t i, size_t start_j, size_t tile_j) { - EXPECT_LT(i, kParallelize2DTile1DRangeI); - EXPECT_LT(start_j, kParallelize2DTile1DRangeJ); - EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ); +static void CheckBounds2DTile1DWithUArch(void*, uint32_t, size_t i, + size_t start_j, size_t tile_j) { + EXPECT_LT(i, kParallelize2DTile1DRangeI); + EXPECT_LT(start_j, kParallelize2DTile1DRangeJ); + EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ); } TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), - CheckBounds2DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), CheckBounds2DTile1DWithUArch, nullptr, + kDefaultUArchIndex, 
kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), - CheckBounds2DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), CheckBounds2DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } -static void CheckTiling2DTile1DWithUArch(void*, uint32_t, size_t i, size_t start_j, size_t tile_j) { - EXPECT_GT(tile_j, 0); - EXPECT_LE(tile_j, kParallelize2DTile1DTileJ); - EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0); - EXPECT_EQ(tile_j, std::min(kParallelize2DTile1DTileJ, kParallelize2DTile1DRangeJ - start_j)); +static void CheckTiling2DTile1DWithUArch(void*, uint32_t, size_t i, + size_t start_j, size_t tile_j) { + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize2DTile1DTileJ); + EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0); + EXPECT_EQ(tile_j, std::min(kParallelize2DTile1DTileJ, + kParallelize2DTile1DRangeJ - start_j)); } TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + 
ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), - CheckTiling2DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), CheckTiling2DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), - CheckTiling2DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), CheckTiling2DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } -static void SetTrue2DTile1DWithUArch(std::atomic_bool* processed_indicators, uint32_t, size_t i, size_t start_j, size_t tile_j) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } +static void SetTrue2DTile1DWithUArch(std::atomic_bool* processed_indicators, + uint32_t, size_t i, size_t start_j, + size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; 
j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } } TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); + std::vector indicators(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(SetTrue2DTile1DWithUArch), - static_cast(indicators.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue2DTile1DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } } TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); - - 
auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(SetTrue2DTile1DWithUArch), - static_cast(indicators.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } -} - -static void Increment2DTile1DWithUArch(std::atomic_int* processed_counters, uint32_t, size_t i, size_t start_j, size_t tile_j) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } + std::vector indicators(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue2DTile1DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << 
j << ") not processed"; + } + } +} + +static void Increment2DTile1DWithUArch(std::atomic_int* processed_counters, + uint32_t, size_t i, size_t start_j, + size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment2DTile1DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * 
kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } } TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment2DTile1DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } -} - -TEST(Parallelize2DTile1DWithUArch, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment2DTile1DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - } - - for (size_t i = 0; i < 
kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } -} - -TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment2DTile1DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } -} - -static void IncrementSame2DTile1DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i, size_t start_j, size_t tile_j) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + 
auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile1DWithUArch, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations 
<< ")"; + } + } +} + +TEST(Parallelize2DTile1DWithUArch, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +static void IncrementSame2DTile1DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t i, size_t start_j, + size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(IncrementSame2DTile1DWithUArch), - static_cast(&num_processed_items), - kDefaultUArchIndex, 
kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); -} - -static void WorkImbalance2DTile1DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i, size_t start_j, size_t tile_j) { - num_processed_items->fetch_add(tile_j, std::memory_order_relaxed); - if (i == 0 && start_j == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + IncrementSame2DTile1DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); +} + +static void WorkImbalance2DTile1DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t i, size_t start_j, + size_t tile_j) { + num_processed_items->fetch_add(tile_j, std::memory_order_relaxed); + if (i == 0 && start_j == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize2DTile1DWithUArch, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = 
ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(WorkImbalance2DTile1DWithUArch), - static_cast(&num_processed_items), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); + pthreadpool_parallelize_2d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + WorkImbalance2DTile1DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); } -static void ComputeNothing2DTile1DWithUArchWithThread(void*, uint32_t, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing2DTile1DWithUArchWithThread(void*, uint32_t, size_t, + size_t, size_t, size_t) {} TEST(Parallelize2DTile1DWithUArchWithThread, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread(threadpool.get(), - ComputeNothing2DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - 
kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), ComputeNothing2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - ComputeNothing2DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), ComputeNothing2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } -static void CheckUArch2DTile1DWithUArchWithThread(void*, uint32_t uarch_index, size_t, size_t, size_t, size_t) { - if (uarch_index != kDefaultUArchIndex) { - EXPECT_LE(uarch_index, kMaxUArchIndex); - } +static void CheckUArch2DTile1DWithUArchWithThread(void*, uint32_t uarch_index, + size_t, size_t, size_t, + size_t) { + if (uarch_index != kDefaultUArchIndex) { + EXPECT_LE(uarch_index, kMaxUArchIndex); + } } TEST(Parallelize2DTile1DWithUArchWithThread, SingleThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + 
auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - CheckUArch2DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckUArch2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - CheckUArch2DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckUArch2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } -static void CheckBounds2DTile1DWithUArchWithThread(void*, uint32_t, size_t, size_t i, size_t start_j, size_t tile_j) { - EXPECT_LT(i, kParallelize2DTile1DRangeI); - EXPECT_LT(start_j, kParallelize2DTile1DRangeJ); - EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ); +static void CheckBounds2DTile1DWithUArchWithThread(void*, uint32_t, 
size_t, + size_t i, size_t start_j, + size_t tile_j) { + EXPECT_LT(i, kParallelize2DTile1DRangeI); + EXPECT_LT(start_j, kParallelize2DTile1DRangeJ); + EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ); } TEST(Parallelize2DTile1DWithUArchWithThread, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - CheckBounds2DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckBounds2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - CheckBounds2DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckBounds2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + 
kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } -static void CheckTiling2DTile1DWithUArchWithThread(void*, uint32_t, size_t, size_t i, size_t start_j, size_t tile_j) { - EXPECT_GT(tile_j, 0); - EXPECT_LE(tile_j, kParallelize2DTile1DTileJ); - EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0); - EXPECT_EQ(tile_j, std::min(kParallelize2DTile1DTileJ, kParallelize2DTile1DRangeJ - start_j)); +static void CheckTiling2DTile1DWithUArchWithThread(void*, uint32_t, size_t, + size_t i, size_t start_j, + size_t tile_j) { + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize2DTile1DTileJ); + EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0); + EXPECT_EQ(tile_j, std::min(kParallelize2DTile1DTileJ, + kParallelize2DTile1DRangeJ - start_j)); } TEST(Parallelize2DTile1DWithUArchWithThread, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - CheckTiling2DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckTiling2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if 
(pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - CheckTiling2DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckTiling2DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile1DRangeI, + kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, /*flags=*/0); } -static void SetTrue2DTile1DWithUArchWithThread(std::atomic_bool* processed_indicators, uint32_t, size_t, size_t i, size_t start_j, size_t tile_j) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } +static void SetTrue2DTile1DWithUArchWithThread( + std::atomic_bool* processed_indicators, uint32_t, size_t, size_t i, + size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } } -TEST(Parallelize2DTile1DWithUArchWithThread, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); +TEST(Parallelize2DTile1DWithUArchWithThread, + SingleThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - 
reinterpret_cast(SetTrue2DTile1DWithUArchWithThread), - static_cast(indicators.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + SetTrue2DTile1DWithUArchWithThread), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } } TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(SetTrue2DTile1DWithUArchWithThread), - static_cast(indicators.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t 
linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } -} - -static void Increment2DTile1DWithUArchWithThread(std::atomic_int* processed_counters, uint32_t, size_t, size_t i, size_t start_j, size_t tile_j) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } -} - -TEST(Parallelize2DTile1DWithUArchWithThread, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(Increment2DTile1DWithUArchWithThread), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } -} - -TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - 
reinterpret_cast(Increment2DTile1DWithUArchWithThread), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } -} - -TEST(Parallelize2DTile1DWithUArchWithThread, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(Increment2DTile1DWithUArchWithThread), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } -} - -TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); - - auto_pthreadpool_t 
threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(Increment2DTile1DWithUArchWithThread), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } -} - -static void IncrementSame2DTile1DWithUArchWithThread(std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, size_t start_j, size_t tile_j) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } + std::vector indicators(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + SetTrue2DTile1DWithUArchWithThread), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < 
kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +static void Increment2DTile1DWithUArchWithThread( + std::atomic_int* processed_counters, uint32_t, size_t, size_t i, + size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } +} + +TEST(Parallelize2DTile1DWithUArchWithThread, + SingleThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + Increment2DTile1DWithUArchWithThread), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile1DWithUArchWithThread, + MultiThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + 
pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + Increment2DTile1DWithUArchWithThread), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile1DWithUArchWithThread, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + Increment2DTile1DWithUArchWithThread), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +TEST(Parallelize2DTile1DWithUArchWithThread, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector 
counters(kParallelize2DTile1DRangeI * + kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + Increment2DTile1DWithUArchWithThread), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +static void IncrementSame2DTile1DWithUArchWithThread( + std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, + size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(IncrementSame2DTile1DWithUArchWithThread), - static_cast(&num_processed_items), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, 
kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); -} - -static void WorkImbalance2DTile1DWithUArchWithThread(std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, size_t start_j, size_t tile_j) { - num_processed_items->fetch_add(tile_j, std::memory_order_relaxed); - if (i == 0 && start_j == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + IncrementSame2DTile1DWithUArchWithThread), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); +} + +static void WorkImbalance2DTile1DWithUArchWithThread( + std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, + size_t start_j, size_t tile_j) { + num_processed_items->fetch_add(tile_j, std::memory_order_relaxed); + if (i == 0 && start_j == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolWorkStealing) { - std::atomic_int 
num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(WorkImbalance2DTile1DWithUArchWithThread), - static_cast(&num_processed_items), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + WorkImbalance2DTile1DWithUArchWithThread), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); } -static void SetThreadTrue2DTile1DWithUArchWithThread(const size_t* num_threads, uint32_t, size_t thread_index, size_t i, size_t start_j, size_t tile_j) { - EXPECT_LE(thread_index, *num_threads); +static void SetThreadTrue2DTile1DWithUArchWithThread(const size_t* num_threads, + uint32_t, + size_t thread_index, + size_t i, size_t start_j, + size_t tile_j) { + EXPECT_LE(thread_index, *num_threads); } TEST(Parallelize2DTile1DWithUArchWithThread, MultiThreadPoolThreadIndexValid) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t 
threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - size_t num_threads = pthreadpool_get_threads_count(threadpool.get()); + size_t num_threads = pthreadpool_get_threads_count(threadpool.get()); - pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(SetThreadTrue2DTile1DWithUArchWithThread), - static_cast(&num_threads), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + SetThreadTrue2DTile1DWithUArchWithThread), + static_cast(&num_threads), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, + kParallelize2DTile1DTileJ, /*flags=*/0); } -static void ComputeNothing2DTile2D(void*, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing2DTile2D(void*, size_t, size_t, size_t, size_t) {} TEST(Parallelize2DTile2D, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d(threadpool.get(), - ComputeNothing2DTile2D, - nullptr, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), ComputeNothing2DTile2D, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); } TEST(Parallelize2DTile2D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + 
ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - ComputeNothing2DTile2D, - nullptr, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), ComputeNothing2DTile2D, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); } -static void CheckBounds2DTile2D(void*, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - EXPECT_LT(start_i, kParallelize2DTile2DRangeI); - EXPECT_LT(start_j, kParallelize2DTile2DRangeJ); - EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI); - EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ); +static void CheckBounds2DTile2D(void*, size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { + EXPECT_LT(start_i, kParallelize2DTile2DRangeI); + EXPECT_LT(start_j, kParallelize2DTile2DRangeJ); + EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI); + EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ); } TEST(Parallelize2DTile2D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - CheckBounds2DTile2D, - nullptr, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), CheckBounds2DTile2D, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, 
/*flags=*/0); } TEST(Parallelize2DTile2D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - CheckBounds2DTile2D, - nullptr, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), CheckBounds2DTile2D, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); } -static void CheckTiling2DTile2D(void*, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - EXPECT_GT(tile_i, 0); - EXPECT_LE(tile_i, kParallelize2DTile2DTileI); - EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0); - EXPECT_EQ(tile_i, std::min(kParallelize2DTile2DTileI, kParallelize2DTile2DRangeI - start_i)); +static void CheckTiling2DTile2D(void*, size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { + EXPECT_GT(tile_i, 0); + EXPECT_LE(tile_i, kParallelize2DTile2DTileI); + EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0); + EXPECT_EQ(tile_i, std::min(kParallelize2DTile2DTileI, + kParallelize2DTile2DRangeI - start_i)); - EXPECT_GT(tile_j, 0); - EXPECT_LE(tile_j, kParallelize2DTile2DTileJ); - EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0); - EXPECT_EQ(tile_j, std::min(kParallelize2DTile2DTileJ, kParallelize2DTile2DRangeJ - start_j)); + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize2DTile2DTileJ); + EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0); + EXPECT_EQ(tile_j, std::min(kParallelize2DTile2DTileJ, + kParallelize2DTile2DRangeJ - 
start_j)); } TEST(Parallelize2DTile2D, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - CheckTiling2DTile2D, - nullptr, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), CheckTiling2DTile2D, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); } TEST(Parallelize2DTile2D, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - CheckTiling2DTile2D, - nullptr, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), CheckTiling2DTile2D, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); } -static void SetTrue2DTile2D(std::atomic_bool* processed_indicators, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - processed_indicators[linear_idx].store(true, 
std::memory_order_relaxed); - } - } +static void SetTrue2DTile2D(std::atomic_bool* processed_indicators, + size_t start_i, size_t start_j, size_t tile_i, + size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } } TEST(Parallelize2DTile2D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); + std::vector indicators(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - reinterpret_cast(SetTrue2DTile2D), - static_cast(indicators.data()), - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + reinterpret_cast(SetTrue2DTile2D), + static_cast(indicators.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i 
<< ", " << j << ") not processed"; + } + } } TEST(Parallelize2DTile2D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); + std::vector indicators(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + reinterpret_cast(SetTrue2DTile2D), + static_cast(indicators.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +static void Increment2DTile2D(std::atomic_int* processed_counters, + size_t start_i, size_t start_j, size_t tile_i, + size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } +} - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); +TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment2DTile2D), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + 
kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } +TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment2DTile2D), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - reinterpret_cast(SetTrue2DTile2D), - static_cast(indicators.data()), - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); +TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile2DRangeI * + 
kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment2DTile2D), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } +TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment2DTile2D), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); + } + + for (size_t i = 0; i < 
kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +static void IncrementSame2DTile2D(std::atomic_int* num_processed_items, + size_t start_i, size_t start_j, size_t tile_i, + size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } } -static void Increment2DTile2D(std::atomic_int* processed_counters, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - } +TEST(Parallelize2DTile2D, MultiThreadPoolHighContention) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + reinterpret_cast(IncrementSame2DTile2D), + static_cast(&num_processed_items), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); +} + +static void WorkImbalance2DTile2D(std::atomic_int* num_processed_items, + size_t start_i, size_t start_j, size_t tile_i, + 
size_t tile_j) { + num_processed_items->fetch_add(tile_i * tile_j, std::memory_order_relaxed); + if (start_i == 0 && start_j == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } -TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); +TEST(Parallelize2DTile2D, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment2DTile2D), - static_cast(counters.data()), - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + reinterpret_cast(WorkImbalance2DTile2D), + static_cast(&num_processed_items), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); } 
-TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); +static void ComputeNothing2DDynamic(void*, size_t, size_t, size_t, size_t) {} - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); +TEST(Parallelize2DTile2DDynamic, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), ComputeNothing2DDynamic, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); +} - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment2DTile2D), - static_cast(counters.data()), - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); +TEST(Parallelize2DTile2DDynamic, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } -} + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } -TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - 
ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment2DTile2D), - static_cast(counters.data()), - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), ComputeNothing2DDynamic, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); } -TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment2DTile2D), - static_cast(counters.data()), - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - 
EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } -} - -static void IncrementSame2DTile2D(std::atomic_int* num_processed_items, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } - } +static void CheckBounds2DDynamic(void*, size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { + EXPECT_LT(start_i, kParallelize2DTile2DRangeI); + EXPECT_LT(start_j, kParallelize2DTile2DRangeJ); + EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI); + EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ); } -TEST(Parallelize2DTile2D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); +TEST(Parallelize2DTile2DDynamic, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), CheckBounds2DDynamic, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); +} - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } +TEST(Parallelize2DTile2DDynamic, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - reinterpret_cast(IncrementSame2DTile2D), - static_cast(&num_processed_items), - kParallelize2DTile2DRangeI, 
kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), CheckBounds2DDynamic, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); } -static void WorkImbalance2DTile2D(std::atomic_int* num_processed_items, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - num_processed_items->fetch_add(tile_i * tile_j, std::memory_order_relaxed); - if (start_i == 0 && start_j == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } +static void CheckTiling2DDynamic(void*, size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { + EXPECT_GT(tile_i, 0); + EXPECT_LE(tile_i, kParallelize2DTile2DRangeI); + EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0); + + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize2DTile2DRangeJ); + EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0); } -TEST(Parallelize2DTile2D, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); +TEST(Parallelize2DTile2DDynamic, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), CheckTiling2DDynamic, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, 
kParallelize2DTile2DTileJ, /*flags=*/0); +} - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } +TEST(Parallelize2DTile2DDynamic, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d( - threadpool.get(), - reinterpret_cast(WorkImbalance2DTile2D), - static_cast(&num_processed_items), - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), CheckTiling2DDynamic, nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); +} + +static void SetTrue2DDynamic(std::atomic_bool* processed_indicators, + size_t start_i, size_t start_j, size_t tile_i, + size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize2DTile2DDynamic, SingleThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(SetTrue2DDynamic), + static_cast(indicators.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); + + for (size_t i = 0; i < 
kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +TEST(Parallelize2DTile2DDynamic, MultiThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(SetTrue2DDynamic), + static_cast(indicators.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +static void Increment2DDynamic(std::atomic_int* processed_counters, + size_t start_i, size_t start_j, size_t tile_i, + size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize2DTile2DDynamic, SingleThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + 
reinterpret_cast(Increment2DDynamic), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile2DDynamic, MultiThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(Increment2DDynamic), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile2DDynamic, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; 
iteration++) { + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(Increment2DDynamic), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +TEST(Parallelize2DTile2DDynamic, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(Increment2DDynamic), + static_cast(counters.data()), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +static void IncrementSame2DDynamic(std::atomic_int* 
num_processed_items, + size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize2DTile2DDynamic, MultiThreadPoolHighContention) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(IncrementSame2DDynamic), + static_cast(&num_processed_items), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); +} + +static void WorkImbalance2DDynamic(std::atomic_int* num_processed_items, + size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { + num_processed_items->fetch_add(tile_i * tile_j, std::memory_order_relaxed); + if (start_i == 0 && start_j == 0) { + /* Sleep for a second. This differs from the non-dynamic `WorkImbalance*` + * strategies in that a thread may reserve more elements than fit into a + * single work function call. Blocking a single work function call will also + * block any potentially remaining elements allocated to that thread. 
We + * therefore just sleep for a second instead.*/ + std::this_thread::sleep_for(std::chrono::seconds(1)); + } +} + +TEST(Parallelize2DTile2DDynamic, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(WorkImbalance2DDynamic), + static_cast(&num_processed_items), kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); +} + +static void ComputeNothing2DTile2DDynamicWithUArch(void*, uint32_t, size_t, + size_t, size_t, size_t) {} + +TEST(Parallelize2DTile2DDynamicWithUArch, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), ComputeNothing2DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); +} + +TEST(Parallelize2DTile2DDynamicWithUArch, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), ComputeNothing2DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); +} + 
+static void CheckUArch2DTile2DDynamicWithUArch(void*, uint32_t uarch_index, + size_t, size_t, size_t, size_t) { + if (uarch_index != kDefaultUArchIndex) { + EXPECT_LE(uarch_index, kMaxUArchIndex); + } } -static void ComputeNothing2DTile2DWithUArch(void*, uint32_t, size_t, size_t, size_t, size_t) { +TEST(Parallelize2DTile2DDynamicWithUArch, SingleThreadPoolUArchInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckUArch2DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); +} + +TEST(Parallelize2DTile2DDynamicWithUArch, MultiThreadPoolUArchInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckUArch2DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); +} + +static void CheckBounds2DTile2DDynamicWithUArch(void*, uint32_t, size_t start_i, + size_t start_j, size_t tile_i, + size_t tile_j) { + EXPECT_LT(start_i, kParallelize2DTile2DRangeI); + EXPECT_LT(start_j, kParallelize2DTile2DRangeJ); + EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI); + EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ); +} + +TEST(Parallelize2DTile2DDynamicWithUArch, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), 
CheckBounds2DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); } +TEST(Parallelize2DTile2DDynamicWithUArch, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckBounds2DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); +} + +static void CheckTiling2DTile2DDynamicWithUArch(void*, uint32_t, size_t start_i, + size_t start_j, size_t tile_i, + size_t tile_j) { + EXPECT_GT(tile_i, 0); + EXPECT_TRUE(tile_i % kParallelize2DTile2DTileI == 0 || + tile_i == kParallelize2DTile2DRangeI - start_i); + EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0); + EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI); + + EXPECT_GT(tile_j, 0); + EXPECT_TRUE(tile_j % kParallelize2DTile2DTileJ == 0 || + tile_j == kParallelize2DTile2DRangeJ - start_j); + EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0); + EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ); +} + +TEST(Parallelize2DTile2DDynamicWithUArch, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckTiling2DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); +} + +TEST(Parallelize2DTile2DDynamicWithUArch, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t 
threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckTiling2DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); +} + +static void SetTrue2DTile2DDynamicWithUArch( + std::atomic_bool* processed_indicators, uint32_t, size_t start_i, + size_t start_j, size_t tile_i, size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize2DTile2DDynamicWithUArch, SingleThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue2DTile2DDynamicWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +TEST(Parallelize2DTile2DDynamicWithUArch, MultiThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + 
+ auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue2DTile2DDynamicWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +static void Increment2DTile2DDynamicWithUArch( + std::atomic_int* processed_counters, uint32_t, size_t start_i, + size_t start_j, size_t tile_i, size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize2DTile2DDynamicWithUArch, + SingleThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile2DDynamicWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < 
kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile2DDynamicWithUArch, + MultiThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile2DDynamicWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile2DDynamicWithUArch, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile2DDynamicWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, 
kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, + /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +TEST(Parallelize2DTile2DDynamicWithUArch, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile2DDynamicWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, + /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +static void IncrementSame2DTile2DDynamicWithUArch( + std::atomic_int* num_processed_items, uint32_t, size_t start_i, + 
size_t start_j, size_t tile_i, size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize2DTile2DDynamicWithUArch, MultiThreadPoolHighContention) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + IncrementSame2DTile2DDynamicWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); +} + +static void WorkImbalance2DTile2DDynamicWithUArch( + std::atomic_int* num_processed_items, uint32_t, size_t start_i, + size_t start_j, size_t tile_i, size_t tile_j) { + num_processed_items->fetch_add(tile_i * tile_j, std::memory_order_relaxed); + if (start_i == 0 && start_j == 0) { + /* Sleep for a second. This differs from the non-dynamic `WorkImbalance*` + * strategies in that a thread may reserve more elements than fit into a + * single work function call. Blocking a single work function call will also + * block any potentially remaining elements allocated to that thread. 
We + * therefore just sleep for a second instead.*/ + std::this_thread::sleep_for(std::chrono::seconds(1)); + } +} + +TEST(Parallelize2DTile2DDynamicWithUArch, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + WorkImbalance2DTile2DDynamicWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); +} + +static void ComputeNothing2DTile2DWithUArch(void*, uint32_t, size_t, size_t, + size_t, size_t) {} + TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d_with_uarch(threadpool.get(), - ComputeNothing2DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), ComputeNothing2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); } TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), 
pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), - ComputeNothing2DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), ComputeNothing2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); } -static void CheckUArch2DTile2DWithUArch(void*, uint32_t uarch_index, size_t, size_t, size_t, size_t) { - if (uarch_index != kDefaultUArchIndex) { - EXPECT_LE(uarch_index, kMaxUArchIndex); - } +static void CheckUArch2DTile2DWithUArch(void*, uint32_t uarch_index, size_t, + size_t, size_t, size_t) { + if (uarch_index != kDefaultUArchIndex) { + EXPECT_LE(uarch_index, kMaxUArchIndex); + } } TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), - CheckUArch2DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), CheckUArch2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, 
kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); } TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), - CheckUArch2DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), CheckUArch2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); } -static void CheckBounds2DTile2DWithUArch(void*, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - EXPECT_LT(start_i, kParallelize2DTile2DRangeI); - EXPECT_LT(start_j, kParallelize2DTile2DRangeJ); - EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI); - EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ); +static void CheckBounds2DTile2DWithUArch(void*, uint32_t, size_t start_i, + size_t start_j, size_t tile_i, + size_t tile_j) { + EXPECT_LT(start_i, kParallelize2DTile2DRangeI); + EXPECT_LT(start_j, kParallelize2DTile2DRangeJ); + EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI); + EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ); } TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - 
ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), - CheckBounds2DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), CheckBounds2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); } TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), - CheckBounds2DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), CheckBounds2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); } -static void CheckTiling2DTile2DWithUArch(void*, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - EXPECT_GT(tile_i, 0); - EXPECT_LE(tile_i, kParallelize2DTile2DTileI); - EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0); - EXPECT_EQ(tile_i, 
std::min(kParallelize2DTile2DTileI, kParallelize2DTile2DRangeI - start_i)); +static void CheckTiling2DTile2DWithUArch(void*, uint32_t, size_t start_i, + size_t start_j, size_t tile_i, + size_t tile_j) { + EXPECT_GT(tile_i, 0); + EXPECT_LE(tile_i, kParallelize2DTile2DTileI); + EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0); + EXPECT_EQ(tile_i, std::min(kParallelize2DTile2DTileI, + kParallelize2DTile2DRangeI - start_i)); - EXPECT_GT(tile_j, 0); - EXPECT_LE(tile_j, kParallelize2DTile2DTileJ); - EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0); - EXPECT_EQ(tile_j, std::min(kParallelize2DTile2DTileJ, kParallelize2DTile2DRangeJ - start_j)); + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize2DTile2DTileJ); + EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0); + EXPECT_EQ(tile_j, std::min(kParallelize2DTile2DTileJ, + kParallelize2DTile2DRangeJ - start_j)); } TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), - CheckTiling2DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), CheckTiling2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); } TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + 
ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), - CheckTiling2DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), CheckTiling2DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize2DTile2DRangeI, + kParallelize2DTile2DRangeJ, kParallelize2DTile2DTileI, + kParallelize2DTile2DTileJ, /*flags=*/0); } -static void SetTrue2DTile2DWithUArch(std::atomic_bool* processed_indicators, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } - } +static void SetTrue2DTile2DWithUArch(std::atomic_bool* processed_indicators, + uint32_t, size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } } TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); + std::vector indicators(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), 
pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(SetTrue2DTile2DWithUArch), - static_cast(indicators.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue2DTile2DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } } TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(SetTrue2DTile2DWithUArch), - static_cast(indicators.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags 
*/); - - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ") not processed"; - } - } -} - -static void Increment2DTile2DWithUArch(std::atomic_int* processed_counters, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - } + std::vector indicators(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue2DTile2DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +static void Increment2DTile2DWithUArch(std::atomic_int* processed_counters, + uint32_t, size_t start_i, size_t start_j, + size_t tile_i, size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + 
processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } } TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment2DTile2DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } } 
TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment2DTile2DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } -} - -TEST(Parallelize2DTile2DWithUArch, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment2DTile2DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - 
EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } -} - -TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment2DTile2DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { - const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } -} - -static void IncrementSame2DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - for (size_t i = start_i; i < start_i + tile_i; i++) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } - } + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), 
pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile2DWithUArch, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations 
<< ")"; + } + } +} + +TEST(Parallelize2DTile2DWithUArch, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize2DTile2DRangeI * + kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment2DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +static void IncrementSame2DTile2DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t start_i, + size_t start_j, size_t tile_i, + size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } } TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), - 
reinterpret_cast(IncrementSame2DTile2DWithUArch), - static_cast(&num_processed_items), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); -} - -static void WorkImbalance2DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { - num_processed_items->fetch_add(tile_i * tile_j, std::memory_order_relaxed); - if (start_i == 0 && start_j == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + IncrementSame2DTile2DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); +} + +static void WorkImbalance2DTile2DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t start_i, + size_t start_j, size_t tile_i, + size_t tile_j) { + num_processed_items->fetch_add(tile_i * tile_j, std::memory_order_relaxed); + if (start_i == 0 && start_j == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + 
kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize2DTile2DWithUArch, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_2d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(WorkImbalance2DTile2DWithUArch), - static_cast(&num_processed_items), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, - kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); + pthreadpool_parallelize_2d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + WorkImbalance2DTile2DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); } -static void ComputeNothing3D(void*, size_t, size_t, size_t) { -} +static void ComputeNothing3D(void*, size_t, size_t, size_t) {} TEST(Parallelize3D, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - 
pthreadpool_parallelize_3d(threadpool.get(), - ComputeNothing3D, - nullptr, - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK, - 0 /* flags */); + pthreadpool_parallelize_3d(threadpool.get(), ComputeNothing3D, nullptr, + kParallelize3DRangeI, kParallelize3DRangeJ, + kParallelize3DRangeK, /*flags=*/0); } TEST(Parallelize3D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d( - threadpool.get(), - ComputeNothing3D, - nullptr, - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK, - 0 /* flags */); + pthreadpool_parallelize_3d(threadpool.get(), ComputeNothing3D, nullptr, + kParallelize3DRangeI, kParallelize3DRangeJ, + kParallelize3DRangeK, /*flags=*/0); } static void CheckBounds3D(void*, size_t i, size_t j, size_t k) { - EXPECT_LT(i, kParallelize3DRangeI); - EXPECT_LT(j, kParallelize3DRangeJ); - EXPECT_LT(k, kParallelize3DRangeK); + EXPECT_LT(i, kParallelize3DRangeI); + EXPECT_LT(j, kParallelize3DRangeJ); + EXPECT_LT(k, kParallelize3DRangeK); } TEST(Parallelize3D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d( - threadpool.get(), - CheckBounds3D, - nullptr, - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK, - 0 /* flags */); + pthreadpool_parallelize_3d(threadpool.get(), CheckBounds3D, nullptr, + kParallelize3DRangeI, kParallelize3DRangeJ, + kParallelize3DRangeK, /*flags=*/0); } TEST(Parallelize3D, 
MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d( - threadpool.get(), - CheckBounds3D, - nullptr, - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK, - 0 /* flags */); + pthreadpool_parallelize_3d(threadpool.get(), CheckBounds3D, nullptr, + kParallelize3DRangeI, kParallelize3DRangeJ, + kParallelize3DRangeK, /*flags=*/0); } -static void SetTrue3D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k) { - const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); +static void SetTrue3D(std::atomic_bool* processed_indicators, size_t i, + size_t j, size_t k) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); } TEST(Parallelize3D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + std::vector indicators( + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d( - threadpool.get(), - reinterpret_cast(SetTrue3D), - static_cast(indicators.data()), - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK, - 0 /* flags */); + pthreadpool_parallelize_3d( + threadpool.get(), 
reinterpret_cast(SetTrue3D), + static_cast(indicators.data()), kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK, /*flags=*/0); - for (size_t i = 0; i < kParallelize3DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } + for (size_t i = 0; i < kParallelize3DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } } TEST(Parallelize3D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + std::vector indicators( + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d( - threadpool.get(), - reinterpret_cast(SetTrue3D), - static_cast(indicators.data()), - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK, - 0 /* flags */); + pthreadpool_parallelize_3d( + threadpool.get(), reinterpret_cast(SetTrue3D), + static_cast(indicators.data()), kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK, /*flags=*/0); - for (size_t i = 0; 
i < kParallelize3DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } + for (size_t i = 0; i < kParallelize3DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } } -static void Increment3D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k) { - const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); +static void Increment3D(std::atomic_int* processed_counters, size_t i, size_t j, + size_t k) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize3D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d( - threadpool.get(), - reinterpret_cast(Increment3D), - static_cast(counters.data()), - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DRangeJ + j) * 
kParallelize3DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } + std::vector counters( + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d( + threadpool.get(), reinterpret_cast(Increment3D), + static_cast(counters.data()), kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } } TEST(Parallelize3D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d( - threadpool.get(), - reinterpret_cast(Increment3D), - static_cast(counters.data()), - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; - 
EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } + std::vector counters( + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d( + threadpool.get(), reinterpret_cast(Increment3D), + static_cast(counters.data()), kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } } TEST(Parallelize3D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d( - threadpool.get(), - reinterpret_cast(Increment3D), - static_cast(counters.data()), - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize3DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DRangeJ + j) * 
kParallelize3DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } + std::vector counters( + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d( + threadpool.get(), reinterpret_cast(Increment3D), + static_cast(counters.data()), kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } } TEST(Parallelize3D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d( - threadpool.get(), - reinterpret_cast(Increment3D), - static_cast(counters.data()), - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK, - 0 /* flags */); - } - - 
for (size_t i = 0; i < kParallelize3DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } -} - -static void IncrementSame3D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); + std::vector counters( + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d( + threadpool.get(), reinterpret_cast(Increment3D), + static_cast(counters.data()), kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DRangeJ + j) * kParallelize3DRangeK + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +static void IncrementSame3D(std::atomic_int* num_processed_items, size_t i, + size_t j, size_t k) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); } 
TEST(Parallelize3D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d( - threadpool.get(), - reinterpret_cast(IncrementSame3D), - static_cast(&num_processed_items), - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + pthreadpool_parallelize_3d( + threadpool.get(), + reinterpret_cast(IncrementSame3D), + static_cast(&num_processed_items), kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); } -static void WorkImbalance3D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - if (i == 0 && j == 0 && k == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } +static void WorkImbalance3D(std::atomic_int* num_processed_items, size_t i, + size_t j, size_t k) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + if (i == 0 && j == 0 && k == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) 
!= + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize3D, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d( - threadpool.get(), - reinterpret_cast(WorkImbalance3D), - static_cast(&num_processed_items), - kParallelize3DRangeI, kParallelize3DRangeJ, kParallelize3DRangeK, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); + pthreadpool_parallelize_3d( + threadpool.get(), + reinterpret_cast(WorkImbalance3D), + static_cast(&num_processed_items), kParallelize3DRangeI, + kParallelize3DRangeJ, kParallelize3DRangeK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DRangeI * kParallelize3DRangeJ * kParallelize3DRangeK); } -static void ComputeNothing3DTile1D(void*, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing3DTile1D(void*, size_t, size_t, size_t, size_t) {} TEST(Parallelize3DTile1D, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d(threadpool.get(), - ComputeNothing3DTile1D, - nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - 
kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), ComputeNothing3DTile1D, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); } TEST(Parallelize3DTile1D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - ComputeNothing3DTile1D, - nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), ComputeNothing3DTile1D, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); } -static void CheckBounds3DTile1D(void*, size_t i, size_t j, size_t start_k, size_t tile_k) { - EXPECT_LT(i, kParallelize3DTile1DRangeI); - EXPECT_LT(j, kParallelize3DTile1DRangeJ); - EXPECT_LT(start_k, kParallelize3DTile1DRangeK); - EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK); +static void CheckBounds3DTile1D(void*, size_t i, size_t j, size_t start_k, + size_t tile_k) { + EXPECT_LT(i, kParallelize3DTile1DRangeI); + EXPECT_LT(j, kParallelize3DTile1DRangeJ); + EXPECT_LT(start_k, kParallelize3DTile1DRangeK); + EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK); } TEST(Parallelize3DTile1D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + 
ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - CheckBounds3DTile1D, - nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), CheckBounds3DTile1D, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); } TEST(Parallelize3DTile1D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - CheckBounds3DTile1D, - nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), CheckBounds3DTile1D, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); } -static void CheckTiling3DTile1D(void*, size_t i, size_t j, size_t start_k, size_t tile_k) { - EXPECT_GT(tile_k, 0); - EXPECT_LE(tile_k, kParallelize3DTile1DTileK); - EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0); - EXPECT_EQ(tile_k, std::min(kParallelize3DTile1DTileK, kParallelize3DTile1DRangeK - start_k)); +static void CheckTiling3DTile1D(void*, size_t i, size_t j, size_t start_k, + size_t tile_k) { + EXPECT_GT(tile_k, 0); + EXPECT_LE(tile_k, kParallelize3DTile1DTileK); + EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0); + EXPECT_EQ(tile_k, std::min(kParallelize3DTile1DTileK, + kParallelize3DTile1DRangeK - start_k)); } 
TEST(Parallelize3DTile1D, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - CheckTiling3DTile1D, - nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), CheckTiling3DTile1D, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); } TEST(Parallelize3DTile1D, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - CheckTiling3DTile1D, - nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), CheckTiling3DTile1D, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); } -static void SetTrue3DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } +static void 
SetTrue3DTile1D(std::atomic_bool* processed_indicators, size_t i, + size_t j, size_t start_k, size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } } TEST(Parallelize3DTile1D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - reinterpret_cast(SetTrue3DTile1D), - static_cast(indicators.data()), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } + std::vector indicators(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue3DTile1D), + static_cast(indicators.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; 
k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } } TEST(Parallelize3DTile1D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - reinterpret_cast(SetTrue3DTile1D), - static_cast(indicators.data()), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } -} - -static void Increment3DTile1D(std::atomic_int* processed_counters, size_t i, size_t j, size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } + std::vector indicators(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + 
} + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue3DTile1D), + static_cast(indicators.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +static void Increment3DTile1D(std::atomic_int* processed_counters, size_t i, + size_t j, size_t start_k, size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize3DTile1D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment3DTile1D), - static_cast(counters.data()), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << 
j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment3DTile1D), + static_cast(counters.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } } TEST(Parallelize3DTile1D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment3DTile1D), - static_cast(counters.data()), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * 
kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment3DTile1D), + static_cast(counters.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } } TEST(Parallelize3DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment3DTile1D), - static_cast(counters.data()), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - 
kParallelize3DTile1DTileK, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment3DTile1D), + static_cast(counters.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } } TEST(Parallelize3DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t 
threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment3DTile1D), - static_cast(counters.data()), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } -} - -static void IncrementSame3DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment3DTile1D), + static_cast(counters.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + } 
+ + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +static void IncrementSame3DTile1D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t start_k, + size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize3DTile1D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - reinterpret_cast(IncrementSame3DTile1D), - static_cast(&num_processed_items), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); -} - -static void WorkImbalance3DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t start_k, size_t tile_k) { - num_processed_items->fetch_add(tile_k, std::memory_order_relaxed); - if (i == 0 && j == 0 && start_k == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * 
kParallelize3DTile1DRangeK) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + reinterpret_cast(IncrementSame3DTile1D), + static_cast(&num_processed_items), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); +} + +static void WorkImbalance3DTile1D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t start_k, + size_t tile_k) { + num_processed_items->fetch_add(tile_k, std::memory_order_relaxed); + if (i == 0 && j == 0 && start_k == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize3DTile1D, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d( - threadpool.get(), - reinterpret_cast(WorkImbalance3DTile1D), - static_cast(&num_processed_items), - 
kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); + pthreadpool_parallelize_3d_tile_1d( + threadpool.get(), + reinterpret_cast(WorkImbalance3DTile1D), + static_cast(&num_processed_items), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); } -static void ComputeNothing3DTile1DWithThread(void*, size_t, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing3DTile1DWithThread(void*, size_t, size_t, size_t, + size_t, size_t) {} TEST(Parallelize3DTile1DWithThread, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_thread(threadpool.get(), - ComputeNothing3DTile1DWithThread, - nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), ComputeNothing3DTile1DWithThread, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); } TEST(Parallelize3DTile1DWithThread, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if 
(pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), - ComputeNothing3DTile1DWithThread, - nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), ComputeNothing3DTile1DWithThread, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); } -static void CheckBounds3DTile1DWithThread(void*, size_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - EXPECT_LT(i, kParallelize3DTile1DRangeI); - EXPECT_LT(j, kParallelize3DTile1DRangeJ); - EXPECT_LT(start_k, kParallelize3DTile1DRangeK); - EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK); +static void CheckBounds3DTile1DWithThread(void*, size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { + EXPECT_LT(i, kParallelize3DTile1DRangeI); + EXPECT_LT(j, kParallelize3DTile1DRangeJ); + EXPECT_LT(start_k, kParallelize3DTile1DRangeK); + EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK); } TEST(Parallelize3DTile1DWithThread, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), - CheckBounds3DTile1DWithThread, - nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), CheckBounds3DTile1DWithThread, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + 
kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); } TEST(Parallelize3DTile1DWithThread, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), - CheckBounds3DTile1DWithThread, - nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), CheckBounds3DTile1DWithThread, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); } -static void CheckTiling3DTile1DWithThread(void*, size_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - EXPECT_GT(tile_k, 0); - EXPECT_LE(tile_k, kParallelize3DTile1DTileK); - EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0); - EXPECT_EQ(tile_k, std::min(kParallelize3DTile1DTileK, kParallelize3DTile1DRangeK - start_k)); +static void CheckTiling3DTile1DWithThread(void*, size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { + EXPECT_GT(tile_k, 0); + EXPECT_LE(tile_k, kParallelize3DTile1DTileK); + EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0); + EXPECT_EQ(tile_k, std::min(kParallelize3DTile1DTileK, + kParallelize3DTile1DRangeK - start_k)); } TEST(Parallelize3DTile1DWithThread, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - 
pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), - CheckTiling3DTile1DWithThread, - nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), CheckTiling3DTile1DWithThread, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); } TEST(Parallelize3DTile1DWithThread, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), - CheckTiling3DTile1DWithThread, - nullptr, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), CheckTiling3DTile1DWithThread, nullptr, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); } -static void SetTrue3DTile1DWithThread(std::atomic_bool* processed_indicators, size_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } +static void SetTrue3DTile1DWithThread(std::atomic_bool* processed_indicators, + size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { 
+ const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } } TEST(Parallelize3DTile1DWithThread, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), - reinterpret_cast(SetTrue3DTile1DWithThread), - static_cast(indicators.data()), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } + std::vector indicators(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), + reinterpret_cast( + SetTrue3DTile1DWithThread), + static_cast(indicators.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + 
EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } } TEST(Parallelize3DTile1DWithThread, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), - reinterpret_cast(SetTrue3DTile1DWithThread), - static_cast(indicators.data()), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } -} - -static void Increment3DTile1DWithThread(std::atomic_int* processed_counters, size_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } + std::vector indicators(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d_with_thread( + 
threadpool.get(), + reinterpret_cast( + SetTrue3DTile1DWithThread), + static_cast(indicators.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +static void Increment3DTile1DWithThread(std::atomic_int* processed_counters, + size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize3DTile1DWithThread, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), - reinterpret_cast(Increment3DTile1DWithThread), - static_cast(counters.data()), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << 
"Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), + reinterpret_cast( + Increment3DTile1DWithThread), + static_cast(counters.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } } TEST(Parallelize3DTile1DWithThread, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), - reinterpret_cast(Increment3DTile1DWithThread), - static_cast(counters.data()), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for 
(size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } -} - -TEST(Parallelize3DTile1DWithThread, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), - reinterpret_cast(Increment3DTile1DWithThread), - static_cast(counters.data()), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } -} - -TEST(Parallelize3DTile1DWithThread, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if 
(pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), - reinterpret_cast(Increment3DTile1DWithThread), - static_cast(counters.data()), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } -} - -static void IncrementSame3DTile1DWithThread(std::atomic_int* num_processed_items, size_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), + reinterpret_cast( + Increment3DTile1DWithThread), + static_cast(counters.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < 
kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } +} + +TEST(Parallelize3DTile1DWithThread, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), + reinterpret_cast( + Increment3DTile1DWithThread), + static_cast(counters.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +TEST(Parallelize3DTile1DWithThread, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + 
ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), + reinterpret_cast( + Increment3DTile1DWithThread), + static_cast(counters.data()), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +static void IncrementSame3DTile1DWithThread( + std::atomic_int* num_processed_items, size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize3DTile1DWithThread, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), - reinterpret_cast(IncrementSame3DTile1DWithThread), - static_cast(&num_processed_items), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - 
EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); -} - -static void WorkImbalance3DTile1DWithThread(std::atomic_int* num_processed_items, size_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - num_processed_items->fetch_add(tile_k, std::memory_order_relaxed); - if (i == 0 && j == 0 && start_k == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), + reinterpret_cast( + IncrementSame3DTile1DWithThread), + static_cast(&num_processed_items), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); +} + +static void WorkImbalance3DTile1DWithThread( + std::atomic_int* num_processed_items, size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { + num_processed_items->fetch_add(tile_k, std::memory_order_relaxed); + if (i == 0 && j == 0 && start_k == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize3DTile1DWithThread, MultiThreadPoolWorkStealing) { - std::atomic_int 
num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), - reinterpret_cast(WorkImbalance3DTile1DWithThread), - static_cast(&num_processed_items), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), + reinterpret_cast( + WorkImbalance3DTile1DWithThread), + static_cast(&num_processed_items), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); } -static void CheckThreadIndexValid3DTile1DWithThread(const size_t* num_threads, size_t thread_index, size_t i, size_t j, size_t start_k, size_t tile_k) { - EXPECT_LE(thread_index, *num_threads); +static void CheckThreadIndexValid3DTile1DWithThread(const size_t* num_threads, + size_t thread_index, + size_t i, size_t j, + size_t start_k, + size_t tile_k) { + EXPECT_LE(thread_index, *num_threads); } TEST(Parallelize3DTile1DWithThread, MultiThreadPoolThreadIndexValid) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t 
threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - size_t num_threads = pthreadpool_get_threads_count(threadpool.get()); + size_t num_threads = pthreadpool_get_threads_count(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_thread( - threadpool.get(), - reinterpret_cast(CheckThreadIndexValid3DTile1DWithThread), - static_cast(&num_threads), - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_thread( + threadpool.get(), + reinterpret_cast( + CheckThreadIndexValid3DTile1DWithThread), + static_cast(&num_threads), kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); } -static void ComputeNothing3DTile1DWithUArch(void*, uint32_t, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing3DTile1DWithUArch(void*, uint32_t, size_t, size_t, + size_t, size_t) {} TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch(threadpool.get(), - ComputeNothing3DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), ComputeNothing3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); } TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), 
pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), - ComputeNothing3DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), ComputeNothing3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); } -static void CheckUArch3DTile1DWithUArch(void*, uint32_t uarch_index, size_t, size_t, size_t, size_t) { - if (uarch_index != kDefaultUArchIndex) { - EXPECT_LE(uarch_index, kMaxUArchIndex); - } +static void CheckUArch3DTile1DWithUArch(void*, uint32_t uarch_index, size_t, + size_t, size_t, size_t) { + if (uarch_index != kDefaultUArchIndex) { + EXPECT_LE(uarch_index, kMaxUArchIndex); + } } TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), - CheckUArch3DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), CheckUArch3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, 
kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); } TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), - CheckUArch3DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), CheckUArch3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); } -static void CheckBounds3DTile1DWithUArch(void*, uint32_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - EXPECT_LT(i, kParallelize3DTile1DRangeI); - EXPECT_LT(j, kParallelize3DTile1DRangeJ); - EXPECT_LT(start_k, kParallelize3DTile1DRangeK); - EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK); +static void CheckBounds3DTile1DWithUArch(void*, uint32_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { + EXPECT_LT(i, kParallelize3DTile1DRangeI); + EXPECT_LT(j, kParallelize3DTile1DRangeJ); + EXPECT_LT(start_k, kParallelize3DTile1DRangeK); + EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK); } TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), 
pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), - CheckBounds3DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), CheckBounds3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); } TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), - CheckBounds3DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), CheckBounds3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); } -static void CheckTiling3DTile1DWithUArch(void*, uint32_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - EXPECT_GT(tile_k, 0); - EXPECT_LE(tile_k, kParallelize3DTile1DTileK); - EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0); - EXPECT_EQ(tile_k, std::min(kParallelize3DTile1DTileK, kParallelize3DTile1DRangeK - start_k)); +static void 
CheckTiling3DTile1DWithUArch(void*, uint32_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { + EXPECT_GT(tile_k, 0); + EXPECT_LE(tile_k, kParallelize3DTile1DTileK); + EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0); + EXPECT_EQ(tile_k, std::min(kParallelize3DTile1DTileK, + kParallelize3DTile1DRangeK - start_k)); } TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), - CheckTiling3DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), CheckTiling3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); } TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), - CheckTiling3DTile1DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), 
CheckTiling3DTile1DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); } -static void SetTrue3DTile1DWithUArch(std::atomic_bool* processed_indicators, uint32_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } +static void SetTrue3DTile1DWithUArch(std::atomic_bool* processed_indicators, + uint32_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } } TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(SetTrue3DTile1DWithUArch), - static_cast(indicators.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not 
processed"; - } - } - } + std::vector indicators(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue3DTile1DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } } TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(SetTrue3DTile1DWithUArch), - static_cast(indicators.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - 
EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } -} - -static void Increment3DTile1DWithUArch(std::atomic_int* processed_counters, uint32_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } + std::vector indicators(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue3DTile1DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +static void Increment3DTile1DWithUArch(std::atomic_int* processed_counters, + uint32_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } } 
TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment3DTile1DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment3DTile1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + 
EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } } TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment3DTile1DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } -} - -TEST(Parallelize3DTile1DWithUArch, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment3DTile1DWithUArch), - static_cast(counters.data()), 
- kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } -} - -TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment3DTile1DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << 
counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } -} - -static void IncrementSame3DTile1DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment3DTile1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } +} + +TEST(Parallelize3DTile1DWithUArch, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + 
pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment3DTile1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +TEST(Parallelize3DTile1DWithUArch, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment3DTile1DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + 
EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +static void IncrementSame3DTile1DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(IncrementSame3DTile1DWithUArch), - static_cast(&num_processed_items), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); -} - -static void WorkImbalance3DTile1DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - num_processed_items->fetch_add(tile_k, std::memory_order_relaxed); - if (i == 0 && j == 0 && start_k == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t 
threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + IncrementSame3DTile1DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); +} + +static void WorkImbalance3DTile1DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { + num_processed_items->fetch_add(tile_k, std::memory_order_relaxed); + if (i == 0 && j == 0 && start_k == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize3DTile1DWithUArch, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d_with_uarch( - threadpool.get(), - reinterpret_cast(WorkImbalance3DTile1DWithUArch), - static_cast(&num_processed_items), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, 
kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); + pthreadpool_parallelize_3d_tile_1d_with_uarch( + threadpool.get(), + reinterpret_cast( + WorkImbalance3DTile1DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); } -static void ComputeNothing3DTile1DWithUArchWithThread(void*, uint32_t, size_t, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing3DTile1DWithUArchWithThread(void*, uint32_t, size_t, + size_t, size_t, size_t, + size_t) {} TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread(threadpool.get(), - ComputeNothing3DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), ComputeNothing3DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); } TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolCompletes) { - auto_pthreadpool_t 
threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - ComputeNothing3DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), ComputeNothing3DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); } -static void CheckUArch3DTile1DWithUArchWithThread(void*, uint32_t uarch_index, size_t, size_t, size_t, size_t, size_t) { - if (uarch_index != kDefaultUArchIndex) { - EXPECT_LE(uarch_index, kMaxUArchIndex); - } +static void CheckUArch3DTile1DWithUArchWithThread(void*, uint32_t uarch_index, + size_t, size_t, size_t, + size_t, size_t) { + if (uarch_index != kDefaultUArchIndex) { + EXPECT_LE(uarch_index, kMaxUArchIndex); + } } TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - CheckUArch3DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags 
*/); + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckUArch3DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); } TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - CheckUArch3DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckUArch3DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); } -static void CheckBounds3DTile1DWithUArchWithThread(void*, uint32_t, size_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - EXPECT_LT(i, kParallelize3DTile1DRangeI); - EXPECT_LT(j, kParallelize3DTile1DRangeJ); - EXPECT_LT(start_k, kParallelize3DTile1DRangeK); - EXPECT_LE(start_k + tile_k, kParallelize3DTile1DRangeK); +static void CheckBounds3DTile1DWithUArchWithThread(void*, uint32_t, size_t, + size_t i, size_t j, + size_t start_k, + size_t tile_k) { + EXPECT_LT(i, kParallelize3DTile1DRangeI); + EXPECT_LT(j, kParallelize3DTile1DRangeJ); + EXPECT_LT(start_k, kParallelize3DTile1DRangeK); + EXPECT_LE(start_k + tile_k, 
kParallelize3DTile1DRangeK); } TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - CheckBounds3DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckBounds3DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); } TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - CheckBounds3DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckBounds3DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, 
/*flags=*/0); } -static void CheckTiling3DTile1DWithUArchWithThread(void*, uint32_t, size_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - EXPECT_GT(tile_k, 0); - EXPECT_LE(tile_k, kParallelize3DTile1DTileK); - EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0); - EXPECT_EQ(tile_k, std::min(kParallelize3DTile1DTileK, kParallelize3DTile1DRangeK - start_k)); +static void CheckTiling3DTile1DWithUArchWithThread(void*, uint32_t, size_t, + size_t i, size_t j, + size_t start_k, + size_t tile_k) { + EXPECT_GT(tile_k, 0); + EXPECT_LE(tile_k, kParallelize3DTile1DTileK); + EXPECT_EQ(start_k % kParallelize3DTile1DTileK, 0); + EXPECT_EQ(tile_k, std::min(kParallelize3DTile1DTileK, + kParallelize3DTile1DRangeK - start_k)); } TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - CheckTiling3DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), CheckTiling3DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); } TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - 
CheckTiling3DTile1DWithUArchWithThread, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); -} - -static void SetTrue3DTile1DWithUArchWithThread(std::atomic_bool* processed_indicators, uint32_t, size_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } -} - -TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(SetTrue3DTile1DWithUArchWithThread), - static_cast(indicators.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), 
CheckTiling3DTile1DWithUArchWithThread, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile1DRangeI, + kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, + kParallelize3DTile1DTileK, /*flags=*/0); +} + +static void SetTrue3DTile1DWithUArchWithThread( + std::atomic_bool* processed_indicators, uint32_t, size_t, size_t i, + size_t j, size_t start_k, size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } +} + +TEST(Parallelize3DTile1DWithUArchWithThread, + SingleThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + SetTrue3DTile1DWithUArchWithThread), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } } TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - 
ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(SetTrue3DTile1DWithUArchWithThread), - static_cast(indicators.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } -} - -static void Increment3DTile1DWithUArchWithThread(std::atomic_int* processed_counters, uint32_t, size_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } -} - -TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(Increment3DTile1DWithUArchWithThread), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < 
kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } -} - -TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(Increment3DTile1DWithUArchWithThread), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } -} - -TEST(Parallelize3DTile1DWithUArchWithThread, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(Increment3DTile1DWithUArchWithThread), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } -} - -TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(Increment3DTile1DWithUArchWithThread), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { - for (size_t j = 
0; j < kParallelize3DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } -} - -static void IncrementSame3DTile1DWithUArchWithThread(std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } + std::vector indicators(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + SetTrue3DTile1DWithUArchWithThread), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +static void Increment3DTile1DWithUArchWithThread( + std::atomic_int* processed_counters, uint32_t, size_t, size_t i, 
size_t j, + size_t start_k, size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + k; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } +} + +TEST(Parallelize3DTile1DWithUArchWithThread, + SingleThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + Increment3DTile1DWithUArchWithThread), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } +} + +TEST(Parallelize3DTile1DWithUArchWithThread, + MultiThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + 
Increment3DTile1DWithUArchWithThread), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } +} + +TEST(Parallelize3DTile1DWithUArchWithThread, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + Increment3DTile1DWithUArchWithThread), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << 
counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +TEST(Parallelize3DTile1DWithUArchWithThread, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile1DRangeI * + kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + Increment3DTile1DWithUArchWithThread), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile1DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile1DRangeJ + j) * kParallelize3DTile1DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +static void IncrementSame3DTile1DWithUArchWithThread( + std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - 
auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(IncrementSame3DTile1DWithUArchWithThread), - static_cast(&num_processed_items), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); -} - -static void WorkImbalance3DTile1DWithUArchWithThread(std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, size_t j, size_t start_k, size_t tile_k) { - num_processed_items->fetch_add(tile_k, std::memory_order_relaxed); - if (i == 0 && j == 0 && start_k == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + IncrementSame3DTile1DWithUArchWithThread), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * + 
kParallelize3DTile1DRangeK); +} + +static void WorkImbalance3DTile1DWithUArchWithThread( + std::atomic_int* num_processed_items, uint32_t, size_t, size_t i, size_t j, + size_t start_k, size_t tile_k) { + num_processed_items->fetch_add(tile_k, std::memory_order_relaxed); + if (i == 0 && j == 0 && start_k == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(WorkImbalance3DTile1DWithUArchWithThread), - static_cast(&num_processed_items), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * kParallelize3DTile1DRangeK); + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + WorkImbalance3DTile1DWithUArchWithThread), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, 
/*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile1DRangeI * kParallelize3DTile1DRangeJ * + kParallelize3DTile1DRangeK); } -static void SetThreadTrue3DTile1DWithUArchWithThread(const size_t* num_threads, uint32_t, size_t thread_index, size_t i, size_t j, size_t start_k, size_t tile_k) { - EXPECT_LE(thread_index, *num_threads); +static void SetThreadTrue3DTile1DWithUArchWithThread( + const size_t* num_threads, uint32_t, size_t thread_index, size_t i, + size_t j, size_t start_k, size_t tile_k) { + EXPECT_LE(thread_index, *num_threads); } TEST(Parallelize3DTile1DWithUArchWithThread, MultiThreadPoolThreadIndexValid) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - size_t num_threads = pthreadpool_get_threads_count(threadpool.get()); + size_t num_threads = pthreadpool_get_threads_count(threadpool.get()); - pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( - threadpool.get(), - reinterpret_cast(SetThreadTrue3DTile1DWithUArchWithThread), - static_cast(&num_threads), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, kParallelize3DTile1DRangeK, - kParallelize3DTile1DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_1d_with_uarch_with_thread( + threadpool.get(), + reinterpret_cast( + SetThreadTrue3DTile1DWithUArchWithThread), + static_cast(&num_threads), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile1DRangeI, kParallelize3DTile1DRangeJ, + kParallelize3DTile1DRangeK, kParallelize3DTile1DTileK, /*flags=*/0); } -static void ComputeNothing3DTile2D(void*, size_t, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing3DTile2D(void*, size_t, size_t, size_t, size_t, + size_t) {} TEST(Parallelize3DTile2D, SingleThreadPoolCompletes) { - auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_2d(threadpool.get(), - ComputeNothing3DTile2D, - nullptr, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), ComputeNothing3DTile2D, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); } TEST(Parallelize3DTile2D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - ComputeNothing3DTile2D, - nullptr, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), ComputeNothing3DTile2D, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); } -static void CheckBounds3DTile2D(void*, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { - EXPECT_LT(i, kParallelize3DTile2DRangeI); - EXPECT_LT(start_j, kParallelize3DTile2DRangeJ); - EXPECT_LT(start_k, kParallelize3DTile2DRangeK); - EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ); - EXPECT_LE(start_k + tile_k, 
kParallelize3DTile2DRangeK); +static void CheckBounds3DTile2D(void*, size_t i, size_t start_j, size_t start_k, + size_t tile_j, size_t tile_k) { + EXPECT_LT(i, kParallelize3DTile2DRangeI); + EXPECT_LT(start_j, kParallelize3DTile2DRangeJ); + EXPECT_LT(start_k, kParallelize3DTile2DRangeK); + EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ); + EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK); } TEST(Parallelize3DTile2D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - CheckBounds3DTile2D, - nullptr, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), CheckBounds3DTile2D, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); } TEST(Parallelize3DTile2D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - CheckBounds3DTile2D, - nullptr, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), CheckBounds3DTile2D, nullptr, + 
kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); } -static void CheckTiling3DTile2D(void*, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { - EXPECT_GT(tile_j, 0); - EXPECT_LE(tile_j, kParallelize3DTile2DTileJ); - EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0); - EXPECT_EQ(tile_j, std::min(kParallelize3DTile2DTileJ, kParallelize3DTile2DRangeJ - start_j)); +static void CheckTiling3DTile2D(void*, size_t i, size_t start_j, size_t start_k, + size_t tile_j, size_t tile_k) { + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize3DTile2DTileJ); + EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0); + EXPECT_EQ(tile_j, std::min(kParallelize3DTile2DTileJ, + kParallelize3DTile2DRangeJ - start_j)); - EXPECT_GT(tile_k, 0); - EXPECT_LE(tile_k, kParallelize3DTile2DTileK); - EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0); - EXPECT_EQ(tile_k, std::min(kParallelize3DTile2DTileK, kParallelize3DTile2DRangeK - start_k)); + EXPECT_GT(tile_k, 0); + EXPECT_LE(tile_k, kParallelize3DTile2DTileK); + EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0); + EXPECT_EQ(tile_k, std::min(kParallelize3DTile2DTileK, + kParallelize3DTile2DRangeK - start_k)); } TEST(Parallelize3DTile2D, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - CheckTiling3DTile2D, - nullptr, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), CheckTiling3DTile2D, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, 
kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); } TEST(Parallelize3DTile2D, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - CheckTiling3DTile2D, - nullptr, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), CheckTiling3DTile2D, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); } -static void SetTrue3DTile2D(std::atomic_bool* processed_indicators, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } - } +static void SetTrue3DTile2D(std::atomic_bool* processed_indicators, size_t i, + size_t start_j, size_t start_k, size_t tile_j, + size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } } TEST(Parallelize3DTile2D, SingleThreadPoolAllItemsProcessed) { - std::vector 
indicators(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - reinterpret_cast(SetTrue3DTile2D), - static_cast(indicators.data()), - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } + std::vector indicators(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + reinterpret_cast(SetTrue3DTile2D), + static_cast(indicators.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } } TEST(Parallelize3DTile2D, MultiThreadPoolAllItemsProcessed) { - std::vector 
indicators(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - reinterpret_cast(SetTrue3DTile2D), - static_cast(indicators.data()), - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } -} - -static void Increment3DTile2D(std::atomic_int* processed_counters, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - } + std::vector indicators(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + reinterpret_cast(SetTrue3DTile2D), + static_cast(indicators.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + 
kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +static void Increment3DTile2D(std::atomic_int* processed_counters, size_t i, + size_t start_j, size_t start_k, size_t tile_j, + size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } } TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment3DTile2D), - static_cast(counters.data()), - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << 
counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment3DTile2D), + static_cast(counters.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } } TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment3DTile2D), - static_cast(counters.data()), - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { - const size_t 
linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment3DTile2D), + static_cast(counters.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } } TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment3DTile2D), - static_cast(counters.data()), - kParallelize3DTile2DRangeI, 
kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment3DTile2D), + static_cast(counters.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } } TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector 
counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment3DTile2D), - static_cast(counters.data()), - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } -} - -static void IncrementSame3DTile2D(std::atomic_int* num_processed_items, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } - } + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + 
pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment3DTile2D), + static_cast(counters.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +static void IncrementSame3DTile2D(std::atomic_int* num_processed_items, + size_t i, size_t start_j, size_t start_k, + size_t tile_j, size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } } TEST(Parallelize3DTile2D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + reinterpret_cast(IncrementSame3DTile2D), + static_cast(&num_processed_items), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile2DRangeI * 
kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); +} + +static void WorkImbalance3DTile2D(std::atomic_int* num_processed_items, + size_t i, size_t start_j, size_t start_k, + size_t tile_j, size_t tile_k) { + num_processed_items->fetch_add(tile_j * tile_k, std::memory_order_relaxed); + if (i == 0 && start_j == 0 && start_k == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } +} - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); +TEST(Parallelize3DTile2D, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - reinterpret_cast(IncrementSame3DTile2D), - static_cast(&num_processed_items), - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); -} + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } -static void WorkImbalance3DTile2D(std::atomic_int* num_processed_items, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { - num_processed_items->fetch_add(tile_j * tile_k, std::memory_order_relaxed); - if (i == 0 && start_j == 0 && start_k == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DTile2DRangeI * 
kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + reinterpret_cast(WorkImbalance3DTile2D), + static_cast(&num_processed_items), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); } -TEST(Parallelize3DTile2D, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); +static void ComputeNothing3DTile2DDynamic(void*, size_t, size_t, size_t, size_t, + size_t) {} - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); +TEST(Parallelize3DTile2DDynamic, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), ComputeNothing3DTile2DDynamic, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); +} - pthreadpool_parallelize_3d_tile_2d( - threadpool.get(), - reinterpret_cast(WorkImbalance3DTile2D), - static_cast(&num_processed_items), - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); -} +TEST(Parallelize3DTile2DDynamic, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), 
pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); -static void ComputeNothing3DTile2DWithUArch(void*, uint32_t, size_t, size_t, size_t, size_t, size_t) { -} + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), ComputeNothing3DTile2DDynamic, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); +} + +static void CheckBounds3DTile2DDynamic(void*, size_t i, size_t start_j, + size_t start_k, size_t tile_j, + size_t tile_k) { + EXPECT_LT(i, kParallelize3DTile2DRangeI); + EXPECT_LT(start_j, kParallelize3DTile2DRangeJ); + EXPECT_LT(start_k, kParallelize3DTile2DRangeK); + EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ); + EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK); +} + +TEST(Parallelize3DTile2DDynamic, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), CheckBounds3DTile2DDynamic, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); +} + +TEST(Parallelize3DTile2DDynamic, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), CheckBounds3DTile2DDynamic, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); +} + +static void CheckTiling3DTile2DDynamic(void*, size_t i, size_t start_j, + size_t start_k, size_t tile_j, + 
size_t tile_k) { + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize3DTile2DRangeJ); + EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0); + + EXPECT_GT(tile_k, 0); + EXPECT_LE(tile_k, kParallelize3DTile2DRangeK); + EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0); +} + +TEST(Parallelize3DTile2DDynamic, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), CheckTiling3DTile2DDynamic, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); +} + +TEST(Parallelize3DTile2DDynamic, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), CheckTiling3DTile2DDynamic, nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); +} + +static void SetTrue3DTile2DDynamic(std::atomic_bool* processed_indicators, + size_t i, size_t start_j, size_t start_k, + size_t tile_j, size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize3DTile2DDynamic, SingleThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + 
ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(SetTrue3DTile2DDynamic), + static_cast(indicators.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +TEST(Parallelize3DTile2DDynamic, MultiThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(SetTrue3DTile2DDynamic), + static_cast(indicators.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +static void Increment3DTile2DDynamic(std::atomic_int* processed_counters, + size_t i, size_t 
start_j, size_t start_k, + size_t tile_j, size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize3DTile2DDynamic, SingleThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(Increment3DTile2DDynamic), + static_cast(counters.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } +} + +TEST(Parallelize3DTile2DDynamic, MultiThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(Increment3DTile2DDynamic), + 
static_cast(counters.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } +} + +TEST(Parallelize3DTile2DDynamic, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast( + Increment3DTile2DDynamic), + static_cast(counters.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, + /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + 
+TEST(Parallelize3DTile2DDynamic, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast( + Increment3DTile2DDynamic), + static_cast(counters.data()), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, + /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +static void IncrementSame3DTile2DDynamic(std::atomic_int* num_processed_items, + size_t i, size_t start_j, + size_t start_k, size_t tile_j, + size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize3DTile2DDynamic, MultiThreadPoolHighContention) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if 
(pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast( + IncrementSame3DTile2DDynamic), + static_cast(&num_processed_items), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); +} + +static void WorkImbalance3DTile2DDynamic(std::atomic_int* num_processed_items, + size_t i, size_t start_j, + size_t start_k, size_t tile_j, + size_t tile_k) { + num_processed_items->fetch_add(tile_j * tile_k, std::memory_order_relaxed); + if (i == 0 && start_j == 0 && start_k == 0) { + /* Sleep for a second. This differs from the non-dynamic `WorkImbalance*` + * strategies in that a thread may reserve more elements than fit into a + * single work function call. Blocking a single work function call will also + * block any potentially remaining elements allocated to that thread. 
We + * therefore just sleep for a second instead.*/ + std::this_thread::sleep_for(std::chrono::seconds(1)); + } +} + +TEST(Parallelize3DTile2DDynamic, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast( + WorkImbalance3DTile2DDynamic), + static_cast(&num_processed_items), kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); +} + +static void ComputeNothing3DTile2DDynamicWithUArch(void*, uint32_t, size_t, + size_t, size_t, size_t, + size_t) {} + +TEST(Parallelize3DTile2DDynamicWithUArch, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), ComputeNothing3DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); +} + +TEST(Parallelize3DTile2DDynamicWithUArch, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), ComputeNothing3DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, 
+ kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); +} + +static void CheckUArch3DTile2DDynamicWithUArch(void*, uint32_t uarch_index, + size_t, size_t, size_t, size_t, + size_t) { + if (uarch_index != kDefaultUArchIndex) { + EXPECT_LE(uarch_index, kMaxUArchIndex); + } +} + +TEST(Parallelize3DTile2DDynamicWithUArch, SingleThreadPoolUArchInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckUArch3DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); +} + +TEST(Parallelize3DTile2DDynamicWithUArch, MultiThreadPoolUArchInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckUArch3DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); +} + +static void CheckBounds3DTile2DDynamicWithUArch(void*, uint32_t, size_t i, + size_t start_j, size_t start_k, + size_t tile_j, size_t tile_k) { + EXPECT_LT(i, kParallelize3DTile2DRangeI); + EXPECT_LT(start_j, kParallelize3DTile2DRangeJ); + EXPECT_LT(start_k, kParallelize3DTile2DRangeK); + EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ); + EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK); +} + +TEST(Parallelize3DTile2DDynamicWithUArch, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), 
pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckBounds3DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); +} + +TEST(Parallelize3DTile2DDynamicWithUArch, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckBounds3DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); +} + +static void CheckTiling3DTile2DDynamicWithUArch(void*, uint32_t, size_t i, + size_t start_j, size_t start_k, + size_t tile_j, size_t tile_k) { + EXPECT_GT(tile_j, 0); + EXPECT_TRUE(tile_j % kParallelize3DTile2DTileJ == 0 || + tile_j == kParallelize3DTile2DRangeJ - start_j); + EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0); + EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ); + + EXPECT_GT(tile_k, 0); + EXPECT_TRUE(tile_k % kParallelize3DTile2DTileK == 0 || + tile_k == kParallelize3DTile2DRangeK - start_k); + EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0); + EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK); +} + +TEST(Parallelize3DTile2DDynamicWithUArch, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckTiling3DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, + 
kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); +} + +TEST(Parallelize3DTile2DDynamicWithUArch, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckTiling3DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); +} + +static void SetTrue3DTile2DDynamicWithUArch( + std::atomic_bool* processed_indicators, uint32_t, size_t i, size_t start_j, + size_t start_k, size_t tile_j, size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize3DTile2DDynamicWithUArch, SingleThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue3DTile2DDynamicWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for 
(size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +TEST(Parallelize3DTile2DDynamicWithUArch, MultiThreadPoolAllItemsProcessed) { + std::vector indicators(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue3DTile2DDynamicWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +static void Increment3DTile2DDynamicWithUArch( + std::atomic_int* processed_counters, uint32_t, size_t i, size_t start_j, + size_t start_k, size_t tile_j, size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } +} + 
+TEST(Parallelize3DTile2DDynamicWithUArch, + SingleThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment3DTile2DDynamicWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } +} + +TEST(Parallelize3DTile2DDynamicWithUArch, + MultiThreadPoolEachItemProcessedOnce) { + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment3DTile2DDynamicWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < 
kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } +} + +TEST(Parallelize3DTile2DDynamicWithUArch, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment3DTile2DDynamicWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +TEST(Parallelize3DTile2DDynamicWithUArch, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile2DRangeI * + 
kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment3DTile2DDynamicWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +static void IncrementSame3DTile2DDynamicWithUArch( + std::atomic_int* num_processed_items, uint32_t, size_t i, size_t start_j, + size_t start_k, size_t tile_j, size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize3DTile2DDynamicWithUArch, MultiThreadPoolHighContention) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + 
pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + IncrementSame3DTile2DDynamicWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); +} + +static void WorkImbalance3DTile2DDynamicWithUArch( + std::atomic_int* num_processed_items, uint32_t tid, size_t i, + size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { + num_processed_items->fetch_add(tile_j * tile_k, std::memory_order_relaxed); + if (i == 0 && start_j == 0 && start_k == 0) { + /* Sleep for a second. This differs from the non-dynamic `WorkImbalance*` + * strategies in that a thread may reserve more elements than fit into a + * single work function call. Blocking a single work function call will also + * block any potentially remaining elements allocated to that thread. 
We + * therefore just sleep for a second instead.*/ + std::this_thread::sleep_for(std::chrono::seconds(1)); + } +} + +TEST(Parallelize3DTile2DDynamicWithUArch, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + WorkImbalance3DTile2DDynamicWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); +} + +static void ComputeNothing3DTile2DWithUArch(void*, uint32_t, size_t, size_t, + size_t, size_t, size_t) {} TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_2d_with_uarch(threadpool.get(), - ComputeNothing3DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), ComputeNothing3DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); } 
TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_2d_with_uarch( - threadpool.get(), - ComputeNothing3DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), ComputeNothing3DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); } -static void CheckUArch3DTile2DWithUArch(void*, uint32_t uarch_index, size_t, size_t, size_t, size_t, size_t) { - if (uarch_index != kDefaultUArchIndex) { - EXPECT_LE(uarch_index, kMaxUArchIndex); - } +static void CheckUArch3DTile2DWithUArch(void*, uint32_t uarch_index, size_t, + size_t, size_t, size_t, size_t) { + if (uarch_index != kDefaultUArchIndex) { + EXPECT_LE(uarch_index, kMaxUArchIndex); + } } TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_2d_with_uarch( - threadpool.get(), - CheckUArch3DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - 
kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), CheckUArch3DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); } TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_2d_with_uarch( - threadpool.get(), - CheckUArch3DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), CheckUArch3DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); } -static void CheckBounds3DTile2DWithUArch(void*, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { - EXPECT_LT(i, kParallelize3DTile2DRangeI); - EXPECT_LT(start_j, kParallelize3DTile2DRangeJ); - EXPECT_LT(start_k, kParallelize3DTile2DRangeK); - EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ); - EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK); +static void CheckBounds3DTile2DWithUArch(void*, uint32_t, size_t i, + size_t start_j, size_t start_k, + size_t tile_j, size_t tile_k) { + EXPECT_LT(i, 
kParallelize3DTile2DRangeI); + EXPECT_LT(start_j, kParallelize3DTile2DRangeJ); + EXPECT_LT(start_k, kParallelize3DTile2DRangeK); + EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ); + EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK); } TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_2d_with_uarch( - threadpool.get(), - CheckBounds3DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), CheckBounds3DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); } TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_2d_with_uarch( - threadpool.get(), - CheckBounds3DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), 
CheckBounds3DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); } -static void CheckTiling3DTile2DWithUArch(void*, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { - EXPECT_GT(tile_j, 0); - EXPECT_LE(tile_j, kParallelize3DTile2DTileJ); - EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0); - EXPECT_EQ(tile_j, std::min(kParallelize3DTile2DTileJ, kParallelize3DTile2DRangeJ - start_j)); +static void CheckTiling3DTile2DWithUArch(void*, uint32_t, size_t i, + size_t start_j, size_t start_k, + size_t tile_j, size_t tile_k) { + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize3DTile2DTileJ); + EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0); + EXPECT_EQ(tile_j, std::min(kParallelize3DTile2DTileJ, + kParallelize3DTile2DRangeJ - start_j)); - EXPECT_GT(tile_k, 0); - EXPECT_LE(tile_k, kParallelize3DTile2DTileK); - EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0); - EXPECT_EQ(tile_k, std::min(kParallelize3DTile2DTileK, kParallelize3DTile2DRangeK - start_k)); + EXPECT_GT(tile_k, 0); + EXPECT_LE(tile_k, kParallelize3DTile2DTileK); + EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0); + EXPECT_EQ(tile_k, std::min(kParallelize3DTile2DTileK, + kParallelize3DTile2DRangeK - start_k)); } TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_3d_tile_2d_with_uarch( - threadpool.get(), - CheckTiling3DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); + 
pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), CheckTiling3DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); } TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_2d_with_uarch( - threadpool.get(), - CheckTiling3DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); + pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), CheckTiling3DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize3DTile2DRangeI, + kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, /*flags=*/0); } -static void SetTrue3DTile2DWithUArch(std::atomic_bool* processed_indicators, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } - } +static void SetTrue3DTile2DWithUArch(std::atomic_bool* processed_indicators, + uint32_t, size_t i, size_t start_j, + size_t start_k, size_t tile_j, + size_t tile_k) { + for 
(size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } } TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(SetTrue3DTile2DWithUArch), - static_cast(indicators.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } + std::vector indicators(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue3DTile2DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < 
kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } } TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(SetTrue3DTile2DWithUArch), - static_cast(indicators.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ") not processed"; - } - } - } -} - -static void Increment3DTile2DWithUArch(std::atomic_int* processed_counters, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - processed_counters[linear_idx].fetch_add(1, 
std::memory_order_relaxed); - } - } + std::vector indicators(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue3DTile2DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +static void Increment3DTile2DWithUArch(std::atomic_int* processed_counters, + uint32_t, size_t i, size_t start_j, + size_t start_k, size_t tile_j, + size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } } TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_3d_tile_2d_with_uarch( - threadpool.get(), - 
reinterpret_cast(Increment3DTile2DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment3DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } } TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolEachItemProcessedOnce) { - std::vector 
counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment3DTile2DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } -} - -TEST(Parallelize3DTile2DWithUArch, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment3DTile2DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { - for (size_t j = 0; j < 
kParallelize3DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } -} - -TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_3d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment3DTile2DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { - const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } -} - -static void IncrementSame3DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i, size_t 
start_j, size_t start_k, size_t tile_j, size_t tile_k) { - for (size_t j = start_j; j < start_j + tile_j; j++) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } - } + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment3DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } +} + +TEST(Parallelize3DTile2DWithUArch, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment3DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, 
kMaxUArchIndex, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +TEST(Parallelize3DTile2DWithUArch, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters(kParallelize3DTile2DRangeI * + kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment3DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = + (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j 
<< ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +static void IncrementSame3DTile2DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t i, size_t start_j, + size_t start_k, size_t tile_j, + size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } } TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_3d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(IncrementSame3DTile2DWithUArch), - static_cast(&num_processed_items), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); -} - -static void WorkImbalance3DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { - num_processed_items->fetch_add(tile_j * tile_k, std::memory_order_relaxed); - if (i == 0 && start_j == 0 && start_k == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + 
auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + IncrementSame3DTile2DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); +} + +static void WorkImbalance3DTile2DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t i, size_t start_j, + size_t start_k, size_t tile_j, + size_t tile_k) { + num_processed_items->fetch_add(tile_j * tile_k, std::memory_order_relaxed); + if (i == 0 && start_j == 0 && start_k == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize3DTile2DWithUArch, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_3d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(WorkImbalance3DTile2DWithUArch), - 
static_cast(&num_processed_items), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, - kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); + pthreadpool_parallelize_3d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + WorkImbalance3DTile2DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, + kParallelize3DTile2DRangeK, kParallelize3DTile2DTileJ, + kParallelize3DTile2DTileK, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * + kParallelize3DTile2DRangeK); } -static void ComputeNothing4D(void*, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing4D(void*, size_t, size_t, size_t, size_t) {} TEST(Parallelize4D, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_4d(threadpool.get(), - ComputeNothing4D, - nullptr, - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, - 0 /* flags */); + pthreadpool_parallelize_4d(threadpool.get(), ComputeNothing4D, nullptr, + kParallelize4DRangeI, kParallelize4DRangeJ, + kParallelize4DRangeK, kParallelize4DRangeL, + /*flags=*/0); } TEST(Parallelize4D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if 
(pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_4d( - threadpool.get(), - ComputeNothing4D, - nullptr, - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, - 0 /* flags */); + pthreadpool_parallelize_4d(threadpool.get(), ComputeNothing4D, nullptr, + kParallelize4DRangeI, kParallelize4DRangeJ, + kParallelize4DRangeK, kParallelize4DRangeL, + /*flags=*/0); } static void CheckBounds4D(void*, size_t i, size_t j, size_t k, size_t l) { - EXPECT_LT(i, kParallelize4DRangeI); - EXPECT_LT(j, kParallelize4DRangeJ); - EXPECT_LT(k, kParallelize4DRangeK); - EXPECT_LT(l, kParallelize4DRangeL); + EXPECT_LT(i, kParallelize4DRangeI); + EXPECT_LT(j, kParallelize4DRangeJ); + EXPECT_LT(k, kParallelize4DRangeK); + EXPECT_LT(l, kParallelize4DRangeL); } TEST(Parallelize4D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_4d( - threadpool.get(), - CheckBounds4D, - nullptr, - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, - 0 /* flags */); + pthreadpool_parallelize_4d(threadpool.get(), CheckBounds4D, nullptr, + kParallelize4DRangeI, kParallelize4DRangeJ, + kParallelize4DRangeK, kParallelize4DRangeL, + /*flags=*/0); } TEST(Parallelize4D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - 
pthreadpool_parallelize_4d( - threadpool.get(), - CheckBounds4D, - nullptr, - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, - 0 /* flags */); + pthreadpool_parallelize_4d(threadpool.get(), CheckBounds4D, nullptr, + kParallelize4DRangeI, kParallelize4DRangeJ, + kParallelize4DRangeK, kParallelize4DRangeL, + /*flags=*/0); } -static void SetTrue4D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l) { - const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); +static void SetTrue4D(std::atomic_bool* processed_indicators, size_t i, + size_t j, size_t k, size_t l) { + const size_t linear_idx = + ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * + kParallelize4DRangeL + + l; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); } TEST(Parallelize4D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d( - threadpool.get(), - reinterpret_cast(SetTrue4D), - static_cast(indicators.data()), - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed"; - } - } - } - } + std::vector 
indicators( + kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * + kParallelize4DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d( + threadpool.get(), reinterpret_cast(SetTrue4D), + static_cast(indicators.data()), kParallelize4DRangeI, + kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, + /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DRangeL; l++) { + const size_t linear_idx = + ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * + kParallelize4DRangeL + + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } } TEST(Parallelize4D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_4d( - threadpool.get(), - reinterpret_cast(SetTrue4D), - static_cast(indicators.data()), - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k 
<< ", " << l << ") not processed"; - } - } - } - } -} - -static void Increment4D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l) { - const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + std::vector indicators( + kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * + kParallelize4DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d( + threadpool.get(), reinterpret_cast(SetTrue4D), + static_cast(indicators.data()), kParallelize4DRangeI, + kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, + /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DRangeL; l++) { + const size_t linear_idx = + ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * + kParallelize4DRangeL + + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } +} + +static void Increment4D(std::atomic_int* processed_counters, size_t i, size_t j, + size_t k, size_t l) { + const size_t linear_idx = + ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * + kParallelize4DRangeL + + l; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize4D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - 
- pthreadpool_parallelize_4d( - threadpool.get(), - reinterpret_cast(Increment4D), - static_cast(counters.data()), - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } + std::vector counters( + kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * + kParallelize4DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d( + threadpool.get(), reinterpret_cast(Increment4D), + static_cast(counters.data()), kParallelize4DRangeI, + kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, + /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DRangeL; l++) { + const size_t linear_idx = + ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * + kParallelize4DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } } TEST(Parallelize4D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize4DRangeI * kParallelize4DRangeJ * 
kParallelize4DRangeK * kParallelize4DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_4d( - threadpool.get(), - reinterpret_cast(Increment4D), - static_cast(counters.data()), - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } + std::vector counters( + kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * + kParallelize4DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d( + threadpool.get(), reinterpret_cast(Increment4D), + static_cast(counters.data()), kParallelize4DRangeI, + kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, + /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DRangeL; l++) { + const size_t linear_idx = + ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * + kParallelize4DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) 
+ << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } } TEST(Parallelize4D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_4d( - threadpool.get(), - reinterpret_cast(Increment4D), - static_cast(counters.data()), - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize4DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } - } + std::vector counters( + kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * + kParallelize4DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_4d( + threadpool.get(), reinterpret_cast(Increment4D), + static_cast(counters.data()), kParallelize4DRangeI, + kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, + /*flags=*/0); + } + + for (size_t i = 0; i < 
kParallelize4DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DRangeL; l++) { + const size_t linear_idx = + ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * + kParallelize4DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } + } } TEST(Parallelize4D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_4d( - threadpool.get(), - reinterpret_cast(Increment4D), - static_cast(counters.data()), - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize4DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * kParallelize4DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } - } -} - -static void 
IncrementSame4D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); + std::vector counters( + kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * + kParallelize4DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_4d( + threadpool.get(), reinterpret_cast(Increment4D), + static_cast(counters.data()), kParallelize4DRangeI, + kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, + /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize4DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DRangeL; l++) { + const size_t linear_idx = + ((i * kParallelize4DRangeJ + j) * kParallelize4DRangeK + k) * + kParallelize4DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } + } +} + +static void IncrementSame4D(std::atomic_int* num_processed_items, size_t i, + size_t j, size_t k, size_t l) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize4D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_4d( - threadpool.get(), - 
reinterpret_cast(IncrementSame4D), - static_cast(&num_processed_items), - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL); -} - -static void WorkImbalance4D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - if (i == 0 && j == 0 && k == 0 && l == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d( + threadpool.get(), + reinterpret_cast(IncrementSame4D), + static_cast(&num_processed_items), kParallelize4DRangeI, + kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, + /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * + kParallelize4DRangeL); +} + +static void WorkImbalance4D(std::atomic_int* num_processed_items, size_t i, + size_t j, size_t k, size_t l) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + if (i == 0 && j == 0 && k == 0 && l == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * + kParallelize4DRangeL) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize4D, 
MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_4d( - threadpool.get(), - reinterpret_cast(WorkImbalance4D), - static_cast(&num_processed_items), - kParallelize4DRangeI, kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * kParallelize4DRangeL); + pthreadpool_parallelize_4d( + threadpool.get(), + reinterpret_cast(WorkImbalance4D), + static_cast(&num_processed_items), kParallelize4DRangeI, + kParallelize4DRangeJ, kParallelize4DRangeK, kParallelize4DRangeL, + /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize4DRangeI * kParallelize4DRangeJ * kParallelize4DRangeK * + kParallelize4DRangeL); } -static void ComputeNothing4DTile1D(void*, size_t, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing4DTile1D(void*, size_t, size_t, size_t, size_t, + size_t) {} TEST(Parallelize4DTile1D, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_4d_tile_1d(threadpool.get(), - ComputeNothing4DTile1D, - nullptr, - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - 
kParallelize4DTile1DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), ComputeNothing4DTile1D, nullptr, + kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, + kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, + kParallelize4DTile1DTileL, /*flags=*/0); } TEST(Parallelize4DTile1D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - ComputeNothing4DTile1D, - nullptr, - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), ComputeNothing4DTile1D, nullptr, + kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, + kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, + kParallelize4DTile1DTileL, /*flags=*/0); } -static void CheckBounds4DTile1D(void*, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) { - EXPECT_LT(i, kParallelize4DTile1DRangeI); - EXPECT_LT(j, kParallelize4DTile1DRangeJ); - EXPECT_LT(k, kParallelize4DTile1DRangeK); - EXPECT_LT(start_l, kParallelize4DTile1DRangeL); - EXPECT_LE(start_l + tile_l, kParallelize4DTile1DRangeL); +static void CheckBounds4DTile1D(void*, size_t i, size_t j, size_t k, + size_t start_l, size_t tile_l) { + EXPECT_LT(i, kParallelize4DTile1DRangeI); + EXPECT_LT(j, kParallelize4DTile1DRangeJ); + EXPECT_LT(k, kParallelize4DTile1DRangeK); + EXPECT_LT(start_l, kParallelize4DTile1DRangeL); + EXPECT_LE(start_l + tile_l, kParallelize4DTile1DRangeL); } TEST(Parallelize4DTile1D, SingleThreadPoolAllItemsInBounds) { - 
auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - CheckBounds4DTile1D, - nullptr, - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), CheckBounds4DTile1D, nullptr, + kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, + kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, + kParallelize4DTile1DTileL, /*flags=*/0); } TEST(Parallelize4DTile1D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - CheckBounds4DTile1D, - nullptr, - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), CheckBounds4DTile1D, nullptr, + kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, + kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, + kParallelize4DTile1DTileL, /*flags=*/0); } -static void CheckTiling4DTile1D(void*, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) { - EXPECT_GT(tile_l, 0); - EXPECT_LE(tile_l, kParallelize4DTile1DTileL); - EXPECT_EQ(start_l % kParallelize4DTile1DTileL, 0); - EXPECT_EQ(tile_l, std::min(kParallelize4DTile1DTileL, kParallelize4DTile1DRangeL - start_l)); +static 
void CheckTiling4DTile1D(void*, size_t i, size_t j, size_t k, + size_t start_l, size_t tile_l) { + EXPECT_GT(tile_l, 0); + EXPECT_LE(tile_l, kParallelize4DTile1DTileL); + EXPECT_EQ(start_l % kParallelize4DTile1DTileL, 0); + EXPECT_EQ(tile_l, std::min(kParallelize4DTile1DTileL, + kParallelize4DTile1DRangeL - start_l)); } TEST(Parallelize4DTile1D, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - CheckTiling4DTile1D, - nullptr, - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), CheckTiling4DTile1D, nullptr, + kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, + kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, + kParallelize4DTile1DTileL, /*flags=*/0); } TEST(Parallelize4DTile1D, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - CheckTiling4DTile1D, - nullptr, - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), CheckTiling4DTile1D, nullptr, + kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, + kParallelize4DTile1DRangeK, 
kParallelize4DTile1DRangeL, + kParallelize4DTile1DTileL, /*flags=*/0); } -static void SetTrue4DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } +static void SetTrue4DTile1D(std::atomic_bool* processed_indicators, size_t i, + size_t j, size_t k, size_t start_l, size_t tile_l) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = + ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + + k) * + kParallelize4DTile1DRangeL + + l; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } } TEST(Parallelize4DTile1D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - reinterpret_cast(SetTrue4DTile1D), - static_cast(indicators.data()), - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << 
l << ") not processed"; - } - } - } - } + std::vector indicators( + kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * + kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue4DTile1D), + static_cast(indicators.data()), kParallelize4DTile1DRangeI, + kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, + kParallelize4DTile1DRangeL, kParallelize4DTile1DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * + kParallelize4DTile1DRangeK + + k) * + kParallelize4DTile1DRangeL + + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } } TEST(Parallelize4DTile1D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - reinterpret_cast(SetTrue4DTile1D), - static_cast(indicators.data()), - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) { - for (size_t k = 0; k < 
kParallelize4DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed"; - } - } - } - } -} - -static void Increment4DTile1D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } + std::vector indicators( + kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * + kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue4DTile1D), + static_cast(indicators.data()), kParallelize4DTile1DRangeI, + kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, + kParallelize4DTile1DRangeL, kParallelize4DTile1DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * + kParallelize4DTile1DRangeK + + k) * + kParallelize4DTile1DRangeL + + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } +} + +static 
void Increment4DTile1D(std::atomic_int* processed_counters, size_t i, + size_t j, size_t k, size_t start_l, + size_t tile_l) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = + ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + + k) * + kParallelize4DTile1DRangeL + + l; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize4DTile1D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment4DTile1D), - static_cast(counters.data()), - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } + std::vector counters( + kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * + kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment4DTile1D), 
+ static_cast(counters.data()), kParallelize4DTile1DRangeI, + kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, + kParallelize4DTile1DRangeL, kParallelize4DTile1DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * + kParallelize4DTile1DRangeK + + k) * + kParallelize4DTile1DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } } TEST(Parallelize4DTile1D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment4DTile1D), - static_cast(counters.data()), - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " 
<< j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } + std::vector counters( + kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * + kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment4DTile1D), + static_cast(counters.data()), kParallelize4DTile1DRangeI, + kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, + kParallelize4DTile1DRangeL, kParallelize4DTile1DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * + kParallelize4DTile1DRangeK + + k) * + kParallelize4DTile1DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } } TEST(Parallelize4DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment4DTile1D), - static_cast(counters.data()), - kParallelize4DTile1DRangeI, 
kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } - } + std::vector counters( + kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * + kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment4DTile1D), + static_cast(counters.data()), kParallelize4DTile1DRangeI, + kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, + kParallelize4DTile1DRangeL, kParallelize4DTile1DTileL, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * + kParallelize4DTile1DRangeK + + k) * + kParallelize4DTile1DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was 
processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } + } } TEST(Parallelize4DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment4DTile1D), - static_cast(counters.data()), - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * kParallelize4DTile1DRangeK + k) * kParallelize4DTile1DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } - } -} - -static void IncrementSame4DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } + std::vector counters( + kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * + 
kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment4DTile1D), + static_cast(counters.data()), kParallelize4DTile1DRangeI, + kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, + kParallelize4DTile1DRangeL, kParallelize4DTile1DTileL, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize4DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile1DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile1DRangeJ + j) * + kParallelize4DTile1DRangeK + + k) * + kParallelize4DTile1DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } + } +} + +static void IncrementSame4DTile1D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t k, size_t start_l, + size_t tile_l) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize4DTile1D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - 
reinterpret_cast(IncrementSame4DTile1D), - static_cast(&num_processed_items), - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); -} - -static void WorkImbalance4DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t start_l, size_t tile_l) { - num_processed_items->fetch_add(tile_l, std::memory_order_relaxed); - if (i == 0 && j == 0 && k == 0 && start_l == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), + reinterpret_cast(IncrementSame4DTile1D), + static_cast(&num_processed_items), kParallelize4DTile1DRangeI, + kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, + kParallelize4DTile1DRangeL, kParallelize4DTile1DTileL, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * + kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); +} + +static void WorkImbalance4DTile1D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t k, size_t start_l, + size_t tile_l) { + num_processed_items->fetch_add(tile_l, std::memory_order_relaxed); + if (i == 0 && j == 0 && k == 0 && start_l == 0) { + /* Spin-wait until 
all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * + kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize4DTile1D, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_4d_tile_1d( - threadpool.get(), - reinterpret_cast(WorkImbalance4DTile1D), - static_cast(&num_processed_items), - kParallelize4DTile1DRangeI, kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, kParallelize4DTile1DRangeL, - kParallelize4DTile1DTileL, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); + pthreadpool_parallelize_4d_tile_1d( + threadpool.get(), + reinterpret_cast(WorkImbalance4DTile1D), + static_cast(&num_processed_items), kParallelize4DTile1DRangeI, + kParallelize4DTile1DRangeJ, kParallelize4DTile1DRangeK, + kParallelize4DTile1DRangeL, kParallelize4DTile1DTileL, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize4DTile1DRangeI * kParallelize4DTile1DRangeJ * + kParallelize4DTile1DRangeK * kParallelize4DTile1DRangeL); } -static void ComputeNothing4DTile2D(void*, size_t, size_t, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing4DTile2D(void*, size_t, size_t, size_t, size_t, + size_t, size_t) {} 
TEST(Parallelize4DTile2D, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_4d_tile_2d(threadpool.get(), - ComputeNothing4DTile2D, - nullptr, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), ComputeNothing4DTile2D, nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); } TEST(Parallelize4DTile2D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - ComputeNothing4DTile2D, - nullptr, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), ComputeNothing4DTile2D, nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); } -static void CheckBounds4DTile2D(void*, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - EXPECT_LT(i, 
kParallelize4DTile2DRangeI); - EXPECT_LT(j, kParallelize4DTile2DRangeJ); - EXPECT_LT(start_k, kParallelize4DTile2DRangeK); - EXPECT_LT(start_l, kParallelize4DTile2DRangeL); - EXPECT_LE(start_k + tile_k, kParallelize4DTile2DRangeK); - EXPECT_LE(start_l + tile_l, kParallelize4DTile2DRangeL); +static void CheckBounds4DTile2D(void*, size_t i, size_t j, size_t start_k, + size_t start_l, size_t tile_k, size_t tile_l) { + EXPECT_LT(i, kParallelize4DTile2DRangeI); + EXPECT_LT(j, kParallelize4DTile2DRangeJ); + EXPECT_LT(start_k, kParallelize4DTile2DRangeK); + EXPECT_LT(start_l, kParallelize4DTile2DRangeL); + EXPECT_LE(start_k + tile_k, kParallelize4DTile2DRangeK); + EXPECT_LE(start_l + tile_l, kParallelize4DTile2DRangeL); } TEST(Parallelize4DTile2D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - CheckBounds4DTile2D, - nullptr, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), CheckBounds4DTile2D, nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); } TEST(Parallelize4DTile2D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + 
GTEST_SKIP(); + } - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - CheckBounds4DTile2D, - nullptr, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), CheckBounds4DTile2D, nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); } -static void CheckTiling4DTile2D(void*, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - EXPECT_GT(tile_k, 0); - EXPECT_LE(tile_k, kParallelize4DTile2DTileK); - EXPECT_EQ(start_k % kParallelize4DTile2DTileK, 0); - EXPECT_EQ(tile_k, std::min(kParallelize4DTile2DTileK, kParallelize4DTile2DRangeK - start_k)); +static void CheckTiling4DTile2D(void*, size_t i, size_t j, size_t start_k, + size_t start_l, size_t tile_k, size_t tile_l) { + EXPECT_GT(tile_k, 0); + EXPECT_LE(tile_k, kParallelize4DTile2DTileK); + EXPECT_EQ(start_k % kParallelize4DTile2DTileK, 0); + EXPECT_EQ(tile_k, std::min(kParallelize4DTile2DTileK, + kParallelize4DTile2DRangeK - start_k)); - EXPECT_GT(tile_l, 0); - EXPECT_LE(tile_l, kParallelize4DTile2DTileL); - EXPECT_EQ(start_l % kParallelize4DTile2DTileL, 0); - EXPECT_EQ(tile_l, std::min(kParallelize4DTile2DTileL, kParallelize4DTile2DRangeL - start_l)); + EXPECT_GT(tile_l, 0); + EXPECT_LE(tile_l, kParallelize4DTile2DTileL); + EXPECT_EQ(start_l % kParallelize4DTile2DTileL, 0); + EXPECT_EQ(tile_l, std::min(kParallelize4DTile2DTileL, + kParallelize4DTile2DRangeL - start_l)); } TEST(Parallelize4DTile2D, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - 
pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - CheckTiling4DTile2D, - nullptr, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), CheckTiling4DTile2D, nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); } TEST(Parallelize4DTile2D, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - CheckTiling4DTile2D, - nullptr, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); -} - -static void SetTrue4DTile2D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } - } + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), CheckTiling4DTile2D, nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + 
kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); +} + +static void SetTrue4DTile2D(std::atomic_bool* processed_indicators, size_t i, + size_t j, size_t start_k, size_t start_l, + size_t tile_k, size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = + ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } } TEST(Parallelize4DTile2D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - reinterpret_cast(SetTrue4DTile2D), - static_cast(indicators.data()), - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed"; - } - } - } - } + std::vector indicators( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + 
ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + reinterpret_cast(SetTrue4DTile2D), + static_cast(indicators.data()), kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } } TEST(Parallelize4DTile2D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - reinterpret_cast(SetTrue4DTile2D), - static_cast(indicators.data()), - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; 
- EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed"; - } - } - } - } -} - -static void Increment4DTile2D(std::atomic_int* processed_counters, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - } + std::vector indicators( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + reinterpret_cast(SetTrue4DTile2D), + static_cast(indicators.data()), kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } +} + +static void Increment4DTile2D(std::atomic_int* processed_counters, size_t i, + size_t j, size_t start_k, size_t start_l, + 
size_t tile_k, size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = + ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } } TEST(Parallelize4DTile2D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment4DTile2D), - static_cast(counters.data()), - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment4DTile2D), + 
static_cast(counters.data()), kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } } TEST(Parallelize4DTile2D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment4DTile2D), - static_cast(counters.data()), - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - 
EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment4DTile2D), + static_cast(counters.data()), kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } } TEST(Parallelize4DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_4d_tile_2d( - 
threadpool.get(), - reinterpret_cast(Increment4DTile2D), - static_cast(counters.data()), - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } - } + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment4DTile2D), + static_cast(counters.data()), kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + 
+ l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } + } } TEST(Parallelize4DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment4DTile2D), - static_cast(counters.data()), - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } - } -} - -static void IncrementSame4DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - for (size_t k = 
start_k; k < start_k + tile_k; k++) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } - } + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment4DTile2D), + static_cast(counters.data()), kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } + } +} + +static void IncrementSame4DTile2D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t start_k, + size_t start_l, size_t tile_k, + size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } } TEST(Parallelize4DTile2D, 
MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - reinterpret_cast(IncrementSame4DTile2D), - static_cast(&num_processed_items), - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); -} - -static void WorkImbalance4DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - num_processed_items->fetch_add(tile_k * tile_l, std::memory_order_relaxed); - if (i == 0 && j == 0 && start_k == 0 && start_l == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + reinterpret_cast(IncrementSame4DTile2D), + static_cast(&num_processed_items), kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); + 
EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); +} + +static void WorkImbalance4DTile2D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t start_k, + size_t start_l, size_t tile_k, + size_t tile_l) { + num_processed_items->fetch_add(tile_k * tile_l, std::memory_order_relaxed); + if (i == 0 && j == 0 && start_k == 0 && start_l == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize4DTile2D, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_4d_tile_2d( - threadpool.get(), - reinterpret_cast(WorkImbalance4DTile2D), - static_cast(&num_processed_items), - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + reinterpret_cast(WorkImbalance4DTile2D), + static_cast(&num_processed_items), 
kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); } -static void ComputeNothing4DTile2DWithUArch(void*, uint32_t, size_t, size_t, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing4DTile2DWithUArch(void*, uint32_t, size_t, size_t, + size_t, size_t, size_t, size_t) {} TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_4d_tile_2d_with_uarch(threadpool.get(), - ComputeNothing4DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_2d_with_uarch( + threadpool.get(), ComputeNothing4DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); } TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + 
GTEST_SKIP(); + } - pthreadpool_parallelize_4d_tile_2d_with_uarch( - threadpool.get(), - ComputeNothing4DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_2d_with_uarch( + threadpool.get(), ComputeNothing4DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); } -static void CheckUArch4DTile2DWithUArch(void*, uint32_t uarch_index, size_t, size_t, size_t, size_t, size_t, size_t) { - if (uarch_index != kDefaultUArchIndex) { - EXPECT_LE(uarch_index, kMaxUArchIndex); - } +static void CheckUArch4DTile2DWithUArch(void*, uint32_t uarch_index, size_t, + size_t, size_t, size_t, size_t, + size_t) { + if (uarch_index != kDefaultUArchIndex) { + EXPECT_LE(uarch_index, kMaxUArchIndex); + } } TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_4d_tile_2d_with_uarch( - threadpool.get(), - CheckUArch4DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_2d_with_uarch( + threadpool.get(), CheckUArch4DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + 
kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); } TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolUArchInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_4d_tile_2d_with_uarch( - threadpool.get(), - CheckUArch4DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_2d_with_uarch( + threadpool.get(), CheckUArch4DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); } -static void CheckBounds4DTile2DWithUArch(void*, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - EXPECT_LT(i, kParallelize4DTile2DRangeI); - EXPECT_LT(j, kParallelize4DTile2DRangeJ); - EXPECT_LT(start_k, kParallelize4DTile2DRangeK); - EXPECT_LT(start_l, kParallelize4DTile2DRangeL); - EXPECT_LE(start_k + tile_k, kParallelize4DTile2DRangeK); - EXPECT_LE(start_l + tile_l, kParallelize4DTile2DRangeL); +static void CheckBounds4DTile2DWithUArch(void*, uint32_t, size_t i, size_t j, + size_t start_k, size_t start_l, + size_t tile_k, size_t tile_l) { + EXPECT_LT(i, kParallelize4DTile2DRangeI); + EXPECT_LT(j, kParallelize4DTile2DRangeJ); + EXPECT_LT(start_k, kParallelize4DTile2DRangeK); + EXPECT_LT(start_l, 
kParallelize4DTile2DRangeL); + EXPECT_LE(start_k + tile_k, kParallelize4DTile2DRangeK); + EXPECT_LE(start_l + tile_l, kParallelize4DTile2DRangeL); } TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_4d_tile_2d_with_uarch( - threadpool.get(), - CheckBounds4DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_2d_with_uarch( + threadpool.get(), CheckBounds4DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); } TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_4d_tile_2d_with_uarch( - threadpool.get(), - CheckBounds4DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_2d_with_uarch( + threadpool.get(), 
CheckBounds4DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); } -static void CheckTiling4DTile2DWithUArch(void*, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - EXPECT_GT(tile_k, 0); - EXPECT_LE(tile_k, kParallelize4DTile2DTileK); - EXPECT_EQ(start_k % kParallelize4DTile2DTileK, 0); - EXPECT_EQ(tile_k, std::min(kParallelize4DTile2DTileK, kParallelize4DTile2DRangeK - start_k)); +static void CheckTiling4DTile2DWithUArch(void*, uint32_t, size_t i, size_t j, + size_t start_k, size_t start_l, + size_t tile_k, size_t tile_l) { + EXPECT_GT(tile_k, 0); + EXPECT_LE(tile_k, kParallelize4DTile2DTileK); + EXPECT_EQ(start_k % kParallelize4DTile2DTileK, 0); + EXPECT_EQ(tile_k, std::min(kParallelize4DTile2DTileK, + kParallelize4DTile2DRangeK - start_k)); - EXPECT_GT(tile_l, 0); - EXPECT_LE(tile_l, kParallelize4DTile2DTileL); - EXPECT_EQ(start_l % kParallelize4DTile2DTileL, 0); - EXPECT_EQ(tile_l, std::min(kParallelize4DTile2DTileL, kParallelize4DTile2DRangeL - start_l)); + EXPECT_GT(tile_l, 0); + EXPECT_LE(tile_l, kParallelize4DTile2DTileL); + EXPECT_EQ(start_l % kParallelize4DTile2DTileL, 0); + EXPECT_EQ(tile_l, std::min(kParallelize4DTile2DTileL, + kParallelize4DTile2DRangeL - start_l)); } TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_4d_tile_2d_with_uarch( - threadpool.get(), - CheckTiling4DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - 
kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); + pthreadpool_parallelize_4d_tile_2d_with_uarch( + threadpool.get(), CheckTiling4DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); } TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_4d_tile_2d_with_uarch( - threadpool.get(), - CheckTiling4DTile2DWithUArch, - nullptr, - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); -} - -static void SetTrue4DTile2DWithUArch(std::atomic_bool* processed_indicators, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } - } + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_with_uarch( + threadpool.get(), CheckTiling4DTile2DWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + 
kParallelize4DTile2DTileL, /*flags=*/0); +} + +static void SetTrue4DTile2DWithUArch(std::atomic_bool* processed_indicators, + uint32_t, size_t i, size_t j, + size_t start_k, size_t start_l, + size_t tile_k, size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = + ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } } TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(SetTrue4DTile2DWithUArch), - static_cast(indicators.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed"; - } - } - } - } + std::vector indicators( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue4DTile2DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } } TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_4d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(SetTrue4DTile2DWithUArch), - static_cast(indicators.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { - for 
(size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed"; - } - } - } - } -} - -static void Increment4DTile2DWithUArch(std::atomic_int* processed_counters, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - for (size_t k = start_k; k < start_k + tile_k; k++) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - } + std::vector indicators( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue4DTile2DWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + 
EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } +} + +static void Increment4DTile2DWithUArch(std::atomic_int* processed_counters, + uint32_t, size_t i, size_t j, + size_t start_k, size_t start_l, + size_t tile_k, size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = + ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } } TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_4d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment4DTile2DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times 
(expected: 1)"; - } - } - } - } + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment4DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } } TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_4d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment4DTile2DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, 
kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } -} - -TEST(Parallelize4DTile2DWithUArch, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_4d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment4DTile2DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - 
<< "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } - } -} - -TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { - pthreadpool_parallelize_4d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(Increment4DTile2DWithUArch), - static_cast(counters.data()), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { - const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations << ")"; - } - } - } - } -} - -static void IncrementSame4DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - for (size_t k = start_k; k < 
start_k + tile_k; k++) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } - } + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment4DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } +} + +TEST(Parallelize4DTile2DWithUArch, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_4d_tile_2d_with_uarch( + 
threadpool.get(), + reinterpret_cast( + Increment4DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } + } +} + +TEST(Parallelize4DTile2DWithUArch, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_4d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment4DTile2DWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; 
j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } + } +} + +static void IncrementSame4DTile2DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t i, size_t j, + size_t start_k, size_t start_l, + size_t tile_k, size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } } TEST(Parallelize4DTile2DWithUArch, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_4d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(IncrementSame4DTile2DWithUArch), - static_cast(&num_processed_items), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); -} - -static void WorkImbalance4DTile2DWithUArch(std::atomic_int* num_processed_items, uint32_t, size_t i, size_t j, 
size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { - num_processed_items->fetch_add(tile_k * tile_l, std::memory_order_relaxed); - if (i == 0 && j == 0 && start_k == 0 && start_l == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + IncrementSame4DTile2DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); +} + +static void WorkImbalance4DTile2DWithUArch(std::atomic_int* num_processed_items, + uint32_t, size_t i, size_t j, + size_t start_k, size_t start_l, + size_t tile_k, size_t tile_l) { + num_processed_items->fetch_add(tile_k * tile_l, std::memory_order_relaxed); + if (i == 0 && j == 0 && start_k == 0 && start_l == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize4DTile2DWithUArch, 
MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_4d_tile_2d_with_uarch( - threadpool.get(), - reinterpret_cast(WorkImbalance4DTile2DWithUArch), - static_cast(&num_processed_items), - kDefaultUArchIndex, kMaxUArchIndex, - kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, - kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + pthreadpool_parallelize_4d_tile_2d_with_uarch( + threadpool.get(), + reinterpret_cast( + WorkImbalance4DTile2DWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); } -static void ComputeNothing5D(void*, size_t, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing4DTile2DDynamic(void*, size_t, size_t, size_t, size_t, + size_t, size_t) {} + +TEST(Parallelize4DTile2DDynamic, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + 
ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d_dynamic( + threadpool.get(), ComputeNothing4DTile2DDynamic, nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); +} + +TEST(Parallelize4DTile2DDynamic, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_dynamic( + threadpool.get(), ComputeNothing4DTile2DDynamic, nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); +} + +static void CheckBounds4DTile2DDynamic(void*, size_t i, size_t j, + size_t start_k, size_t start_l, + size_t tile_k, size_t tile_l) { + EXPECT_LT(i, kParallelize4DTile2DRangeI); + EXPECT_LT(j, kParallelize4DTile2DRangeJ); + EXPECT_LT(start_k, kParallelize4DTile2DRangeK); + EXPECT_LT(start_l, kParallelize4DTile2DRangeL); + EXPECT_LE(start_k + tile_k, kParallelize4DTile2DRangeK); + EXPECT_LE(start_l + tile_l, kParallelize4DTile2DRangeL); +} + +TEST(Parallelize4DTile2DDynamic, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d_dynamic( + threadpool.get(), CheckBounds4DTile2DDynamic, nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); +} + +TEST(Parallelize4DTile2DDynamic, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if 
(pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_dynamic( + threadpool.get(), CheckBounds4DTile2DDynamic, nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); +} + +static void CheckTiling4DTile2DDynamic(void*, size_t i, size_t j, + size_t start_k, size_t start_l, + size_t tile_k, size_t tile_l) { + EXPECT_GT(tile_k, 0); + EXPECT_EQ(start_k % kParallelize4DTile2DTileK, 0); + + EXPECT_GT(tile_l, 0); + EXPECT_EQ(start_l % kParallelize4DTile2DTileL, 0); +} + +TEST(Parallelize4DTile2DDynamic, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d_dynamic( + threadpool.get(), CheckTiling4DTile2DDynamic, nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); +} + +TEST(Parallelize4DTile2DDynamic, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_dynamic( + threadpool.get(), CheckTiling4DTile2DDynamic, nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); +} + +static void SetTrue4DTile2DDynamic(std::atomic_bool* processed_indicators, + size_t i, size_t j, size_t start_k, + size_t start_l, size_t tile_k, + size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t 
linear_idx = + ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize4DTile2DDynamic, SingleThreadPoolAllItemsProcessed) { + std::vector indicators( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(SetTrue4DTile2DDynamic), + static_cast(indicators.data()), kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } +} + +TEST(Parallelize4DTile2DDynamic, MultiThreadPoolAllItemsProcessed) { + std::vector indicators( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(SetTrue4DTile2DDynamic), + static_cast(indicators.data()), 
kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } +} + +static void Increment4DTile2DDynamic(std::atomic_int* processed_counters, + size_t i, size_t j, size_t start_k, + size_t start_l, size_t tile_k, + size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = + ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize4DTile2DDynamic, SingleThreadPoolEachItemProcessedOnce) { + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(Increment4DTile2DDynamic), + static_cast(counters.data()), kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j 
< kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } +} + +TEST(Parallelize4DTile2DDynamic, MultiThreadPoolEachItemProcessedOnce) { + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast(Increment4DTile2DDynamic), + static_cast(counters.data()), kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } +} + +TEST(Parallelize4DTile2DDynamic, + 
SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_4d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast( + Increment4DTile2DDynamic), + static_cast(counters.data()), kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } + } +} + +TEST(Parallelize4DTile2DDynamic, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_4d_tile_2d_dynamic( + threadpool.get(), + 
reinterpret_cast( + Increment4DTile2DDynamic), + static_cast(counters.data()), kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } + } +} + +static void IncrementSame4DTile2DDynamic(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t start_k, + size_t start_l, size_t tile_k, + size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize4DTile2DDynamic, MultiThreadPoolHighContention) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast( + IncrementSame4DTile2DDynamic), + static_cast(&num_processed_items), kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); 
+ EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); +} + +static void WorkImbalance4DTile2DDynamic(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t start_k, + size_t start_l, size_t tile_k, + size_t tile_l) { + num_processed_items->fetch_add(tile_k * tile_l, std::memory_order_relaxed); + if (i == 0 && j == 0 && start_k == 0 && start_l == 0) { + /* Sleep for a second. This differs from the non-dynamic `WorkImbalance*` + * strategies in that a thread may reserve more elements than fit into a + * single work function call. Blocking a single work function call will also + * block any potentially remaining elements allocated to that thread. We + * therefore just sleep for a second instead.*/ + std::this_thread::sleep_for(std::chrono::seconds(1)); + } +} + +TEST(Parallelize4DTile2DDynamic, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_dynamic( + threadpool.get(), + reinterpret_cast( + WorkImbalance4DTile2DDynamic), + static_cast(&num_processed_items), kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); +} + +static void ComputeNothing4DTile2DDynamicWithUArch(void*, uint32_t, size_t, + size_t, size_t, size_t, + size_t, size_t) {} + +TEST(Parallelize4DTile2DDynamicWithUArch, SingleThreadPoolCompletes) { + auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), ComputeNothing4DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); +} + +TEST(Parallelize4DTile2DDynamicWithUArch, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), ComputeNothing4DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); +} + +static void CheckUArch4DTile2DDynamicWithUArch(void*, uint32_t uarch_index, + size_t, size_t, size_t, size_t, + size_t, size_t) { + if (uarch_index != kDefaultUArchIndex) { + EXPECT_LE(uarch_index, kMaxUArchIndex); + } +} + +TEST(Parallelize4DTile2DDynamicWithUArch, SingleThreadPoolUArchInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckUArch4DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); +} + +TEST(Parallelize4DTile2DDynamicWithUArch, MultiThreadPoolUArchInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + 
ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckUArch4DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); +} + +static void CheckBounds4DTile2DDynamicWithUArch(void*, uint32_t, size_t i, + size_t j, size_t start_k, + size_t start_l, size_t tile_k, + size_t tile_l) { + EXPECT_LT(i, kParallelize4DTile2DRangeI); + EXPECT_LT(j, kParallelize4DTile2DRangeJ); + EXPECT_LT(start_k, kParallelize4DTile2DRangeK); + EXPECT_LT(start_l, kParallelize4DTile2DRangeL); + EXPECT_LE(start_k + tile_k, kParallelize4DTile2DRangeK); + EXPECT_LE(start_l + tile_l, kParallelize4DTile2DRangeL); +} + +TEST(Parallelize4DTile2DDynamicWithUArch, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckBounds4DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); +} + +TEST(Parallelize4DTile2DDynamicWithUArch, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckBounds4DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, 
+ kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); +} + +static void CheckTiling4DTile2DDynamicWithUArch(void*, uint32_t, size_t i, + size_t j, size_t start_k, + size_t start_l, size_t tile_k, + size_t tile_l) { + EXPECT_GT(tile_k, 0); + EXPECT_EQ(start_k % kParallelize4DTile2DTileK, 0); + + EXPECT_GT(tile_l, 0); + EXPECT_EQ(start_l % kParallelize4DTile2DTileL, 0); +} + +TEST(Parallelize4DTile2DDynamicWithUArch, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckTiling4DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); +} + +TEST(Parallelize4DTile2DDynamicWithUArch, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), CheckTiling4DTile2DDynamicWithUArch, nullptr, + kDefaultUArchIndex, kMaxUArchIndex, kParallelize4DTile2DRangeI, + kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, + kParallelize4DTile2DRangeL, kParallelize4DTile2DTileK, + kParallelize4DTile2DTileL, /*flags=*/0); +} + +static void SetTrue4DTile2DDynamicWithUArch( + std::atomic_bool* processed_indicators, uint32_t, size_t i, size_t j, + size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = + ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + + k) * + 
kParallelize4DTile2DRangeL + + l; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize4DTile2DDynamicWithUArch, SingleThreadPoolAllItemsProcessed) { + std::vector indicators( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue4DTile2DDynamicWithUArch), + static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } +} + +TEST(Parallelize4DTile2DDynamicWithUArch, MultiThreadPoolAllItemsProcessed) { + std::vector indicators( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + SetTrue4DTile2DDynamicWithUArch), + 
static_cast(indicators.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") not processed"; + } + } + } + } +} + +static void Increment4DTile2DDynamicWithUArch( + std::atomic_int* processed_counters, uint32_t, size_t i, size_t j, + size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = + ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize4DTile2DDynamicWithUArch, + SingleThreadPoolEachItemProcessedOnce) { + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment4DTile2DDynamicWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, 
kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } +} + +TEST(Parallelize4DTile2DDynamicWithUArch, + MultiThreadPoolEachItemProcessedOnce) { + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment4DTile2DDynamicWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + 
EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } +} + +TEST(Parallelize4DTile2DDynamicWithUArch, + SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment4DTile2DDynamicWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } + } +} + +TEST(Parallelize4DTile2DDynamicWithUArch, + MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector counters( + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * 
kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + Increment4DTile2DDynamicWithUArch), + static_cast(counters.data()), kDefaultUArchIndex, kMaxUArchIndex, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * + kParallelize4DTile2DRangeK + + k) * + kParallelize4DTile2DRangeL + + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } + } +} + +static void IncrementSame4DTile2DDynamicWithUArch( + std::atomic_int* num_processed_items, uint32_t, size_t i, size_t j, + size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize4DTile2DDynamicWithUArch, MultiThreadPoolHighContention) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + 
ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + IncrementSame4DTile2DDynamicWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); +} + +static void WorkImbalance4DTile2DDynamicWithUArch( + std::atomic_int* num_processed_items, uint32_t, size_t i, size_t j, + size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { + num_processed_items->fetch_add(tile_k * tile_l, std::memory_order_relaxed); + if (i == 0 && j == 0 && start_k == 0 && start_l == 0) { + /* Sleep for a second. This differs from the non-dynamic `WorkImbalance*` + * strategies in that a thread may reserve more elements than fit into a + * single work function call. Blocking a single work function call will also + * block any potentially remaining elements allocated to that thread. 
We + * therefore just sleep for a second instead.*/ + std::this_thread::sleep_for(std::chrono::seconds(1)); + } +} + +TEST(Parallelize4DTile2DDynamicWithUArch, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d_dynamic_with_uarch( + threadpool.get(), + reinterpret_cast( + WorkImbalance4DTile2DDynamicWithUArch), + static_cast(&num_processed_items), kDefaultUArchIndex, + kMaxUArchIndex, kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, + kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * + kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); +} + +static void ComputeNothing5D(void*, size_t, size_t, size_t, size_t, size_t) {} TEST(Parallelize5D, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_5d(threadpool.get(), - ComputeNothing5D, - nullptr, - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM, - 0 /* flags */); + pthreadpool_parallelize_5d(threadpool.get(), ComputeNothing5D, nullptr, + kParallelize5DRangeI, kParallelize5DRangeJ, + kParallelize5DRangeK, kParallelize5DRangeL, + kParallelize5DRangeM, /*flags=*/0); } TEST(Parallelize5D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t 
threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_5d( - threadpool.get(), - ComputeNothing5D, - nullptr, - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM, - 0 /* flags */); + pthreadpool_parallelize_5d(threadpool.get(), ComputeNothing5D, nullptr, + kParallelize5DRangeI, kParallelize5DRangeJ, + kParallelize5DRangeK, kParallelize5DRangeL, + kParallelize5DRangeM, /*flags=*/0); } -static void CheckBounds5D(void*, size_t i, size_t j, size_t k, size_t l, size_t m) { - EXPECT_LT(i, kParallelize5DRangeI); - EXPECT_LT(j, kParallelize5DRangeJ); - EXPECT_LT(k, kParallelize5DRangeK); - EXPECT_LT(l, kParallelize5DRangeL); - EXPECT_LT(m, kParallelize5DRangeM); +static void CheckBounds5D(void*, size_t i, size_t j, size_t k, size_t l, + size_t m) { + EXPECT_LT(i, kParallelize5DRangeI); + EXPECT_LT(j, kParallelize5DRangeJ); + EXPECT_LT(k, kParallelize5DRangeK); + EXPECT_LT(l, kParallelize5DRangeL); + EXPECT_LT(m, kParallelize5DRangeM); } TEST(Parallelize5D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_5d( - threadpool.get(), - CheckBounds5D, - nullptr, - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM, - 0 /* flags */); + pthreadpool_parallelize_5d(threadpool.get(), CheckBounds5D, nullptr, + kParallelize5DRangeI, kParallelize5DRangeJ, + kParallelize5DRangeK, kParallelize5DRangeL, + kParallelize5DRangeM, /*flags=*/0); } TEST(Parallelize5D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t 
threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_5d( - threadpool.get(), - CheckBounds5D, - nullptr, - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM, - 0 /* flags */); + pthreadpool_parallelize_5d(threadpool.get(), CheckBounds5D, nullptr, + kParallelize5DRangeI, kParallelize5DRangeJ, + kParallelize5DRangeK, kParallelize5DRangeL, + kParallelize5DRangeM, /*flags=*/0); } -static void SetTrue5D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l, size_t m) { - const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); +static void SetTrue5D(std::atomic_bool* processed_indicators, size_t i, + size_t j, size_t k, size_t l, size_t m) { + const size_t linear_idx = + (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * + kParallelize5DRangeL + + l) * + kParallelize5DRangeM + + m; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); } TEST(Parallelize5D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d( - threadpool.get(), - reinterpret_cast(SetTrue5D), - static_cast(indicators.data()), - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM, - 0 
/* flags */); - - for (size_t i = 0; i < kParallelize5DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed"; - } - } - } - } - } + std::vector indicators( + kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * + kParallelize5DRangeL * kParallelize5DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d( + threadpool.get(), reinterpret_cast(SetTrue5D), + static_cast(indicators.data()), kParallelize5DRangeI, + kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, + kParallelize5DRangeM, /*flags=*/0); + + for (size_t i = 0; i < kParallelize5DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DRangeM; m++) { + const size_t linear_idx = + (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * + kParallelize5DRangeL + + l) * + kParallelize5DRangeM + + m; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") not processed"; + } + } + } + } + } } TEST(Parallelize5D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), 
pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_5d( - threadpool.get(), - reinterpret_cast(SetTrue5D), - static_cast(indicators.data()), - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize5DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed"; - } - } - } - } - } -} - -static void Increment5D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t m) { - const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + std::vector indicators( + kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * + kParallelize5DRangeL * kParallelize5DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d( + threadpool.get(), reinterpret_cast(SetTrue5D), + static_cast(indicators.data()), kParallelize5DRangeI, + kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, + kParallelize5DRangeM, /*flags=*/0); + + for (size_t i = 0; i < kParallelize5DRangeI; i++) { + for (size_t j = 0; j < 
kParallelize5DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DRangeM; m++) { + const size_t linear_idx = + (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * + kParallelize5DRangeL + + l) * + kParallelize5DRangeM + + m; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") not processed"; + } + } + } + } + } +} + +static void Increment5D(std::atomic_int* processed_counters, size_t i, size_t j, + size_t k, size_t l, size_t m) { + const size_t linear_idx = + (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * + kParallelize5DRangeL + + l) * + kParallelize5DRangeM + + m; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize5D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d( - threadpool.get(), - reinterpret_cast(Increment5D), - static_cast(counters.data()), - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize5DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " 
<< m << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } + std::vector counters( + kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * + kParallelize5DRangeL * kParallelize5DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d( + threadpool.get(), reinterpret_cast(Increment5D), + static_cast(counters.data()), kParallelize5DRangeI, + kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, + kParallelize5DRangeM, /*flags=*/0); + + for (size_t i = 0; i < kParallelize5DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DRangeM; m++) { + const size_t linear_idx = + (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * + kParallelize5DRangeL + + l) * + kParallelize5DRangeM + + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } } TEST(Parallelize5D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_5d( - threadpool.get(), - reinterpret_cast(Increment5D), - static_cast(counters.data()), - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM, - 0 /* flags */); - - for (size_t i = 0; i 
< kParallelize5DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } + std::vector counters( + kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * + kParallelize5DRangeL * kParallelize5DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d( + threadpool.get(), reinterpret_cast(Increment5D), + static_cast(counters.data()), kParallelize5DRangeI, + kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, + kParallelize5DRangeM, /*flags=*/0); + + for (size_t i = 0; i < kParallelize5DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DRangeM; m++) { + const size_t linear_idx = + (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * + kParallelize5DRangeL + + l) * + kParallelize5DRangeM + + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } } TEST(Parallelize5D, 
SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations5D; iteration++) { - pthreadpool_parallelize_5d( - threadpool.get(), - reinterpret_cast(Increment5D), - static_cast(counters.data()), - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize5DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations5D << ")"; - } - } - } - } - } + std::vector counters( + kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * + kParallelize5DRangeL * kParallelize5DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations5D; iteration++) { + pthreadpool_parallelize_5d( + threadpool.get(), reinterpret_cast(Increment5D), + static_cast(counters.data()), kParallelize5DRangeI, + kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, + kParallelize5DRangeM, /*flags=*/0); + } + + for (size_t i = 0; i < 
kParallelize5DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DRangeM; m++) { + const size_t linear_idx = + (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * + kParallelize5DRangeL + + l) * + kParallelize5DRangeM + + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations5D) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations5D << ")"; + } + } + } + } + } } TEST(Parallelize5D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations5D; iteration++) { - pthreadpool_parallelize_5d( - threadpool.get(), - reinterpret_cast(Increment5D), - static_cast(counters.data()), - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize5DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * kParallelize5DRangeL + l) * kParallelize5DRangeM + m; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D) - << "Element (" << 
i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations5D << ")"; - } - } - } - } - } -} - -static void IncrementSame5D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t m) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); + std::vector counters( + kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * + kParallelize5DRangeL * kParallelize5DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations5D; iteration++) { + pthreadpool_parallelize_5d( + threadpool.get(), reinterpret_cast(Increment5D), + static_cast(counters.data()), kParallelize5DRangeI, + kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, + kParallelize5DRangeM, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize5DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DRangeM; m++) { + const size_t linear_idx = + (((i * kParallelize5DRangeJ + j) * kParallelize5DRangeK + k) * + kParallelize5DRangeL + + l) * + kParallelize5DRangeM + + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations5D) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations5D << ")"; + } + } + } + } + } +} + +static void IncrementSame5D(std::atomic_int* num_processed_items, size_t i, + size_t j, size_t k, size_t l, size_t m) { + num_processed_items->fetch_add(1, 
std::memory_order_relaxed); } TEST(Parallelize5D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_5d( - threadpool.get(), - reinterpret_cast(IncrementSame5D), - static_cast(&num_processed_items), - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM); -} - -static void WorkImbalance5D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t m) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - if (i == 0 && j == 0 && k == 0 && l == 0 && m == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d( + threadpool.get(), + reinterpret_cast(IncrementSame5D), + static_cast(&num_processed_items), kParallelize5DRangeI, + kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, + kParallelize5DRangeM, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * + kParallelize5DRangeL * 
kParallelize5DRangeM); +} + +static void WorkImbalance5D(std::atomic_int* num_processed_items, size_t i, + size_t j, size_t k, size_t l, size_t m) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + if (i == 0 && j == 0 && k == 0 && l == 0 && m == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * + kParallelize5DRangeL * kParallelize5DRangeM) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize5D, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_5d( - threadpool.get(), - reinterpret_cast(WorkImbalance5D), - static_cast(&num_processed_items), - kParallelize5DRangeI, kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, kParallelize5DRangeM, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * kParallelize5DRangeL * kParallelize5DRangeM); + pthreadpool_parallelize_5d( + threadpool.get(), + reinterpret_cast(WorkImbalance5D), + static_cast(&num_processed_items), kParallelize5DRangeI, + kParallelize5DRangeJ, kParallelize5DRangeK, kParallelize5DRangeL, + kParallelize5DRangeM, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize5DRangeI * kParallelize5DRangeJ * kParallelize5DRangeK * + kParallelize5DRangeL * kParallelize5DRangeM); } 
-static void ComputeNothing5DTile1D(void*, size_t, size_t, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing5DTile1D(void*, size_t, size_t, size_t, size_t, + size_t, size_t) {} TEST(Parallelize5DTile1D, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_5d_tile_1d(threadpool.get(), - ComputeNothing5DTile1D, - nullptr, - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM, - 0 /* flags */); + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), ComputeNothing5DTile1D, nullptr, + kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, + kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, + kParallelize5DTile1DRangeM, kParallelize5DTile1DTileM, /*flags=*/0); } TEST(Parallelize5DTile1D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - ComputeNothing5DTile1D, - nullptr, - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM, - 0 /* flags */); + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), ComputeNothing5DTile1D, nullptr, + kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, + kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, + kParallelize5DTile1DRangeM, 
kParallelize5DTile1DTileM, /*flags=*/0); } -static void CheckBounds5DTile1D(void*, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) { - EXPECT_LT(i, kParallelize5DTile1DRangeI); - EXPECT_LT(j, kParallelize5DTile1DRangeJ); - EXPECT_LT(k, kParallelize5DTile1DRangeK); - EXPECT_LT(l, kParallelize5DTile1DRangeL); - EXPECT_LT(start_m, kParallelize5DTile1DRangeM); - EXPECT_LE(start_m + tile_m, kParallelize5DTile1DRangeM); +static void CheckBounds5DTile1D(void*, size_t i, size_t j, size_t k, size_t l, + size_t start_m, size_t tile_m) { + EXPECT_LT(i, kParallelize5DTile1DRangeI); + EXPECT_LT(j, kParallelize5DTile1DRangeJ); + EXPECT_LT(k, kParallelize5DTile1DRangeK); + EXPECT_LT(l, kParallelize5DTile1DRangeL); + EXPECT_LT(start_m, kParallelize5DTile1DRangeM); + EXPECT_LE(start_m + tile_m, kParallelize5DTile1DRangeM); } TEST(Parallelize5DTile1D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - CheckBounds5DTile1D, - nullptr, - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM, - 0 /* flags */); + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), CheckBounds5DTile1D, nullptr, + kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, + kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, + kParallelize5DTile1DRangeM, kParallelize5DTile1DTileM, /*flags=*/0); } TEST(Parallelize5DTile1D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if 
(pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - CheckBounds5DTile1D, - nullptr, - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM, - 0 /* flags */); + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), CheckBounds5DTile1D, nullptr, + kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, + kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, + kParallelize5DTile1DRangeM, kParallelize5DTile1DTileM, /*flags=*/0); } -static void CheckTiling5DTile1D(void*, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) { - EXPECT_GT(tile_m, 0); - EXPECT_LE(tile_m, kParallelize5DTile1DTileM); - EXPECT_EQ(start_m % kParallelize5DTile1DTileM, 0); - EXPECT_EQ(tile_m, std::min(kParallelize5DTile1DTileM, kParallelize5DTile1DRangeM - start_m)); +static void CheckTiling5DTile1D(void*, size_t i, size_t j, size_t k, size_t l, + size_t start_m, size_t tile_m) { + EXPECT_GT(tile_m, 0); + EXPECT_LE(tile_m, kParallelize5DTile1DTileM); + EXPECT_EQ(start_m % kParallelize5DTile1DTileM, 0); + EXPECT_EQ(tile_m, std::min(kParallelize5DTile1DTileM, + kParallelize5DTile1DRangeM - start_m)); } TEST(Parallelize5DTile1D, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - CheckTiling5DTile1D, - nullptr, - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM, - 0 /* flags */); + pthreadpool_parallelize_5d_tile_1d( + 
threadpool.get(), CheckTiling5DTile1D, nullptr, + kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, + kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, + kParallelize5DTile1DRangeM, kParallelize5DTile1DTileM, /*flags=*/0); } TEST(Parallelize5DTile1D, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - CheckTiling5DTile1D, - nullptr, - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM, - 0 /* flags */); -} - -static void SetTrue5DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) { - for (size_t m = start_m; m < start_m + tile_m; m++) { - const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), CheckTiling5DTile1D, nullptr, + kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, + kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, + kParallelize5DTile1DRangeM, kParallelize5DTile1DTileM, /*flags=*/0); +} + +static void SetTrue5DTile1D(std::atomic_bool* processed_indicators, size_t i, + size_t j, size_t k, size_t l, size_t start_m, + size_t tile_m) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + const size_t linear_idx = + (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK 
+ + k) * + kParallelize5DTile1DRangeL + + l) * + kParallelize5DTile1DRangeM + + m; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } } TEST(Parallelize5DTile1D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - reinterpret_cast(SetTrue5DTile1D), - static_cast(indicators.data()), - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed"; - } - } - } - } - } + std::vector indicators( + kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * + kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * + kParallelize5DTile1DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue5DTile1D), + static_cast(indicators.data()), kParallelize5DTile1DRangeI, + kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, + 
kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, + kParallelize5DTile1DTileM, /*flags=*/0); + + for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * + kParallelize5DTile1DRangeK + + k) * + kParallelize5DTile1DRangeL + + l) * + kParallelize5DTile1DRangeM + + m; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") not processed"; + } + } + } + } + } } TEST(Parallelize5DTile1D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - reinterpret_cast(SetTrue5DTile1D), - static_cast(indicators.data()), - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m; - 
EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed"; - } - } - } - } - } -} - -static void Increment5DTile1D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) { - for (size_t m = start_m; m < start_m + tile_m; m++) { - const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } + std::vector indicators( + kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * + kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * + kParallelize5DTile1DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue5DTile1D), + static_cast(indicators.data()), kParallelize5DTile1DRangeI, + kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, + kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, + kParallelize5DTile1DTileM, /*flags=*/0); + + for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * + kParallelize5DTile1DRangeK + + k) * + kParallelize5DTile1DRangeL + + l) * + kParallelize5DTile1DRangeM + + m; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") not processed"; + } + } + } + } + } +} + 
+static void Increment5DTile1D(std::atomic_int* processed_counters, size_t i, + size_t j, size_t k, size_t l, size_t start_m, + size_t tile_m) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + const size_t linear_idx = + (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + + k) * + kParallelize5DTile1DRangeL + + l) * + kParallelize5DTile1DRangeM + + m; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize5DTile1D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment5DTile1D), - static_cast(counters.data()), - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } + std::vector counters( + kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * + kParallelize5DTile1DRangeK * 
kParallelize5DTile1DRangeL * + kParallelize5DTile1DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment5DTile1D), + static_cast(counters.data()), kParallelize5DTile1DRangeI, + kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, + kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, + kParallelize5DTile1DTileM, /*flags=*/0); + + for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * + kParallelize5DTile1DRangeK + + k) * + kParallelize5DTile1DRangeL + + l) * + kParallelize5DTile1DRangeM + + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } } TEST(Parallelize5DTile1D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment5DTile1D), - static_cast(counters.data()), - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM, - 0 /* flags */); 
- - for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } + std::vector counters( + kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * + kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * + kParallelize5DTile1DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment5DTile1D), + static_cast(counters.data()), kParallelize5DTile1DRangeI, + kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, + kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, + kParallelize5DTile1DTileM, /*flags=*/0); + + for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * + kParallelize5DTile1DRangeK + + k) * + kParallelize5DTile1DRangeL + + l) * + kParallelize5DTile1DRangeM + + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" 
<< i << ", " << j << ", " << k << ", " << l + << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } } TEST(Parallelize5DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations5D; iteration++) { - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment5DTile1D), - static_cast(counters.data()), - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations5D << ")"; - } - } - } - } - } + std::vector counters( + kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * + kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * + kParallelize5DTile1DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), 
pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations5D; iteration++) { + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment5DTile1D), + static_cast(counters.data()), kParallelize5DTile1DRangeI, + kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, + kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, + kParallelize5DTile1DTileM, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * + kParallelize5DTile1DRangeK + + k) * + kParallelize5DTile1DRangeL + + l) * + kParallelize5DTile1DRangeM + + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations5D) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations5D << ")"; + } + } + } + } + } } TEST(Parallelize5DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations5D; iteration++) { - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment5DTile1D), - static_cast(counters.data()), - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, 
kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize5DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * kParallelize5DTile1DRangeK + k) * kParallelize5DTile1DRangeL + l) * kParallelize5DTile1DRangeM + m; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations5D << ")"; - } - } - } - } - } -} - -static void IncrementSame5DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) { - for (size_t m = start_m; m < start_m + tile_m; m++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } + std::vector counters( + kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * + kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * + kParallelize5DTile1DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations5D; iteration++) { + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment5DTile1D), + static_cast(counters.data()), kParallelize5DTile1DRangeI, + kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, + kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, + kParallelize5DTile1DTileM, /*flags=*/0); + } + + for (size_t 
i = 0; i < kParallelize5DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile1DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile1DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile1DRangeJ + j) * + kParallelize5DTile1DRangeK + + k) * + kParallelize5DTile1DRangeL + + l) * + kParallelize5DTile1DRangeM + + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations5D) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations5D << ")"; + } + } + } + } + } +} + +static void IncrementSame5DTile1D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t k, size_t l, + size_t start_m, size_t tile_m) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize5DTile1D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - reinterpret_cast(IncrementSame5DTile1D), - static_cast(&num_processed_items), - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM); -} - -static void WorkImbalance5DTile1D(std::atomic_int* 
num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t tile_m) { - num_processed_items->fetch_add(tile_m, std::memory_order_relaxed); - if (i == 0 && j == 0 && k == 0 && l == 0 && start_m == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), + reinterpret_cast(IncrementSame5DTile1D), + static_cast(&num_processed_items), kParallelize5DTile1DRangeI, + kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, + kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, + kParallelize5DTile1DTileM, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * + kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * + kParallelize5DTile1DRangeM); +} + +static void WorkImbalance5DTile1D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t k, size_t l, + size_t start_m, size_t tile_m) { + num_processed_items->fetch_add(tile_m, std::memory_order_relaxed); + if (i == 0 && j == 0 && k == 0 && l == 0 && start_m == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * + kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * + kParallelize5DTile1DRangeM) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize5DTile1D, 
MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_5d_tile_1d( - threadpool.get(), - reinterpret_cast(WorkImbalance5DTile1D), - static_cast(&num_processed_items), - kParallelize5DTile1DRangeI, kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, - kParallelize5DTile1DTileM, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * kParallelize5DTile1DRangeM); + pthreadpool_parallelize_5d_tile_1d( + threadpool.get(), + reinterpret_cast(WorkImbalance5DTile1D), + static_cast(&num_processed_items), kParallelize5DTile1DRangeI, + kParallelize5DTile1DRangeJ, kParallelize5DTile1DRangeK, + kParallelize5DTile1DRangeL, kParallelize5DTile1DRangeM, + kParallelize5DTile1DTileM, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize5DTile1DRangeI * kParallelize5DTile1DRangeJ * + kParallelize5DTile1DRangeK * kParallelize5DTile1DRangeL * + kParallelize5DTile1DRangeM); } -static void ComputeNothing5DTile2D(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing5DTile2D(void*, size_t, size_t, size_t, size_t, + size_t, size_t, size_t) {} TEST(Parallelize5DTile2D, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + 
auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_5d_tile_2d(threadpool.get(), - ComputeNothing5DTile2D, - nullptr, - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, - 0 /* flags */); + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), ComputeNothing5DTile2D, nullptr, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, + kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, + kParallelize5DTile2DRangeM, kParallelize5DTile2DTileL, + kParallelize5DTile2DTileM, /*flags=*/0); } TEST(Parallelize5DTile2D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - ComputeNothing5DTile2D, - nullptr, - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, - 0 /* flags */); + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), ComputeNothing5DTile2D, nullptr, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, + kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, + kParallelize5DTile2DRangeM, kParallelize5DTile2DTileL, + kParallelize5DTile2DTileM, /*flags=*/0); } -static void CheckBounds5DTile2D(void*, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) { - EXPECT_LT(i, kParallelize5DTile2DRangeI); - EXPECT_LT(j, 
kParallelize5DTile2DRangeJ); - EXPECT_LT(k, kParallelize5DTile2DRangeK); - EXPECT_LT(start_l, kParallelize5DTile2DRangeL); - EXPECT_LT(start_m, kParallelize5DTile2DRangeM); - EXPECT_LE(start_l + tile_l, kParallelize5DTile2DRangeL); - EXPECT_LE(start_m + tile_m, kParallelize5DTile2DRangeM); +static void CheckBounds5DTile2D(void*, size_t i, size_t j, size_t k, + size_t start_l, size_t start_m, size_t tile_l, + size_t tile_m) { + EXPECT_LT(i, kParallelize5DTile2DRangeI); + EXPECT_LT(j, kParallelize5DTile2DRangeJ); + EXPECT_LT(k, kParallelize5DTile2DRangeK); + EXPECT_LT(start_l, kParallelize5DTile2DRangeL); + EXPECT_LT(start_m, kParallelize5DTile2DRangeM); + EXPECT_LE(start_l + tile_l, kParallelize5DTile2DRangeL); + EXPECT_LE(start_m + tile_m, kParallelize5DTile2DRangeM); } TEST(Parallelize5DTile2D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - CheckBounds5DTile2D, - nullptr, - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, - 0 /* flags */); + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), CheckBounds5DTile2D, nullptr, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, + kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, + kParallelize5DTile2DRangeM, kParallelize5DTile2DTileL, + kParallelize5DTile2DTileM, /*flags=*/0); } TEST(Parallelize5DTile2D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if 
(pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - CheckBounds5DTile2D, - nullptr, - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, - 0 /* flags */); + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), CheckBounds5DTile2D, nullptr, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, + kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, + kParallelize5DTile2DRangeM, kParallelize5DTile2DTileL, + kParallelize5DTile2DTileM, /*flags=*/0); } -static void CheckTiling5DTile2D(void*, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) { - EXPECT_GT(tile_l, 0); - EXPECT_LE(tile_l, kParallelize5DTile2DTileL); - EXPECT_EQ(start_l % kParallelize5DTile2DTileL, 0); - EXPECT_EQ(tile_l, std::min(kParallelize5DTile2DTileL, kParallelize5DTile2DRangeL - start_l)); +static void CheckTiling5DTile2D(void*, size_t i, size_t j, size_t k, + size_t start_l, size_t start_m, size_t tile_l, + size_t tile_m) { + EXPECT_GT(tile_l, 0); + EXPECT_LE(tile_l, kParallelize5DTile2DTileL); + EXPECT_EQ(start_l % kParallelize5DTile2DTileL, 0); + EXPECT_EQ(tile_l, std::min(kParallelize5DTile2DTileL, + kParallelize5DTile2DRangeL - start_l)); - EXPECT_GT(tile_m, 0); - EXPECT_LE(tile_m, kParallelize5DTile2DTileM); - EXPECT_EQ(start_m % kParallelize5DTile2DTileM, 0); - EXPECT_EQ(tile_m, std::min(kParallelize5DTile2DTileM, kParallelize5DTile2DRangeM - start_m)); + EXPECT_GT(tile_m, 0); + EXPECT_LE(tile_m, kParallelize5DTile2DTileM); + EXPECT_EQ(start_m % kParallelize5DTile2DTileM, 0); + EXPECT_EQ(tile_m, std::min(kParallelize5DTile2DTileM, + kParallelize5DTile2DRangeM - start_m)); } TEST(Parallelize5DTile2D, SingleThreadPoolUniformTiling) { - 
auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - CheckTiling5DTile2D, - nullptr, - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, - 0 /* flags */); + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), CheckTiling5DTile2D, nullptr, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, + kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, + kParallelize5DTile2DRangeM, kParallelize5DTile2DTileL, + kParallelize5DTile2DTileM, /*flags=*/0); } TEST(Parallelize5DTile2D, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - CheckTiling5DTile2D, - nullptr, - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, - 0 /* flags */); -} - -static void SetTrue5DTile2D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - for (size_t m = start_m; m < start_m + tile_m; m++) { - const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } - } + auto_pthreadpool_t threadpool(pthreadpool_create(0), 
pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), CheckTiling5DTile2D, nullptr, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, + kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, + kParallelize5DTile2DRangeM, kParallelize5DTile2DTileL, + kParallelize5DTile2DTileM, /*flags=*/0); +} + +static void SetTrue5DTile2D(std::atomic_bool* processed_indicators, size_t i, + size_t j, size_t k, size_t start_l, size_t start_m, + size_t tile_l, size_t tile_m) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + const size_t linear_idx = + (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + + k) * + kParallelize5DTile2DRangeL + + l) * + kParallelize5DTile2DRangeM + + m; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } } TEST(Parallelize5DTile2D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - reinterpret_cast(SetTrue5DTile2D), - static_cast(indicators.data()), - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { - const 
size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed"; - } - } - } - } - } + std::vector indicators( + kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * + kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * + kParallelize5DTile2DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + reinterpret_cast(SetTrue5DTile2D), + static_cast(indicators.data()), kParallelize5DTile2DRangeI, + kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, + kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, /*flags=*/0); + + for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * + kParallelize5DTile2DRangeK + + k) * + kParallelize5DTile2DRangeL + + l) * + kParallelize5DTile2DRangeM + + m; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") not processed"; + } + } + } + } + } } TEST(Parallelize5DTile2D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - 
- if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - reinterpret_cast(SetTrue5DTile2D), - static_cast(indicators.data()), - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed"; - } - } - } - } - } -} - -static void Increment5DTile2D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - for (size_t m = start_m; m < start_m + tile_m; m++) { - const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - } + std::vector indicators( + kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * + kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * + kParallelize5DTile2DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + 
pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + reinterpret_cast(SetTrue5DTile2D), + static_cast(indicators.data()), kParallelize5DTile2DRangeI, + kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, + kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, /*flags=*/0); + + for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * + kParallelize5DTile2DRangeK + + k) * + kParallelize5DTile2DRangeL + + l) * + kParallelize5DTile2DRangeM + + m; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") not processed"; + } + } + } + } + } +} + +static void Increment5DTile2D(std::atomic_int* processed_counters, size_t i, + size_t j, size_t k, size_t start_l, + size_t start_m, size_t tile_l, size_t tile_m) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + const size_t linear_idx = + (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + + k) * + kParallelize5DTile2DRangeL + + l) * + kParallelize5DTile2DRangeM + + m; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } } TEST(Parallelize5DTile2D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - 
reinterpret_cast(Increment5DTile2D), - static_cast(counters.data()), - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } + std::vector counters( + kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * + kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * + kParallelize5DTile2DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment5DTile2D), + static_cast(counters.data()), kParallelize5DTile2DRangeI, + kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, + kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, /*flags=*/0); + + for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { + const size_t linear_idx = (((i * 
kParallelize5DTile2DRangeJ + j) * + kParallelize5DTile2DRangeK + + k) * + kParallelize5DTile2DRangeL + + l) * + kParallelize5DTile2DRangeM + + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } } TEST(Parallelize5DTile2D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment5DTile2D), - static_cast(counters.data()), - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } + std::vector counters( + kParallelize5DTile2DRangeI * 
kParallelize5DTile2DRangeJ * + kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * + kParallelize5DTile2DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment5DTile2D), + static_cast(counters.data()), kParallelize5DTile2DRangeI, + kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, + kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, /*flags=*/0); + + for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * + kParallelize5DTile2DRangeK + + k) * + kParallelize5DTile2DRangeL + + l) * + kParallelize5DTile2DRangeM + + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } } TEST(Parallelize5DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations5D; iteration++) { - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment5DTile2D), - static_cast(counters.data()), - 
kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations5D << ")"; - } - } - } - } - } + std::vector counters( + kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * + kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * + kParallelize5DTile2DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations5D; iteration++) { + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment5DTile2D), + static_cast(counters.data()), kParallelize5DTile2DRangeI, + kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, + kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { + for (size_t m = 0; m 
< kParallelize5DTile2DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * + kParallelize5DTile2DRangeK + + k) * + kParallelize5DTile2DRangeL + + l) * + kParallelize5DTile2DRangeM + + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations5D) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations5D << ")"; + } + } + } + } + } } TEST(Parallelize5DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations5D; iteration++) { - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment5DTile2D), - static_cast(counters.data()), - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { - for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { - const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D) - << 
"Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations5D << ")"; - } - } - } - } - } -} - -static void IncrementSame5DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) { - for (size_t l = start_l; l < start_l + tile_l; l++) { - for (size_t m = start_m; m < start_m + tile_m; m++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } - } + std::vector counters( + kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * + kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * + kParallelize5DTile2DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations5D; iteration++) { + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment5DTile2D), + static_cast(counters.data()), kParallelize5DTile2DRangeI, + kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, + kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * + kParallelize5DTile2DRangeK + + k) * + kParallelize5DTile2DRangeL + + l) * + kParallelize5DTile2DRangeM + + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations5D) + << "Element (" << i << ", " << 
j << ", " << k << ", " << l + << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations5D << ")"; + } + } + } + } + } +} + +static void IncrementSame5DTile2D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t k, size_t start_l, + size_t start_m, size_t tile_l, + size_t tile_m) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } } TEST(Parallelize5DTile2D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - reinterpret_cast(IncrementSame5DTile2D), - static_cast(&num_processed_items), - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); -} - -static void WorkImbalance5DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) { - num_processed_items->fetch_add(tile_l * tile_m, std::memory_order_relaxed); - if (i == 0 && j == 0 && k == 0 && start_l == 0 && start_m == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL 
* kParallelize5DTile2DRangeM) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + reinterpret_cast(IncrementSame5DTile2D), + static_cast(&num_processed_items), kParallelize5DTile2DRangeI, + kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, + kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * + kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * + kParallelize5DTile2DRangeM); +} + +static void WorkImbalance5DTile2D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t k, size_t start_l, + size_t start_m, size_t tile_l, + size_t tile_m) { + num_processed_items->fetch_add(tile_l * tile_m, std::memory_order_relaxed); + if (i == 0 && j == 0 && k == 0 && start_l == 0 && start_m == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * + kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * + kParallelize5DTile2DRangeM) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize5DTile2D, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if 
(pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_5d_tile_2d( - threadpool.get(), - reinterpret_cast(WorkImbalance5DTile2D), - static_cast(&num_processed_items), - kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, - kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + reinterpret_cast(WorkImbalance5DTile2D), + static_cast(&num_processed_items), kParallelize5DTile2DRangeI, + kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, + kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * + kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * + kParallelize5DTile2DRangeM); } -static void ComputeNothing6D(void*, size_t, size_t, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing6D(void*, size_t, size_t, size_t, size_t, size_t, + size_t) {} TEST(Parallelize6D, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_6d(threadpool.get(), - ComputeNothing6D, - nullptr, - kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN, - 0 /* flags */); + 
pthreadpool_parallelize_6d( + threadpool.get(), ComputeNothing6D, nullptr, kParallelize6DRangeI, + kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, + kParallelize6DRangeM, kParallelize6DRangeN, /*flags=*/0); } TEST(Parallelize6D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_6d( - threadpool.get(), - ComputeNothing6D, - nullptr, - kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN, - 0 /* flags */); + pthreadpool_parallelize_6d( + threadpool.get(), ComputeNothing6D, nullptr, kParallelize6DRangeI, + kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, + kParallelize6DRangeM, kParallelize6DRangeN, /*flags=*/0); } -static void CheckBounds6D(void*, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) { - EXPECT_LT(i, kParallelize6DRangeI); - EXPECT_LT(j, kParallelize6DRangeJ); - EXPECT_LT(k, kParallelize6DRangeK); - EXPECT_LT(l, kParallelize6DRangeL); - EXPECT_LT(m, kParallelize6DRangeM); - EXPECT_LT(n, kParallelize6DRangeN); +static void CheckBounds6D(void*, size_t i, size_t j, size_t k, size_t l, + size_t m, size_t n) { + EXPECT_LT(i, kParallelize6DRangeI); + EXPECT_LT(j, kParallelize6DRangeJ); + EXPECT_LT(k, kParallelize6DRangeK); + EXPECT_LT(l, kParallelize6DRangeL); + EXPECT_LT(m, kParallelize6DRangeM); + EXPECT_LT(n, kParallelize6DRangeN); } TEST(Parallelize6D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), 
pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_6d( - threadpool.get(), - CheckBounds6D, - nullptr, - kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN, - 0 /* flags */); + pthreadpool_parallelize_6d( + threadpool.get(), CheckBounds6D, nullptr, kParallelize6DRangeI, + kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, + kParallelize6DRangeM, kParallelize6DRangeN, /*flags=*/0); } TEST(Parallelize6D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_6d( - threadpool.get(), - CheckBounds6D, - nullptr, - kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN, - 0 /* flags */); + pthreadpool_parallelize_6d( + threadpool.get(), CheckBounds6D, nullptr, kParallelize6DRangeI, + kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, + kParallelize6DRangeM, kParallelize6DRangeN, /*flags=*/0); } -static void SetTrue6D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) { - const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); +static void SetTrue6D(std::atomic_bool* processed_indicators, size_t i, + size_t j, size_t k, size_t l, size_t m, size_t n) { + const size_t linear_idx = + ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) 
* + kParallelize6DRangeL + + l) * + kParallelize6DRangeM + + m) * + kParallelize6DRangeN + + n; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); } TEST(Parallelize6D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d( - threadpool.get(), - reinterpret_cast(SetTrue6D), - static_cast(indicators.data()), - kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize6DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed"; - } - } - } - } - } - } + std::vector indicators( + kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * + kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d( + threadpool.get(), reinterpret_cast(SetTrue6D), + static_cast(indicators.data()), kParallelize6DRangeI, + kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, + kParallelize6DRangeM, 
kParallelize6DRangeN, /*flags=*/0); + + for (size_t i = 0; i < kParallelize6DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + + k) * + kParallelize6DRangeL + + l) * + kParallelize6DRangeM + + m) * + kParallelize6DRangeN + + n; + EXPECT_TRUE( + indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") not processed"; + } + } + } + } + } + } } TEST(Parallelize6D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_6d( - threadpool.get(), - reinterpret_cast(SetTrue6D), - static_cast(indicators.data()), - kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize6DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n; - 
EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed"; - } - } - } - } - } - } -} - -static void Increment6D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) { - const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + std::vector indicators( + kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * + kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d( + threadpool.get(), reinterpret_cast(SetTrue6D), + static_cast(indicators.data()), kParallelize6DRangeI, + kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, + kParallelize6DRangeM, kParallelize6DRangeN, /*flags=*/0); + + for (size_t i = 0; i < kParallelize6DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + + k) * + kParallelize6DRangeL + + l) * + kParallelize6DRangeM + + m) * + kParallelize6DRangeN + + n; + EXPECT_TRUE( + indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") not processed"; + } + } + } + } + } + } +} + +static void Increment6D(std::atomic_int* processed_counters, 
size_t i, size_t j, + size_t k, size_t l, size_t m, size_t n) { + const size_t linear_idx = + ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * + kParallelize6DRangeL + + l) * + kParallelize6DRangeM + + m) * + kParallelize6DRangeN + + n; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize6D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d( - threadpool.get(), - reinterpret_cast(Increment6D), - static_cast(counters.data()), - kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize6DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } - } + std::vector counters( + kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * + kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + 
pthreadpool_parallelize_6d( + threadpool.get(), reinterpret_cast(Increment6D), + static_cast(counters.data()), kParallelize6DRangeI, + kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, + kParallelize6DRangeM, kParallelize6DRangeN, /*flags=*/0); + + for (size_t i = 0; i < kParallelize6DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + + k) * + kParallelize6DRangeL + + l) * + kParallelize6DRangeM + + m) * + kParallelize6DRangeN + + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } + } } TEST(Parallelize6D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_6d( - threadpool.get(), - reinterpret_cast(Increment6D), - static_cast(counters.data()), - kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize6DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DRangeL; l++) { - for (size_t m = 0; m < 
kParallelize6DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } - } + std::vector counters( + kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * + kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d( + threadpool.get(), reinterpret_cast(Increment6D), + static_cast(counters.data()), kParallelize6DRangeI, + kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, + kParallelize6DRangeM, kParallelize6DRangeN, /*flags=*/0); + + for (size_t i = 0; i < kParallelize6DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + + k) * + kParallelize6DRangeL + + l) * + kParallelize6DRangeM + + m) * + kParallelize6DRangeN + + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } + } } TEST(Parallelize6D, 
SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) { - pthreadpool_parallelize_6d( - threadpool.get(), - reinterpret_cast(Increment6D), - static_cast(counters.data()), - kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize6DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations6D << ")"; - } - } - } - } - } - } + std::vector counters( + kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * + kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) { + pthreadpool_parallelize_6d( + threadpool.get(), reinterpret_cast(Increment6D), + static_cast(counters.data()), 
kParallelize6DRangeI, +         kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, +         kParallelize6DRangeM, kParallelize6DRangeN, /*flags=*/0); +   } + +   for (size_t i = 0; i < kParallelize6DRangeI; i++) { +     for (size_t j = 0; j < kParallelize6DRangeJ; j++) { +       for (size_t k = 0; k < kParallelize6DRangeK; k++) { +         for (size_t l = 0; l < kParallelize6DRangeL; l++) { +           for (size_t m = 0; m < kParallelize6DRangeM; m++) { +             for (size_t n = 0; n < kParallelize6DRangeN; n++) { +               const size_t linear_idx = +                   ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + +                     k) * +                        kParallelize6DRangeL + +                    l) * +                       kParallelize6DRangeM + +                   m) * +                  kParallelize6DRangeN + n; +               EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), +                         kIncrementIterations6D) +                   << "Element (" << i << ", " << j << ", " << k << ", " << l +                   << ", " << m << ", " << n << ") was processed " +                   << counters[linear_idx].load(std::memory_order_relaxed) +                   << " times " +                   << "(expected: " << kIncrementIterations6D << ")"; +             } +           } +         } +       } +     } +   } } TEST(Parallelize6D, MultiThreadPoolEachItemProcessedMultipleTimes) { -   std::vector counters(kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); - -   auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); -   ASSERT_TRUE(threadpool.get()); - -   if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { -     GTEST_SKIP(); -   } - -   for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) { -     pthreadpool_parallelize_6d( -       threadpool.get(), -       reinterpret_cast(Increment6D), -       static_cast(counters.data()), -       kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN, -       0 /* flags */); -   } - -   for (size_t i = 0; i < kParallelize6DRangeI; i++) { -     for (size_t j = 0; j < kParallelize6DRangeJ; j++) { -       for (size_t k = 0; k < kParallelize6DRangeK; k++) { -         for (size_t l = 0; l <
kParallelize6DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + k) * kParallelize6DRangeL + l) * kParallelize6DRangeM + m) * kParallelize6DRangeN + n; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations6D << ")"; - } - } - } - } - } - } -} - -static void IncrementSame6D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); + std::vector counters( + kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * + kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) { + pthreadpool_parallelize_6d( + threadpool.get(), reinterpret_cast(Increment6D), + static_cast(counters.data()), kParallelize6DRangeI, + kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, + kParallelize6DRangeM, kParallelize6DRangeN, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize6DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DRangeJ + j) * kParallelize6DRangeK + + k) * + 
kParallelize6DRangeL + + l) * + kParallelize6DRangeM + + m) * + kParallelize6DRangeN + + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations6D) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations6D << ")"; + } + } + } + } + } + } +} + +static void IncrementSame6D(std::atomic_int* num_processed_items, size_t i, + size_t j, size_t k, size_t l, size_t m, size_t n) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); } TEST(Parallelize6D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_6d( - threadpool.get(), - reinterpret_cast(IncrementSame6D), - static_cast(&num_processed_items), - kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); -} - -static void WorkImbalance6D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - if (i == 0 && j == 0 && k == 0 && l == 0 && m == 0 && n == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN) { - 
std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d( + threadpool.get(), + reinterpret_cast(IncrementSame6D), + static_cast(&num_processed_items), kParallelize6DRangeI, + kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, + kParallelize6DRangeM, kParallelize6DRangeN, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * + kParallelize6DRangeL * kParallelize6DRangeM * + kParallelize6DRangeN); +} + +static void WorkImbalance6D(std::atomic_int* num_processed_items, size_t i, + size_t j, size_t k, size_t l, size_t m, size_t n) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + if (i == 0 && j == 0 && k == 0 && l == 0 && m == 0 && n == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * + kParallelize6DRangeL * kParallelize6DRangeM * + kParallelize6DRangeN) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize6D, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_6d( - threadpool.get(), - 
reinterpret_cast(WorkImbalance6D), - static_cast(&num_processed_items), - kParallelize6DRangeI, kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, kParallelize6DRangeM, kParallelize6DRangeN, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * kParallelize6DRangeL * kParallelize6DRangeM * kParallelize6DRangeN); + pthreadpool_parallelize_6d( + threadpool.get(), + reinterpret_cast(WorkImbalance6D), + static_cast(&num_processed_items), kParallelize6DRangeI, + kParallelize6DRangeJ, kParallelize6DRangeK, kParallelize6DRangeL, + kParallelize6DRangeM, kParallelize6DRangeN, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize6DRangeI * kParallelize6DRangeJ * kParallelize6DRangeK * + kParallelize6DRangeL * kParallelize6DRangeM * + kParallelize6DRangeN); } -static void ComputeNothing6DTile1D(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing6DTile1D(void*, size_t, size_t, size_t, size_t, + size_t, size_t, size_t) {} TEST(Parallelize6DTile1D, SingleThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_6d_tile_1d(threadpool.get(), - ComputeNothing6DTile1D, - nullptr, - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN, - 0 /* flags */); + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), ComputeNothing6DTile1D, nullptr, + kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, + kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, + kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, + kParallelize6DTile1DTileN, 
/*flags=*/0); } TEST(Parallelize6DTile1D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - ComputeNothing6DTile1D, - nullptr, - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN, - 0 /* flags */); + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), ComputeNothing6DTile1D, nullptr, + kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, + kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, + kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, + kParallelize6DTile1DTileN, /*flags=*/0); } -static void CheckBounds6DTile1D(void*, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) { - EXPECT_LT(i, kParallelize6DTile1DRangeI); - EXPECT_LT(j, kParallelize6DTile1DRangeJ); - EXPECT_LT(k, kParallelize6DTile1DRangeK); - EXPECT_LT(l, kParallelize6DTile1DRangeL); - EXPECT_LT(m, kParallelize6DTile1DRangeM); - EXPECT_LT(start_n, kParallelize6DTile1DRangeN); - EXPECT_LE(start_n + tile_n, kParallelize6DTile1DRangeN); +static void CheckBounds6DTile1D(void*, size_t i, size_t j, size_t k, size_t l, + size_t m, size_t start_n, size_t tile_n) { + EXPECT_LT(i, kParallelize6DTile1DRangeI); + EXPECT_LT(j, kParallelize6DTile1DRangeJ); + EXPECT_LT(k, kParallelize6DTile1DRangeK); + EXPECT_LT(l, kParallelize6DTile1DRangeL); + EXPECT_LT(m, kParallelize6DTile1DRangeM); + EXPECT_LT(start_n, kParallelize6DTile1DRangeN); + EXPECT_LE(start_n + tile_n, kParallelize6DTile1DRangeN); } 
TEST(Parallelize6DTile1D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - CheckBounds6DTile1D, - nullptr, - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN, - 0 /* flags */); + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), CheckBounds6DTile1D, nullptr, + kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, + kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, + kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, + kParallelize6DTile1DTileN, /*flags=*/0); } TEST(Parallelize6DTile1D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - CheckBounds6DTile1D, - nullptr, - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN, - 0 /* flags */); + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), CheckBounds6DTile1D, nullptr, + kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, + kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, + kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, + kParallelize6DTile1DTileN, /*flags=*/0); } -static void CheckTiling6DTile1D(void*, 
size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) { - EXPECT_GT(tile_n, 0); - EXPECT_LE(tile_n, kParallelize6DTile1DTileN); - EXPECT_EQ(start_n % kParallelize6DTile1DTileN, 0); - EXPECT_EQ(tile_n, std::min(kParallelize6DTile1DTileN, kParallelize6DTile1DRangeN - start_n)); +static void CheckTiling6DTile1D(void*, size_t i, size_t j, size_t k, size_t l, + size_t m, size_t start_n, size_t tile_n) { + EXPECT_GT(tile_n, 0); + EXPECT_LE(tile_n, kParallelize6DTile1DTileN); + EXPECT_EQ(start_n % kParallelize6DTile1DTileN, 0); + EXPECT_EQ(tile_n, std::min(kParallelize6DTile1DTileN, + kParallelize6DTile1DRangeN - start_n)); } TEST(Parallelize6DTile1D, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - CheckTiling6DTile1D, - nullptr, - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN, - 0 /* flags */); + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), CheckTiling6DTile1D, nullptr, + kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, + kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, + kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, + kParallelize6DTile1DTileN, /*flags=*/0); } TEST(Parallelize6DTile1D, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - CheckTiling6DTile1D, - nullptr, - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, 
kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN, - 0 /* flags */); -} - -static void SetTrue6DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) { - for (size_t n = start_n; n < start_n + tile_n; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN + n; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), CheckTiling6DTile1D, nullptr, + kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, + kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, + kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, + kParallelize6DTile1DTileN, /*flags=*/0); +} + +static void SetTrue6DTile1D(std::atomic_bool* processed_indicators, size_t i, + size_t j, size_t k, size_t l, size_t m, + size_t start_n, size_t tile_n) { + for (size_t n = start_n; n < start_n + tile_n; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + + k) * + kParallelize6DTile1DRangeL + + l) * + kParallelize6DTile1DRangeM + + m) * + kParallelize6DTile1DRangeN + + n; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } } TEST(Parallelize6DTile1D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - 
ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - reinterpret_cast(SetTrue6DTile1D), - static_cast(indicators.data()), - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN + n; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed"; - } - } - } - } - } - } + std::vector indicators( + kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * + kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * + kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue6DTile1D), + static_cast(indicators.data()), kParallelize6DTile1DRangeI, + kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, + kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, + kParallelize6DTile1DRangeN, kParallelize6DTile1DTileN, /*flags=*/0); + + for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { + for 
(size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile1DRangeJ + j) * + kParallelize6DTile1DRangeK + + k) * + kParallelize6DTile1DRangeL + + l) * + kParallelize6DTile1DRangeM + + m) * + kParallelize6DTile1DRangeN + + n; + EXPECT_TRUE( + indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") not processed"; + } + } + } + } + } + } } TEST(Parallelize6DTile1D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - reinterpret_cast(SetTrue6DTile1D), - static_cast(indicators.data()), - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN + n; - 
EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed"; - } - } - } - } - } - } -} - -static void Increment6DTile1D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) { - for (size_t n = start_n; n < start_n + tile_n; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN + n; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } + std::vector indicators( + kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * + kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * + kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), + reinterpret_cast(SetTrue6DTile1D), + static_cast(indicators.data()), kParallelize6DTile1DRangeI, + kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, + kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, + kParallelize6DTile1DRangeN, kParallelize6DTile1DTileN, /*flags=*/0); + + for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile1DRangeJ + j) * + kParallelize6DTile1DRangeK + + k) * + kParallelize6DTile1DRangeL + + l) * + kParallelize6DTile1DRangeM + + m) * + 
kParallelize6DTile1DRangeN + + n; + EXPECT_TRUE( + indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") not processed"; + } + } + } + } + } + } +} + +static void Increment6DTile1D(std::atomic_int* processed_counters, size_t i, + size_t j, size_t k, size_t l, size_t m, + size_t start_n, size_t tile_n) { + for (size_t n = start_n; n < start_n + tile_n; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + + k) * + kParallelize6DTile1DRangeL + + l) * + kParallelize6DTile1DRangeM + + m) * + kParallelize6DTile1DRangeN + + n; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize6DTile1D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment6DTile1D), - static_cast(counters.data()), - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * 
kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN + n; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } - } + std::vector counters( + kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * + kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * + kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment6DTile1D), + static_cast(counters.data()), kParallelize6DTile1DRangeI, + kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, + kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, + kParallelize6DTile1DRangeN, kParallelize6DTile1DTileN, /*flags=*/0); + + for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile1DRangeJ + j) * + kParallelize6DTile1DRangeK + + k) * + kParallelize6DTile1DRangeL + + l) * + kParallelize6DTile1DRangeM + + m) * + kParallelize6DTile1DRangeN + + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } + } } TEST(Parallelize6DTile1D, MultiThreadPoolEachItemProcessedOnce) { - std::vector 
counters(kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment6DTile1D), - static_cast(counters.data()), - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN + n; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } - } + std::vector counters( + kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * + kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * + kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + 
pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment6DTile1D), + static_cast(counters.data()), kParallelize6DTile1DRangeI, + kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, + kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, + kParallelize6DTile1DRangeN, kParallelize6DTile1DTileN, /*flags=*/0); + + for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile1DRangeJ + j) * + kParallelize6DTile1DRangeK + + k) * + kParallelize6DTile1DRangeL + + l) * + kParallelize6DTile1DRangeM + + m) * + kParallelize6DTile1DRangeN + + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } + } } TEST(Parallelize6DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) { - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment6DTile1D), - static_cast(counters.data()), - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - 
kParallelize6DTile1DTileN, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN + n; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations6D << ")"; - } - } - } - } - } - } + std::vector counters( + kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * + kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * + kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) { + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment6DTile1D), + static_cast(counters.data()), kParallelize6DTile1DRangeI, + kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, + kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, + kParallelize6DTile1DRangeN, kParallelize6DTile1DTileN, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { + for (size_t m = 0; m < 
kParallelize6DTile1DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile1DRangeJ + j) * + kParallelize6DTile1DRangeK + + k) * + kParallelize6DTile1DRangeL + + l) * + kParallelize6DTile1DRangeM + + m) * + kParallelize6DTile1DRangeN + + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations6D) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations6D << ")"; + } + } + } + } + } + } } TEST(Parallelize6DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) { - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - reinterpret_cast(Increment6DTile1D), - static_cast(counters.data()), - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile1DRangeJ + 
j) * kParallelize6DTile1DRangeK + k) * kParallelize6DTile1DRangeL + l) * kParallelize6DTile1DRangeM + m) * kParallelize6DTile1DRangeN + n; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations6D << ")"; - } - } - } - } - } - } -} - -static void IncrementSame6DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) { - for (size_t n = start_n; n < start_n + tile_n; n++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } + std::vector counters( + kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * + kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * + kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) { + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), + reinterpret_cast(Increment6DTile1D), + static_cast(counters.data()), kParallelize6DTile1DRangeI, + kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, + kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, + kParallelize6DTile1DRangeN, kParallelize6DTile1DTileN, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize6DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile1DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile1DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile1DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile1DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile1DRangeN; n++) { + const size_t linear_idx = + ((((i 
* kParallelize6DTile1DRangeJ + j) * + kParallelize6DTile1DRangeK + + k) * + kParallelize6DTile1DRangeL + + l) * + kParallelize6DTile1DRangeM + + m) * + kParallelize6DTile1DRangeN + + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations6D) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations6D << ")"; + } + } + } + } + } + } +} + +static void IncrementSame6DTile1D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t k, size_t l, + size_t m, size_t start_n, size_t tile_n) { + for (size_t n = start_n; n < start_n + tile_n; n++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } } TEST(Parallelize6DTile1D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - reinterpret_cast(IncrementSame6DTile1D), - static_cast(&num_processed_items), - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); -} - -static void WorkImbalance6DTile1D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t m, size_t start_n, size_t tile_n) { - num_processed_items->fetch_add(tile_n, std::memory_order_relaxed); - if (i == 0 && j == 0 && k 
== 0 && l == 0 && m == 0 && start_n == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), + reinterpret_cast(IncrementSame6DTile1D), + static_cast(&num_processed_items), kParallelize6DTile1DRangeI, + kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, + kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, + kParallelize6DTile1DRangeN, kParallelize6DTile1DTileN, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * + kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * + kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); +} + +static void WorkImbalance6DTile1D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t k, size_t l, + size_t m, size_t start_n, size_t tile_n) { + num_processed_items->fetch_add(tile_n, std::memory_order_relaxed); + if (i == 0 && j == 0 && k == 0 && l == 0 && m == 0 && start_n == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * + kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * + kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize6DTile1D, MultiThreadPoolWorkStealing) { - std::atomic_int 
num_processed_items = ATOMIC_VAR_INIT(0); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_6d_tile_1d( - threadpool.get(), - reinterpret_cast(WorkImbalance6DTile1D), - static_cast(&num_processed_items), - kParallelize6DTile1DRangeI, kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, kParallelize6DTile1DRangeN, - kParallelize6DTile1DTileN, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); + pthreadpool_parallelize_6d_tile_1d( + threadpool.get(), + reinterpret_cast(WorkImbalance6DTile1D), + static_cast(&num_processed_items), kParallelize6DTile1DRangeI, + kParallelize6DTile1DRangeJ, kParallelize6DTile1DRangeK, + kParallelize6DTile1DRangeL, kParallelize6DTile1DRangeM, + kParallelize6DTile1DRangeN, kParallelize6DTile1DTileN, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize6DTile1DRangeI * kParallelize6DTile1DRangeJ * + kParallelize6DTile1DRangeK * kParallelize6DTile1DRangeL * + kParallelize6DTile1DRangeM * kParallelize6DTile1DRangeN); } -static void ComputeNothing6DTile2D(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t) { -} +static void ComputeNothing6DTile2D(void*, size_t, size_t, size_t, size_t, + size_t, size_t, size_t, size_t) {} TEST(Parallelize6DTile2D, SingleThreadPoolCompletes) { - auto_pthreadpool_t 
threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_6d_tile_2d(threadpool.get(), - ComputeNothing6DTile2D, - nullptr, - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, - 0 /* flags */); + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), ComputeNothing6DTile2D, nullptr, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, + kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, + kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, /*flags=*/0); } TEST(Parallelize6DTile2D, MultiThreadPoolCompletes) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - ComputeNothing6DTile2D, - nullptr, - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, - 0 /* flags */); -} - -static void CheckBounds6DTile2D(void*, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) { - EXPECT_LT(i, kParallelize6DTile2DRangeI); - EXPECT_LT(j, kParallelize6DTile2DRangeJ); - EXPECT_LT(k, kParallelize6DTile2DRangeK); - EXPECT_LT(l, kParallelize6DTile2DRangeL); - EXPECT_LT(start_m, kParallelize6DTile2DRangeM); - EXPECT_LT(start_n, kParallelize6DTile2DRangeN); - EXPECT_LE(start_m + tile_m, kParallelize6DTile2DRangeM); - EXPECT_LE(start_n + tile_n, 
kParallelize6DTile2DRangeN); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), ComputeNothing6DTile2D, nullptr, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, + kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, + kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, /*flags=*/0); +} + +static void CheckBounds6DTile2D(void*, size_t i, size_t j, size_t k, size_t l, + size_t start_m, size_t start_n, size_t tile_m, + size_t tile_n) { + EXPECT_LT(i, kParallelize6DTile2DRangeI); + EXPECT_LT(j, kParallelize6DTile2DRangeJ); + EXPECT_LT(k, kParallelize6DTile2DRangeK); + EXPECT_LT(l, kParallelize6DTile2DRangeL); + EXPECT_LT(start_m, kParallelize6DTile2DRangeM); + EXPECT_LT(start_n, kParallelize6DTile2DRangeN); + EXPECT_LE(start_m + tile_m, kParallelize6DTile2DRangeM); + EXPECT_LE(start_n + tile_n, kParallelize6DTile2DRangeN); } TEST(Parallelize6DTile2D, SingleThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - CheckBounds6DTile2D, - nullptr, - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, - 0 /* flags */); + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), CheckBounds6DTile2D, nullptr, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, + kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, + kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + 
kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, /*flags=*/0); } TEST(Parallelize6DTile2D, MultiThreadPoolAllItemsInBounds) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - CheckBounds6DTile2D, - nullptr, - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, - 0 /* flags */); + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), CheckBounds6DTile2D, nullptr, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, + kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, + kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, /*flags=*/0); } -static void CheckTiling6DTile2D(void*, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) { - EXPECT_GT(tile_m, 0); - EXPECT_LE(tile_m, kParallelize6DTile2DTileM); - EXPECT_EQ(start_m % kParallelize6DTile2DTileM, 0); - EXPECT_EQ(tile_m, std::min(kParallelize6DTile2DTileM, kParallelize6DTile2DRangeM - start_m)); +static void CheckTiling6DTile2D(void*, size_t i, size_t j, size_t k, size_t l, + size_t start_m, size_t start_n, size_t tile_m, + size_t tile_n) { + EXPECT_GT(tile_m, 0); + EXPECT_LE(tile_m, kParallelize6DTile2DTileM); + EXPECT_EQ(start_m % kParallelize6DTile2DTileM, 0); + EXPECT_EQ(tile_m, std::min(kParallelize6DTile2DTileM, + kParallelize6DTile2DRangeM - start_m)); - EXPECT_GT(tile_n, 0); - EXPECT_LE(tile_n, kParallelize6DTile2DTileN); - 
EXPECT_EQ(start_n % kParallelize6DTile2DTileN, 0); - EXPECT_EQ(tile_n, std::min(kParallelize6DTile2DTileN, kParallelize6DTile2DRangeN - start_n)); + EXPECT_GT(tile_n, 0); + EXPECT_LE(tile_n, kParallelize6DTile2DTileN); + EXPECT_EQ(start_n % kParallelize6DTile2DTileN, 0); + EXPECT_EQ(tile_n, std::min(kParallelize6DTile2DTileN, + kParallelize6DTile2DRangeN - start_n)); } TEST(Parallelize6DTile2D, SingleThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - CheckTiling6DTile2D, - nullptr, - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, - 0 /* flags */); + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), CheckTiling6DTile2D, nullptr, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, + kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, + kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, /*flags=*/0); } TEST(Parallelize6DTile2D, MultiThreadPoolUniformTiling) { - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - CheckTiling6DTile2D, - nullptr, - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, - 0 /* flags */); -} - -static void SetTrue6DTile2D(std::atomic_bool* processed_indicators, size_t i, 
size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) { - for (size_t m = start_m; m < start_m + tile_m; m++) { - for (size_t n = start_n; n < start_n + tile_n; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; - processed_indicators[linear_idx].store(true, std::memory_order_relaxed); - } - } + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), CheckTiling6DTile2D, nullptr, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, + kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, + kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, /*flags=*/0); +} + +static void SetTrue6DTile2D(std::atomic_bool* processed_indicators, size_t i, + size_t j, size_t k, size_t l, size_t start_m, + size_t start_n, size_t tile_m, size_t tile_n) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + for (size_t n = start_n; n < start_n + tile_n; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + + k) * + kParallelize6DTile2DRangeL + + l) * + kParallelize6DTile2DRangeM + + m) * + kParallelize6DTile2DRangeN + + n; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } } TEST(Parallelize6DTile2D, SingleThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - 
pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - reinterpret_cast(SetTrue6DTile2D), - static_cast(indicators.data()), - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed"; - } - } - } - } - } - } + std::vector indicators( + kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * + kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * + kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + reinterpret_cast(SetTrue6DTile2D), + static_cast(indicators.data()), kParallelize6DTile2DRangeI, + kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, + kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, + kParallelize6DTile2DRangeN, kParallelize6DTile2DTileM, + kParallelize6DTile2DTileN, /*flags=*/0); + + for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { + for (size_t k = 0; k < 
kParallelize6DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile2DRangeJ + j) * + kParallelize6DTile2DRangeK + + k) * + kParallelize6DTile2DRangeL + + l) * + kParallelize6DTile2DRangeM + + m) * + kParallelize6DTile2DRangeN + + n; + EXPECT_TRUE( + indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") not processed"; + } + } + } + } + } + } } TEST(Parallelize6DTile2D, MultiThreadPoolAllItemsProcessed) { - std::vector indicators(kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - reinterpret_cast(SetTrue6DTile2D), - static_cast(indicators.data()), - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * 
kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; - EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed"; - } - } - } - } - } - } -} - -static void Increment6DTile2D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) { - for (size_t m = start_m; m < start_m + tile_m; m++) { - for (size_t n = start_n; n < start_n + tile_n; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; - processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); - } - } + std::vector indicators( + kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * + kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * + kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + reinterpret_cast(SetTrue6DTile2D), + static_cast(indicators.data()), kParallelize6DTile2DRangeI, + kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, + kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, + kParallelize6DTile2DRangeN, kParallelize6DTile2DTileM, + kParallelize6DTile2DTileN, /*flags=*/0); + + for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { + const size_t 
linear_idx = + ((((i * kParallelize6DTile2DRangeJ + j) * + kParallelize6DTile2DRangeK + + k) * + kParallelize6DTile2DRangeL + + l) * + kParallelize6DTile2DRangeM + + m) * + kParallelize6DTile2DRangeN + + n; + EXPECT_TRUE( + indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") not processed"; + } + } + } + } + } + } +} + +static void Increment6DTile2D(std::atomic_int* processed_counters, size_t i, + size_t j, size_t k, size_t l, size_t start_m, + size_t start_n, size_t tile_m, size_t tile_n) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + for (size_t n = start_n; n < start_n + tile_n; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + + k) * + kParallelize6DTile2DRangeL + + l) * + kParallelize6DTile2DRangeM + + m) * + kParallelize6DTile2DRangeN + + n; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } } TEST(Parallelize6DTile2D, SingleThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment6DTile2D), - static_cast(counters.data()), - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile2DRangeL; 
l++) { - for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } - } + std::vector counters( + kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * + kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * + kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment6DTile2D), + static_cast(counters.data()), kParallelize6DTile2DRangeI, + kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, + kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, + kParallelize6DTile2DRangeN, kParallelize6DTile2DTileM, + kParallelize6DTile2DTileN, /*flags=*/0); + + for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile2DRangeJ + j) * + kParallelize6DTile2DRangeK + + k) * + kParallelize6DTile2DRangeL + + l) * + kParallelize6DTile2DRangeM + + m) * + kParallelize6DTile2DRangeN + + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << 
", " << k << ", " << l + << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } + } } TEST(Parallelize6DTile2D, MultiThreadPoolEachItemProcessedOnce) { - std::vector counters(kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment6DTile2D), - static_cast(counters.data()), - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, - 0 /* flags */); - - for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; - } - } - } - } - } - } + std::vector counters( + kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * + kParallelize6DTile2DRangeK * 
kParallelize6DTile2DRangeL * + kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment6DTile2D), + static_cast(counters.data()), kParallelize6DTile2DRangeI, + kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, + kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, + kParallelize6DTile2DRangeN, kParallelize6DTile2DTileM, + kParallelize6DTile2DTileN, /*flags=*/0); + + for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile2DRangeJ + j) * + kParallelize6DTile2DRangeK + + k) * + kParallelize6DTile2DRangeL + + l) * + kParallelize6DTile2DRangeM + + m) * + kParallelize6DTile2DRangeN + + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times (expected: 1)"; + } + } + } + } + } + } } TEST(Parallelize6DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - for (size_t iteration = 0; iteration < kIncrementIterations6D; 
iteration++) { - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment6DTile2D), - static_cast(counters.data()), - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, - 0 /* flags */); - } - - for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations6D << ")"; - } - } - } - } - } - } + std::vector counters( + kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * + kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * + kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) { + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment6DTile2D), + static_cast(counters.data()), kParallelize6DTile2DRangeI, + kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, + kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, + 
kParallelize6DTile2DRangeN, kParallelize6DTile2DTileM, + kParallelize6DTile2DTileN, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile2DRangeJ + j) * + kParallelize6DTile2DRangeK + + k) * + kParallelize6DTile2DRangeL + + l) * + kParallelize6DTile2DRangeM + + m) * + kParallelize6DTile2DRangeN + + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations6D) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations6D << ")"; + } + } + } + } + } + } } TEST(Parallelize6DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { - std::vector counters(kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) { - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - reinterpret_cast(Increment6DTile2D), - static_cast(counters.data()), - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, - 0 /* flags */); - } - - for (size_t i = 0; i < 
kParallelize6DTile2DRangeI; i++) { - for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { - for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { - for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { - for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { - for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { - const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; - EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D) - << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " - << counters[linear_idx].load(std::memory_order_relaxed) << " times " - << "(expected: " << kIncrementIterations6D << ")"; - } - } - } - } - } - } -} - -static void IncrementSame6DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) { - for (size_t m = start_m; m < start_m + tile_m; m++) { - for (size_t n = start_n; n < start_n + tile_n; n++) { - num_processed_items->fetch_add(1, std::memory_order_relaxed); - } - } + std::vector counters( + kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * + kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * + kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) { + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + reinterpret_cast(Increment6DTile2D), + static_cast(counters.data()), kParallelize6DTile2DRangeI, + kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, + kParallelize6DTile2DRangeL, 
kParallelize6DTile2DRangeM, + kParallelize6DTile2DRangeN, kParallelize6DTile2DTileM, + kParallelize6DTile2DTileN, /*flags=*/0); + } + + for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { + const size_t linear_idx = + ((((i * kParallelize6DTile2DRangeJ + j) * + kParallelize6DTile2DRangeK + + k) * + kParallelize6DTile2DRangeL + + l) * + kParallelize6DTile2DRangeM + + m) * + kParallelize6DTile2DRangeN + + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), + kIncrementIterations6D) + << "Element (" << i << ", " << j << ", " << k << ", " << l + << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) + << " times " + << "(expected: " << kIncrementIterations6D << ")"; + } + } + } + } + } + } +} + +static void IncrementSame6DTile2D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t k, size_t l, + size_t start_m, size_t start_n, size_t tile_m, + size_t tile_n) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + for (size_t n = start_n; n < start_n + tile_n; n++) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + } + } } TEST(Parallelize6DTile2D, MultiThreadPoolHighContention) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - reinterpret_cast(IncrementSame6DTile2D), - static_cast(&num_processed_items), - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, 
kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); -} - -static void WorkImbalance6DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) { - num_processed_items->fetch_add(tile_m * tile_n, std::memory_order_relaxed); - if (i == 0 && j == 0 && k == 0 && l == 0 && start_m == 0 && start_n == 0) { - /* Spin-wait until all items are computed */ - while (num_processed_items->load(std::memory_order_relaxed) != kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN) { - std::atomic_thread_fence(std::memory_order_acquire); - } - } + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + reinterpret_cast(IncrementSame6DTile2D), + static_cast(&num_processed_items), kParallelize6DTile2DRangeI, + kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, + kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, + kParallelize6DTile2DRangeN, kParallelize6DTile2DTileM, + kParallelize6DTile2DTileN, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * + kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * + kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); +} + +static void 
WorkImbalance6DTile2D(std::atomic_int* num_processed_items, + size_t i, size_t j, size_t k, size_t l, + size_t start_m, size_t start_n, size_t tile_m, + size_t tile_n) { + num_processed_items->fetch_add(tile_m * tile_n, std::memory_order_relaxed); + if (i == 0 && j == 0 && k == 0 && l == 0 && start_m == 0 && start_n == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != + kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * + kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * + kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } } TEST(Parallelize6DTile2D, MultiThreadPoolWorkStealing) { - std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - - auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); - ASSERT_TRUE(threadpool.get()); - - if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { - GTEST_SKIP(); - } - - pthreadpool_parallelize_6d_tile_2d( - threadpool.get(), - reinterpret_cast(WorkImbalance6DTile2D), - static_cast(&num_processed_items), - kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, - kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, - 0 /* flags */); - EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + reinterpret_cast(WorkImbalance6DTile2D), + 
static_cast(&num_processed_items), kParallelize6DTile2DRangeI, + kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, + kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, + kParallelize6DTile2DRangeN, kParallelize6DTile2DTileM, + kParallelize6DTile2DTileN, /*flags=*/0); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), + kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * + kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * + kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); }