diff --git a/.circleci/config.yml b/.circleci/config.yml index 6778c353a4..fefe9da46f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,7 +1,12 @@ -version: 2.0 +version: 2.1 jobs: - macos_x86_64_xcode10_cxx11_release: + + ### + ### macOS 10.14.4 (Mojave), XCode 10.3.0 (x86_64) + ### + + macos_x86_64_xcode-103_cxx11_release: macos: xcode: "10.3.0" working_directory: ~/gismo @@ -14,12 +19,16 @@ jobs: - checkout - run: name: Configure G+Smo on MacOS - command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="macos_x86_64_xcode10_cxx11_release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=11 -DGISMO_WITH_ONURBS=ON + command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="MacOS-x86_64-XCode-10.3-cxx11-Release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=11 -DGISMO_WITH_ONURBS=ON - run: name: Build and test G+Smo on MacOS command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS - macos_x86_64_xcode11_cxx14_release: + ### + ### macOS 10.15.5 (Catalina), XCode 11.7.0 (x86_64) + ### + + macos_x86_64_xcode-117_cxx14_release: macos: xcode: "11.7.0" working_directory: ~/gismo @@ -32,12 +41,16 @@ jobs: - checkout - run: name: Configure G+Smo on MacOS - command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="macos_x86_64_xcode11_cxx14_release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=14 -DGISMO_WITH_ONURBS=ON + command: cmake .
-DCMAKE_QUIET=ON -DBUILDNAME="MacOS-x86_64-XCode-11.7-cxx14-Release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=14 -DGISMO_WITH_ONURBS=ON - run: name: Build and test G+Smo on MacOS command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS - macos_x86_64_xcode12_cxx17_release: + ### + ### macOS 11.4.0 (Big Sur), XCode 12.5.1 (x86_64) + ### + + macos_x86_64_xcode-125_cxx17_release: macos: xcode: "12.5.1" working_directory: ~/gismo @@ -50,15 +63,86 @@ jobs: - checkout - run: name: Configure G+Smo on MacOS - command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="macos_x86_64_xcode12_cxx17_release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=17 -DGISMO_WITH_ONURBS=ON + command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="MacOS-x86_64-XCode-12.5-cxx17-Release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=17 -DGISMO_WITH_ONURBS=ON + - run: + name: Build and test G+Smo on MacOS + command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS + + ### + ### macOS 11.6.2 (Big Sur), XCode 13.2.1 (x86_64) + ### + + macos_x86_64_xcode-132_cxx20_release: + macos: + xcode: "13.2.1" + working_directory: ~/gismo + environment: + MAKEJOBS: 4 + steps: + - run: + name: Install dependencies + command: brew install cmake + - checkout + - run: + name: Configure G+Smo on MacOS + command: cmake .
-DCMAKE_QUIET=ON -DBUILDNAME="MacOS-x86_64-XCode-13.2-cxx20-Release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=20 -DGISMO_WITH_ONURBS=ON - run: name: Build and test G+Smo on MacOS command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS + ### + ### Ubuntu 20.04 Linux, GCC 9.3 (aarch64) + ### + + linux_aarch64_gcc9_cxx11_release: + machine: + image: ubuntu-2004:202101-01 + resource_class: arm.medium + working_directory: ~/gismo + environment: + MAKEJOBS: 4 + steps: + - run: + name: Install dependencies + command: sudo apt-get update -y && sudo apt-get install cmake gcc g++ -y + - checkout + - run: + name: Configure G+Smo on Linux + command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="Linux-aarch64-gcc9-cxx11-Release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=11 -DGISMO_WITH_ONURBS=ON + - run: + name: Build and test G+Smo on Linux + command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS + + ### + ### Ubuntu 20.04 Linux, Clang 10 (aarch64) + ### + + linux_aarch64_clang10_cxx11_release: + machine: + image: ubuntu-2004:202101-01 + resource_class: arm.medium + working_directory: ~/gismo + environment: + MAKEJOBS: 4 + steps: + - run: + name: Install dependencies + command: sudo apt-get update -y && sudo apt-get install cmake clang -y + - checkout + - run: + name: Configure G+Smo on Linux + command: cmake .
-DCMAKE_QUIET=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DBUILDNAME="Linux-aarch64-clang10-cxx11-Release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=11 -DGISMO_WITH_ONURBS=ON + - run: + name: Build and test G+Smo on Linux + command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS + workflows: - version: 2 + version: 2.1 build: jobs: - - macos_x86_64_xcode10_cxx11_release - - macos_x86_64_xcode11_cxx14_release - - macos_x86_64_xcode12_cxx17_release + - macos_x86_64_xcode-103_cxx11_release + - macos_x86_64_xcode-117_cxx14_release + - macos_x86_64_xcode-125_cxx17_release + - macos_x86_64_xcode-132_cxx20_release + - linux_aarch64_gcc9_cxx11_release + - linux_aarch64_clang10_cxx11_release diff --git a/.github/workflows/gismo.yml b/.github/workflows/gismo.yml index 3d3afb8baf..da8ca3216b 100644 --- a/.github/workflows/gismo.yml +++ b/.github/workflows/gismo.yml @@ -36,4 +36,4 @@ jobs: # Note the current convention is to use the -S and -B options here to specify source # and build directories, but this is only available with CMake 3.13 and higher. # The CMake binaries on the Github Actions machines are (as of this writing) 3.12 - run: ctest -S gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="actions_$GITHUB_RUN_NUMBER" -D CTEST_SITE="${{ matrix.os }}_[actions]" -D CMAKE_ARGS="-DCMAKE_BUILD_TYPE=$BUILD_TYPE;-DCMAKE_CXX_STANDARD=11;-DGISMO_EXTRA_DEBUG=ON;-DGISMO_WITH_ONURBS=ON;-DGISMO_BUILD_UNITTESTS=ON" -Q + run: ctest -S gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="actions_$GITHUB_RUN_NUMBER" -D CTEST_SITE="${{ matrix.os }}_[actions]" -D CMAKE_ARGS="-DCMAKE_BUILD_TYPE=$BUILD_TYPE;-DCMAKE_CXX_STANDARD=14;-DGISMO_EXTRA_DEBUG=ON;-DGISMO_WITH_ONURBS=ON;-DGISMO_BUILD_UNITTESTS=ON" -Q diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2b704276d9..97086fb285 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,6 +15,8 @@ # 3.
Building and extensive testing of commits to branch 'ci_test' [to be added] # # 4. Coverity scan of commits to branch 'coverity_scan' [to be added] +# +# 5. Test installation and deployment ################################################################################ ################################################################################ @@ -22,16 +24,16 @@ # 'coverity_scan' and sending of the ctest results to the CDASH server ################################################################################ -#------------------------------------------------------------------------------- -# Clang 8-13, C++11,14,17,20,23 -#------------------------------------------------------------------------------- - -# Clang 8, C++11, Release -linux_x86_64_clang8_cxx11_release_double_int32t: +.test:linux:base: tags: - linux stage: test - image: ${CI_DEPENDENCY_PROXY_GROUP_IMAGE_PREFIX}/silkeh/clang:8 + image: $CI_DEPENDENCY_PROXY_GROUP_IMAGE_PREFIX/$IMAGE + variables: + BUILD_TYPE: "Release" + GENERATOR: "Ninja" + GISMO_SUBMODULES: "''" + LABELS_FOR_SUBPROJECTS: "'gismo;examples;unittests;doc-snippets'" script: - apt-get update -y - apt-get install cmake ninja-build -y @@ -101,23 +103,17 @@ linux_x86_64_clang13_cxx23_release_float_int: - coverity_scan #------------------------------------------------------------------------------- -# GCC 8-12, C++11,14,17,20 +# Clang 8-13, C++11,14,17,20 #------------------------------------------------------------------------------- -# GCC 8, C++11, Release -linux_x86_64_gcc8_cxx11_release_mpreal_long: - tags: - - linux - stage: test - image: ${CI_DEPENDENCY_PROXY_GROUP_IMAGE_PREFIX}/gcc:8 - script: - - apt-get update -y - - apt-get install cmake libmpfr-dev -y - - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D 
CTEST_CMAKE_GENERATOR="Unix Makefiles" -D CNAME=/usr/local/bin/gcc -D CXXNAME=/usr/local/bin/g++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=11;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=mpfr::mpreal;-DGISMO_INDEX_TYPE=long;-DGISMO_WITH_ONURBS=ON' -Q - except: - - external_pull_requests - - ci_test - - coverity_scan +# Clang 8, C++11, Release +linux_x86_64_clang8_cxx11_release_double_int32t: + extends: .test:linux:base + variables: + IMAGE: "silkeh/clang:8" + CNAME: "/usr/local/bin/clang" + CXXNAME: "/usr/local/bin/clang++" + CMAKE_ARGS: "'-DCMAKE_CXX_STANDARD=11;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=double;-DGISMO_INDEX_TYPE=int32_t;-DGISMO_WITH_ONURBS=ON'" # GCC 9, C++14, Release linux_x86_64_gcc9_cxx14_release_longdouble_int64t: @@ -150,51 +146,101 @@ linux_x86_64_gcc10_cxx17_release_double_int32t: - ci_test - coverity_scan -# GCC 11, C++20, Release -linux_x86_64_gcc11_cxx20_release_float_int: - tags: - - linux - stage: test - image: ${CI_DEPENDENCY_PROXY_GROUP_IMAGE_PREFIX}/gcc:11 - script: - - apt-get update -y - - apt-get install cmake ninja-build -y - - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR=Ninja -D CNAME=/usr/local/bin/gcc -D CXXNAME=/usr/local/bin/g++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=20;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=float;-DGISMO_INDEX_TYPE=int;-DGISMO_WITH_ONURBS=ON' -D DO_TESTS=FALSE -Q - except: - - external_pull_requests - - ci_test - - coverity_scan - -# GCC 12 (latest), C++23, Release -linux_x86_64_gcc12_cxx23_release_float_int: - tags: - - linux - stage: 
test - image: ${CI_DEPENDENCY_PROXY_GROUP_IMAGE_PREFIX}/gcc:latest - script: - - apt-get update -y - - apt-get install cmake ninja-build -y - - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR=Ninja -D CNAME=/usr/local/bin/gcc -D CXXNAME=/usr/local/bin/g++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=20;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=float;-DGISMO_INDEX_TYPE=int;-DGISMO_WITH_ONURBS=ON' -D DO_TESTS=FALSE -Q - except: - - external_pull_requests - - ci_test - - coverity_scan - -################################################################################ -# 2. Building and testing of external pull requests (PRs) -################################################################################ +# # Clang 10, C++17, Release +# linux_x86_64_clang10_cxx17_release_mpreal_long: +# tags: +# - linux +# stage: test +# image: silkeh/clang:10 +# script: +# - apt-get update -y +# - apt-get install cmake libmpfr-dev ninja-build -y +# - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR=Ninja -D CNAME=/usr/local/bin/clang -D CXXNAME=/usr/local/bin/clang++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=17;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=mpfr::mpreal;-DGISMO_INDEX_TYPE=long;-DGISMO_WITH_ONURBS=ON' -Q +# except: +# - external_pull_requests +# - ci_test +# - coverity_scan 
-################################################################################ -# 3. Building and extensive testing of commits to branch 'ci_test' -################################################################################ +# # Clang 11, C++20, Release +# linux_x86_64_clang11_cxx20_release_mpq_long: +# tags: +# - linux +# stage: test +# image: silkeh/clang:11 +# script: +# - apt-get update -y +# - apt-get install cmake libgmp-dev -y +# - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR="Unix Makefiles" -D CNAME=/usr/local/bin/clang -D CXXNAME=/usr/local/bin/clang++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=20;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=mpq_class;-DGISMO_INDEX_TYPE=long;-DGISMO_WITH_ONURBS=ON' -Q +# except: +# - external_pull_requests +# - ci_test +# - coverity_scan -################################################################################ -# 4. Coverity scan of commits to branch 'coverity_scan' -################################################################################ -################################################################################ -# 5. 
Test installation and deployment -################################################################################ +# #------------------------------------------------------------------------------- +# # GCC 6-10, C++11,14,17,20 +# #------------------------------------------------------------------------------- + +# # GCC 7, C++11, Release +# linux_x86_64_gcc7_cxx11_release_mpreal_long: +# tags: +# - linux +# stage: test +# image: gcc:7 +# script: +# - apt-get update -y +# - apt-get install cmake libmpfr-dev -y +# - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR="Unix Makefiles" -D CNAME=/usr/local/bin/gcc -D CXXNAME=/usr/local/bin/g++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=11;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=mpfr::mpreal;-DGISMO_INDEX_TYPE=long;-DGISMO_WITH_ONURBS=ON' -Q +# except: +# - external_pull_requests +# - ci_test +# - coverity_scan + +# # GCC 8, C++14, Release +# linux_x86_64_gcc8_cxx14_release_longdouble_int64t: +# tags: +# - linux +# stage: test +# image: gcc:8 +# script: +# - apt-get update -y +# - apt-get install cmake ninja-build -y +# - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR=Ninja -D CNAME=/usr/local/bin/gcc -D CXXNAME=/usr/local/bin/g++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=14;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=long 
double;-DGISMO_INDEX_TYPE=int64_t;-DGISMO_WITH_ONURBS=ON' -Q +# except: +# - external_pull_requests +# - ci_test +# - coverity_scan + +# # GCC 9, C++17, Release +# linux_x86_64_gcc9_cxx17_release_double_int32t: + +# tags: +# - linux +# stage: test +# image: gcc:9 +# script: +# - apt-get update -y +# - apt-get install cmake -y +# - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR="Unix Makefiles" -D CNAME=/usr/local/bin/gcc -D CXXNAME=/usr/local/bin/g++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=17;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=double;-DGISMO_INDEX_TYPE=int32_t;-DGISMO_WITH_OCC=ON;-DGISMO_WITH_ONURBS=ON' -Q +# except: +# - external_pull_requests +# - ci_test +# - coverity_scan + +# # GCC 10, C++20, Release +# linux_x86_64_gcc10_cxx20_release_float_int: +# tags: +# - linux +# stage: test +# image: gcc:10 +# script: +# - apt-get update -y +# - apt-get install cmake ninja-build -y +# - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR=Ninja -D CNAME=/usr/local/bin/gcc -D CXXNAME=/usr/local/bin/g++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=20;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=float;-DGISMO_INDEX_TYPE=int;-DGISMO_WITH_ONURBS=ON' -Q +# except: +# - external_pull_requests +# - ci_test +# - coverity_scan # Standard installation and deployment on linux 
install_and_deploy_linux: diff --git a/CMakeLists.txt b/CMakeLists.txt index a1ea339ab8..691e309161 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,13 +39,26 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) if(NOT CMAKE_BUILD_TYPE) - # Set default build type to Release - set(CMAKE_BUILD_TYPE Release CACHE STRING - "Type of build (None Debug Release RelWithDebInfo MinSizeRel)" FORCE) - if(NOT CMAKE_CONFIGURATION_TYPES) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" - "RelWithDebInfo" "MinSizeRel") - endif() + # Set default build type to Release + set(CMAKE_BUILD_TYPE Release CACHE STRING + "Type of build (None Debug Release RelWithDebInfo MinSizeRel)" FORCE) + if(NOT CMAKE_CONFIGURATION_TYPES) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" + "RelWithDebInfo" "MinSizeRel") + endif() +endif() + +if(NOT CMAKE_CXX_STANDARD) + # Set default C++ standard + if (NOT CMAKE_CXX_STANDARD_DEFAULT) + set(CMAKE_CXX_STANDARD 11 CACHE STRING + "C++ standard (98, 11, 14, 17, 20)" FORCE) + else() + set(CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_DEFAULT} CACHE STRING + "C++ standard (98, 11, 14, 17, 20)" FORCE) + endif() + set_property(CACHE CMAKE_CXX_STANDARD PROPERTY STRINGS "98" "11" "14" + "17" "20") endif() set(gismo_VERSION_MAJOR 21) #year @@ -59,9 +72,9 @@ if(CMAKE_QUIET) function(message) list(GET ARGV 0 MessageType) if(MessageType STREQUAL FATAL_ERROR OR - MessageType STREQUAL SEND_ERROR OR - MessageType STREQUAL WARNING OR - MessageType STREQUAL AUTHOR_WARNING) + MessageType STREQUAL SEND_ERROR OR + MessageType STREQUAL WARNING OR + MessageType STREQUAL AUTHOR_WARNING) list(REMOVE_AT ARGV 0) _message(${MessageType} "${ARGV}") endif() @@ -269,6 +282,10 @@ if(GISMO_WITH_SMESH) #include_directories(${SMESH_INCLUDE_DIR}) endif() +if(GISMO_WITH_XBRAID) + add_subdirectory(extensions/gsXBraid) +endif(GISMO_WITH_XBRAID) + #second time 
include_directories(${GISMO_INCLUDE_DIRS}) diff --git a/README.md b/README.md index 227e58b234..392f262f50 100644 --- a/README.md +++ b/README.md @@ -18,11 +18,11 @@ |------------|------------|----------------------| | [CDash](https://cdash-ci.inria.fr/index.php?project=Gismo) | [![cdash](https://img.shields.io/website?down_color=lightgrey&down_message=offline&label=CDash&up_color=green&up_message=up&url=https%3A%2F%2Fcdash-ci.inria.fr%2Findex.php%3Fproject%3DGismo)](https://cdash-ci.inria.fr/index.php?project=Gismo) | Report results from all builds | | [Appveyor](https://ci.appveyor.com/project/gismo/gismo) | [![Appveyor status](https://ci.appveyor.com/api/projects/status/abps59xbt1gjwci1/branch/stable?svg=true)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[appVeyor]) | Windows MSVC 14.0 | -| [Circle CI](https://circleci.com/gh/gismo/gismo) | [![Circle CI](https://circleci.com/gh/gismo/gismo.svg?style=svg)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[cci]) | MacOS XCode9-12 | +| [Circle CI](https://circleci.com/gh/gismo/gismo) | [![Circle CI](https://circleci.com/gh/gismo/gismo.svg?style=svg)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[cci]) | MacOS XCode 10-13 | | [Codeship](https://app.codeship.com/projects/123289) | [![Codeship Status](https://app.codeship.com/projects/2aa19360-8998-0133-39fd-66416d65b267/status?branch=stable)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[codeship]) | | | [GitLab](https://gitlab.com/gismo-ci/gismo/-/pipelines) | [![pipeline status](https://gitlab.com/gismo-ci/gismo/badges/gitlab_ci/pipeline.svg)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[gitlab-ci]) | Linux non-default configurations | | [GitHub Actions](https://github.com/gismo/gismo/actions) | [![Build 
Status](https://github.com/gismo/gismo/workflows/gismo/badge.svg?branch=stable)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[actions]) | Latest Linux/MacOS/Windows | -| [Jenkins](https://ci.inria.fr/gismo/job/gismo/job/gismo/job/stable) | [![Build Status](https://ci.inria.fr/gismo/buildStatus/icon?job=gismo%2Fgismo%2Fstable)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[jenkins]) |VMs for Linux/MacOS/Windows | +| [Jenkins](https://ci.inria.fr/gismo/job/gismo/job/gismo/job/stable) | [![Build Status](https://ci.inria.fr/gismo/buildStatus/icon?job=gismo%2Fgismo%2Fstable)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[jenkins]) | VMs for Linux/MacOS/Windows | | GCC Farm | [Status](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[gccfarm]) | Builders from the GCC Farm | | [OBS](https://build.opensuse.org/package/show/home:filiatra/gismo) | [binaries](https://software.opensuse.org/download/package?project=home:filiatra&package=gismo) | Upstream package builds for many Linux distributions | | [Launchpad](https://code.launchpad.net/~g+smo/+recipe/g+smo-daily) |[binaries](https://launchpad.net/~g+smo/+archive/ubuntu/upstream/+packages) | Upstream package builds for Ubuntu distributions | @@ -117,14 +117,27 @@ Release, RelWithDebInfo, MinSizeRel. * GISMO_COEFF_TYPE *double* - The arithmetic type to be used for all computations. Available options -include double, long double, float. + The arithmetic type to be used for all computations. 
Available +options are float, double, long double, mpfr::mpreal, mpq_class, +posit_2_0, posit_3_0, posit_3_1, posit_4_0, posit_8_0, posit_8_1, +posit_16_1, posit_32_2, posit_64_3, posit_128_4, posit_256_5 * GISMO_EXTRA_INSTANCE *not set* If set to one or more of the options available for GISMO_COEFF_TYPE the G+Smo library is compiled with extra arithmetic types enabled. +* GISMO_INDEX_TYPE *int* + + The integer type to be used for all indices. Available options are +int, int8_t, int16_t, int32_t, int64_t, long, long long + +* GISMO_SHORT_TYPE *int* + + The integer type to be used for all non-index integers, e.g., the +spatial dimension. Available options are int, int8_t, int16_t, +int32_t, int64_t, long, long long + * GISMO_EXTRA_DEBUG *OFF* If set to ON additional debugging tools are enabled during @@ -167,6 +180,24 @@ compiled. The location for installation of the library, e.g. /usr/local on some Linux systems. +* TARGET_ARCHITECTURE *auto* + + If G+Smo is built in Release mode optimized compiler flags for the +selected target architecture are used. *auto* determines the +architecture of the host system automatically. Available options are +auto, generic, none, native and any value CPUID, e.g., skylake or +apple-m1. + +* TARGET_PROFILER *none* + + If G+Smo is built in Release mode compiler flags for the selected +target profiler are used. Available options are gprof and vtune (on +x86/x86_64 systems). + +* OFA_VERBOSE *OFF* + + If enabled the OptimizeForArchitecture script will produce verbose +output which might be helpful for debugging purposes. # Directory structure diff --git a/cmake/AddCXXCompileOptions.cmake b/cmake/AddCXXCompileOptions.cmake index f152b176e9..57cab1f783 100644 --- a/cmake/AddCXXCompileOptions.cmake +++ b/cmake/AddCXXCompileOptions.cmake @@ -1,188 +1,486 @@ -###################################################################### -## AddCXXConpileOptions.cmake -## This file is part of the G+Smo library. -## -## Authors: M. Moeller and A.
Mantzaflaris -###################################################################### - -set(CMAKE_CXX_STANDARD_DEFAULT 14) - -if (CMAKE_CXX_COMPILER_ID STREQUAL "PGI") - - # CMake does not yet provide flags for the Portland Group compiler - - # The Portland Group - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) - set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "$std=c++98") - set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "$std=c++98") - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=c++11") - set(CMAKE_CXX_STANDARD_DEFAULT 11) - else() - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++0x") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=c++0x") - set(CMAKE_CXX_STANDARD_DEFAULT 98) - endif() - -endif() - -if (CMAKE_VERSION VERSION_LESS "3.1") - -if ((CMAKE_SYSTEM_NAME STREQUAL "Darwin") AND (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")) - - #also: -stdlib=libc++ - - # Apple Clang - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0) - set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++98") - set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") - set(CMAKE_CXX_STANDARD_DEFAULT 11) - endif() - - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.1) - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") - set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") - set(CMAKE_CXX_STANDARD_DEFAULT 14) - elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1) - # AppleClang 5.0 knows this flag, but does not set a __cplusplus macro greater than 201103L - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") - set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") - set(CMAKE_CXX_STANDARD_DEFAULT 14) - endif() - -elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - - # LLVM Clang - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2.1) - set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "") - 
set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") - set(CMAKE_CXX_STANDARD_DEFAULT 98) - endif() - - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.1) - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") - set(CMAKE_CXX_STANDARD_DEFAULT 11) - elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2.1) - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++0x") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++0x") - set(CMAKE_CXX_STANDARD_DEFAULT 11) - endif() - - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.5) - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") - set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") - set(CMAKE_CXX_STANDARD_DEFAULT 14) - elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.4) - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") - set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") - # .. additionally requires gnu libstdc++ greater than 4.6 - # set(CMAKE_CXX_STANDARD_DEFAULT 14) - set(CMAKE_CXX_STANDARD_DEFAULT 11) - endif() - -elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - - # GNU Compiler Collection - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.3) - # Flag supported since 4.3 - set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++98") - set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") - set(CMAKE_CXX_STANDARD_DEFAULT 98) - endif() - - if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") - set(CMAKE_CXX_STANDARD_DEFAULT 11) - elseif (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.4) - # 4.3 supports 0x variants, but compliance is very low - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++0x") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++0x") - set(CMAKE_CXX_STANDARD_DEFAULT 11) - endif() - - if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9) - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") - 
set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") - set(CMAKE_CXX_STANDARD_DEFAULT 14) - elseif (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8) - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") - set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") - set(CMAKE_CXX_STANDARD_DEFAULT 14) - endif() - -elseif ( "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xIntel") - - # Intel compiler - if("x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") - set(_std -Qstd) - set(_ext c++) - else() - set(_std -std) - set(_ext gnu++) - endif() - - if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13.1) - set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "${_std}=c++98") - set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "${_std}=${_ext}98") - set(CMAKE_CXX_STANDARD_DEFAULT 98) - endif() - - if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.2) - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "${_std}=c++11") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "${_std}=${_ext}11") - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "${_std}=c++14") - # todo: there is no gnu++14 value supported; figure out what to do - set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "${_std}=c++14") - set(CMAKE_CXX_STANDARD_DEFAULT 14) - elseif (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "${_std}=c++0x") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "${_std}=${_ext}0x") - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "${_std}=c++1y") - # todo: there is no gnu++14 value supported; figure out what to do - set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "${_std}=c++1y") - set(CMAKE_CXX_STANDARD_DEFAULT 14) - endif() - - unset(_std) - unset(_ext) - -elseif (CMAKE_CXX_COMPILER_ID STREQUAL "SunPro") - - # Oracle Solaris Studio - if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.13) - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=c++11") - set(CMAKE_CXX_STANDARD_DEFAULT 11) - endif() - -endif() - -endif() # cmake 3.1 - -if (NOT DEFINED CMAKE_CXX_STANDARD) - 
set(CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_DEFAULT} CACHE INTERNAL "") -endif() - -# Apply for Cmake less than 3.1 -if (CMAKE_VERSION VERSION_LESS "3.1") - - if ( NOT "x${CMAKE_CXX_STANDARD}" STREQUAL "x98" AND - ${CMAKE_CXX_STANDARD_DEFAULT} LESS ${CMAKE_CXX_STANDARD}) - #message(FATAL_ERROR "The compiler ${CMAKE_CXX_COMPILER} supports at most C++${CMAKE_CXX_STANDARD_DEFAULT} (requested: ${CMAKE_CXX_STANDARD}).") - message(STATUS "The compiler ${CMAKE_CXX_COMPILER} supports at most C++${CMAKE_CXX_STANDARD_DEFAULT}, CXX_STANDARD choice is changed.") - set(CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_DEFAULT} CACHE INTERNAL "") - endif() - - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION}") -endif()#cmake<3.1 - - -# Bugfix for windows/msvc systems -if(NOT DEFINED CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION) - set(CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION "") - set(CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION "") -endif() +###################################################################### +## AddCXXCompileOptions.cmake +## This file is part of the G+Smo library. +## +## Authors: M. Moeller and A. Mantzaflaris +###################################################################### + +# We strongly recommend using an up-to-date cmake version which +# provides support for the most recent compiler version. We provide a +# subset of compiler options copied from cmake 3.17.5. +# +# The options below are only used if +# CMAKE_CXXvv_STANDARD_COMPILE_OPTION and +# CMAKE_CXXvv_EXTENSION_COMPILE_OPTION are not yet set by the regular +# cmake routines, where vv is the value of CMAKE_CXX_STANDARD. + +if(NOT DEFINED CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION OR + NOT DEFINED CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION) + + message(WARNING "Update your CMake installation!
We fall back to compiler options back ported from CMake 3.17.5") + + if("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xAppleClang") + + # AppleClang + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++98") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.1) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1) + # AppleClang 5.0 knows this flag, but does not set a __cplusplus macro greater than 201103L + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10.0) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++17") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.1) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++1z") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++1z") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.0) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10.0) + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-std=c++2a") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-std=gnu++2a") + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang$") + + # ARMClang/Clang + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2.1) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++98") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 
3.1) + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2.1) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++0x") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++0x") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.5) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.4) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++17") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.5) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++1z") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++1z") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-std=c++2a") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-std=gnu++2a") + endif() + + if("x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + # The MSVC standard library requires C++14, and MSVC itself has no + # notion of operating in a mode not aware of at least that standard. 
+ set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++14") + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++14") + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xCray") + + # Cray (the literal needs the same "x" prefix as the left-hand side, otherwise the branch never matches) + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION -h conform) + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION -h gnu) + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.4) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION -h std=c++11) + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION -h std=c++11,gnu) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + endif() + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.6) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION -h std=c++14) + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION -h std=c++14,gnu) + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + endif() + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xFujitsu") + + # Fujitsu; configuration taken from CMake 3.22.0. Version strings need VERSION_GREATER_EQUAL; plain GREATER_EQUAL is a numeric comparison. + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION -std=c++03) + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION -std=gnu++03) + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION -std=c++11) + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION -std=gnu++11) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION -std=c++14) + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION -std=gnu++14) + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION -std=c++17) + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION -std=gnu++17) + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xGNU") + + # GNU (the literal needs the same "x" prefix as the left-hand side, otherwise the branch never matches) + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.4) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++98") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") + endif() + 
+ if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.4) + # 4.3 supports 0x variants + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++0x") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++0x") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8.1) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.0) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++17") + elseif (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++1z") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++1z") + endif() + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 11.1) + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-std=c++20") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-std=gnu++20") + set(CMAKE_CXX23_STANDARD_COMPILE_OPTION "-std=c++23") + set(CMAKE_CXX23_EXTENSION_COMPILE_OPTION "-std=gnu++23") + elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.0) + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-std=c++2a") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-std=gnu++2a") + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xIntel") + + # Intel classical + + if("x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + + 
set(CMAKE_CXX_CLANG_TIDY_DRIVER_MODE "cl") + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0.0) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-Qstd=c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-Qstd=c++17") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 16.0) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-Qstd=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-Qstd=c++14") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13.0) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-Qstd=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-Qstd=c++11") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.1) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-Qstd=c++0x") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-Qstd=c++0x") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.1) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + else() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0.0) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++17") + endif() + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 17.0) + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.2) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") + endif() + + # Intel 15.0.2 accepts c++14 instead of c++1y, but not gnu++14 + # instead of gnu++1y. Intel 17.0.0 accepts gnu++14 too. 
+ if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 17.0) + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13.0) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.1) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++0x") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++0x") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.1) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++98") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xIntelLLVM") + + # Intel Clang-based + + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + + if(NOT "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++98") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") + + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") + + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") + + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++17") + + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-std=c++20") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-std=gnu++20") + else() + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "") + + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-Qstd=c++11") + 
set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-Qstd=c++11") + + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-Qstd=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-Qstd=c++14") + + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-Qstd=c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-Qstd=c++17") + + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-Qstd=c++20") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-Qstd=c++20") + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xMSVC") + + if((CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 19.0.24215.1 AND + CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.10) OR + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 19.10.25017) + + # VS 2015 Update 3 and above support language standard level flags, + # with the default and minimum level being C++14. + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std:c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std:c++14") + + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 19.11.25505) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std:c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std:c++17") + else() + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std:c++latest") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std:c++latest") + endif() + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 19.29.30129) + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-std:c++20") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-std:c++20") + set(CMAKE_CXX23_STANDARD_COMPILE_OPTION "-std:c++latest") + set(CMAKE_CXX23_EXTENSION_COMPILE_OPTION "-std:c++latest") + elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 19.12.25835) + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-std:c++latest") + 
set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-std:c++latest") + endif() + + elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0) + # MSVC has no specific options to set language standards, but set them as + # empty strings anyways so the feature test infrastructure can at least check + # to see if they are defined. + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "") + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xPGI" OR + "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xNVHPC") + + # PGI/NVHPC + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.10) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION -A) + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION --gnu_extensions) + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13.10) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION --c++11 -A) + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION --c++11 --gnu_extensions) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 15.7) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14 -A) + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION --c++14 --gnu_extensions) + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 17.1) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION --c++17 -A) + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION --c++17 --gnu_extensions) + set(CMAKE_CXX17_STANDARD__HAS_FULL_SUPPORT ON) + endif() + endif() + endif() + endif() + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 20.11) + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION -std=c++20) + 
set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION -std=gnu++20) + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xSunPro") + + # Oracle Solaris Studio + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.13) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++03") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=c++03") + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX_LINK_WITH_STANDARD_COMPILE_OPTION 1) + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.14) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=c++14") + endif() + else() + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-library=stlport4") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-library=stlport4") + set(CMAKE_CXX_LINK_WITH_STANDARD_COMPILE_OPTION 1) + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xXL") + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10.1) + if(CMAKE_SYSTEM MATCHES "Linux") + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "") + else() + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-qlanglvl=strict98") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-qlanglvl=extended") + endif() + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-qlanglvl=extended0x") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-qlanglvl=extended0x") + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + + # XL does not really have full C++11 or C++14 support, but since we do not + # have a granular XL-CXX-FeatureTests table for it just pretend it does. + # This way projects that specify granular features will at least get a + # compiler mode for the corresponding standard. 
+ set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.1.0 AND CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-qlanglvl=extended1y") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-qlanglvl=extended1y") + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + endif() + endif () + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xXLClang") + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13.1.1) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-qlanglvl=extended0x") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-qlanglvl=extended0x") + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13.1.2) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + endif () + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.1.0) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") + endif() + endif() + + endif() + +endif() + +# Bugfix for windows/msvc systems +if(NOT DEFINED CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION) + set(CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION "") +endif() diff --git a/cmake/OptimizeForArchitecture.cmake b/cmake/OptimizeForArchitecture.cmake index 6554933958..c0b7941d8f 100644 --- a/cmake/OptimizeForArchitecture.cmake +++ b/cmake/OptimizeForArchitecture.cmake @@ -16,26 +16,18 @@ # Optional inputs: # TARGET_ARCHITECTURE= specifies the target architecture (default=auto) # TARGET_PROFILER= specifies the target profiler 
(default=none) +# OFA_VERBOSE= prints verbose output (default=off) # -# If any of these flags are defined and set, the OptimizeForArchitecture -# macro will consequently disable the relevant features via compiler flags. -# -# For x86_64/x68: -# OFA_AVX512_INTRINSICS_BROKEN -# OFA_AVX2_INTRINSICS_BROKEN -# OFA_AVX_INTRINSICS_BROKEN -# OFA_FMA4_INTRINSICS_BROKEN -# OFA_SSE_INTRINSICS_BROKEN -# OFA_XOP_INTRINSICS_BROKEN -# -# For ARM: -# no options defined yet -# -# For PPC64: -# no options defined yet +# If any of the _broken flags are defined and set to true, +# the OptimizeForArchitecture macro will consequently disable the +# relevant features via compiler flags. # # Output: # OFA_ARCHITECTURE_FLAGS compiler flags optimized for the target architecture +# +# Internal variables: +# USE_ boolean variable holding the status of +# HAVE_ boolean variable holding the compiler's capability #============================================================================= # Copyright 2010-2016 Matthias Kretz @@ -70,584 +62,23 @@ # # Changelog: # - Update of CPUIDs for latest Intel and AMD processors -# - Support for PPC64 (Clang, GCC, IBM XLC) -# - Support for ARM (Clang, GCC, ARM Clang) +# - Added support for PPC64 (Clang, GCC, IBM XLC) +# - Added support for ARM (Clang, GCC, ARM Clang, Cray, Fujitsu) +# - Restructuring and splitting into multiple files #============================================================================= -get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) -include("${_currentDir}/AddCompilerFlag.cmake") -include(CheckIncludeFileCXX) - -macro(_my_find _list _value _ret) - list(FIND ${_list} "${_value}" _found) - if(_found EQUAL -1) - set(${_ret} FALSE) - else() - set(${_ret} TRUE) - endif() -endmacro(_my_find) - -macro(OFA_AutodetectX86) - set(_vendor_id) - set(_cpu_family) - set(_cpu_model) - set(_cpu_stepping) - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - file(READ "/proc/cpuinfo" _cpuinfo) - string(REGEX REPLACE ".*vendor_id[ 
\t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _vendor_id "${_cpuinfo}") - string(REGEX REPLACE ".*cpu family[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_family "${_cpuinfo}") - string(REGEX REPLACE ".*model[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_model "${_cpuinfo}") - string(REGEX REPLACE ".*stepping[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_stepping "${_cpuinfo}") - string(REGEX REPLACE ".*flags[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") - elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - exec_program("/usr/sbin/sysctl -n machdep.cpu.vendor machdep.cpu.family machdep.cpu.model machdep.cpu.stepping machdep.cpu.features" OUTPUT_VARIABLE _sysctl_output_string) - string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) - list(GET _sysctl_output 0 _vendor_id) - list(GET _sysctl_output 1 _cpu_family) - list(GET _sysctl_output 2 _cpu_model) - list(GET _sysctl_output 3 _cpu_stepping) - list(GET _sysctl_output 4 _cpu_flags) - string(TOLOWER "${_cpu_flags}" _cpu_flags) - string(REPLACE "." 
"_" _cpu_flags "${_cpu_flags}") - elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") - get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE) - get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE) - mark_as_advanced(_vendor_id _cpu_id) - string(REGEX REPLACE ".* Family ([0-9]+) .*" "\\1" _cpu_family "${_cpu_id}") - string(REGEX REPLACE ".* Model ([0-9]+) .*" "\\1" _cpu_model "${_cpu_id}") - string(REGEX REPLACE ".* Stepping ([0-9]+) .*" "\\1" _cpu_mstepping "${_cpu_id}") - endif(CMAKE_SYSTEM_NAME STREQUAL "Linux") - if(_vendor_id STREQUAL "GenuineIntel") - if(_cpu_family EQUAL 6) - # taken from the Intel ORM - # http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html - # CPUID Signature Values of Of Recent Intel Microarchitectures - # 4E 5E | Skylake microarchitecture - # 3D 47 56 | Broadwell microarchitecture - # 3C 45 46 3F | Haswell microarchitecture - # 3A 3E | Ivy Bridge microarchitecture - # 2A 2D | Sandy Bridge microarchitecture - # 25 2C 2F | Intel microarchitecture Westmere - # 1A 1E 1F 2E | Intel microarchitecture Nehalem - # 17 1D | Enhanced Intel Core microarchitecture - # 0F | Intel Core microarchitecture - # - # Intel SDM Vol. 3C 35-1 / December 2016: - # 57 | Xeon Phi 3200, 5200, 7200 [Knights Landing] - # 85 | Future Xeon Phi - # 8E 9E | 7th gen. Core [Kaby Lake] - # 55 | Future Xeon [Skylake w/ AVX512] - # 4E 5E | 6th gen. Core / E3 v5 [Skylake w/o AVX512] - # 56 | Xeon D-1500 [Broadwell] - # 4F | Xeon E5 v4, E7 v4, i7-69xx [Broadwell] - # 47 | 5th gen. Core / Xeon E3 v4 [Broadwell] - # 3D | M-5xxx / 5th gen. [Broadwell] - # 3F | Xeon E5 v3, E7 v3, i7-59xx [Haswell-E] - # 3C 45 46 | 4th gen. Core, Xeon E3 v3 [Haswell] - # 3E | Xeon E5 v2, E7 v2, i7-49xx [Ivy Bridge-E] - # 3A | 3rd gen. 
Core, Xeon E3 v2 [Ivy Bridge] - # 2D | Xeon E5, i7-39xx [Sandy Bridge] - # 2F | Xeon E7 - # 2A | Xeon E3, 2nd gen. Core [Sandy Bridge] - # 2E | Xeon 7500, 6500 series - # 25 2C | Xeon 3600, 5600 series, Core i7, i5 and i3 - # - # Values from the Intel SDE: - # 5C | Goldmont - # 5A | Silvermont - # 57 | Knights Landing - # 66 | Cannonlake - # 55 | Skylake Server - # 4E | Skylake Client - # 3C | Broadwell (likely a bug in the SDE) - # 3C | Haswell - # - # Latest updates taken from https://en.wikichip.org/wiki/intel/cpuid - if(_cpu_model EQUAL 133) # 85 - set(TARGET_ARCHITECTURE "knm") # Knights Mill - - elseif(_cpu_model EQUAL 87) # 57 - set(TARGET_ARCHITECTURE "knl") # Knights Landing - - elseif(_cpu_model EQUAL 134) - set(TARGET_ARCHITECTURE "tremont") - - elseif(_cpu_model EQUAL 122) - set(TARGET_ARCHITECTURE "goldmont-plus") - - elseif(_cpu_model EQUAL 92 OR _cpu_model EQUAL 95) - set(TARGET_ARCHITECTURE "goldmont") - - elseif(_cpu_model EQUAL 90 OR _cpu_model EQUAL 93 OR _cpu_model EQUAL 74 OR _cpu_model EQUAL 76 OR _cpu_model EQUAL 77 OR _cpu_model EQUAL 55) - set(TARGET_ARCHITECTURE "silvermont") - - elseif(_cpu_model EQUAL 28 OR _cpu_model EQUAL 38 OR _cpu_model EQUAL 39 OR _cpu_model EQUAL 53 OR _cpu_model EQUAL 54) - set(TARGET_ARCHITECTURE "bonnell") - - # elseif(_cpu_model EQUAL X) - # set(TARGET_ARCHITECTURE "sapphirerapids") - - # elseif(_cpu_model EQUAL X) - # set(TARGET_ARCHITECTURE "alderlake") - - elseif(_cpu_model EQUAL 140) - set(TARGET_ARCHITECTURE "tigerlake") - - elseif(_cpu_model EQUAL 106 OR _cpu_model EQUAL 108) - set(TARGET_ARCHITECTURE "icelake-avx512") - - elseif(_cpu_model EQUAL 125 OR _cpu_model EQUAL 126) - set(TARGET_ARCHITECTURE "icelake") - - elseif(_cpu_model EQUAL 102) - set(TARGET_ARCHITECTURE "cannonlake") - - elseif(_cpu_model EQUAL 142 OR _cpu_model EQUAL 158 OR _cpu_model EQUAL 165) # 8E, 9E - set(TARGET_ARCHITECTURE "kabylake") - - elseif(_cpu_model EQUAL 85) # 55 - if(_cpu_stepping LESS 5) - set(TARGET_ARCHITECTURE 
"skylake-avx512") - elseif(_cpu_stepping LESS 8) - set(TARGET_ARCHITECTURE "cascadelake") - else() - set(TARGET_ARCHITECTURE "cooperlake") - endif() - - elseif(_cpu_model EQUAL 78 OR _cpu_model EQUAL 94) # 4E, 5E - set(TARGET_ARCHITECTURE "skylake") - - elseif(_cpu_model EQUAL 61 OR _cpu_model EQUAL 71 OR _cpu_model EQUAL 79 OR _cpu_model EQUAL 86) # 3D, 47, 4F, 56 - set(TARGET_ARCHITECTURE "broadwell") - - elseif(_cpu_model EQUAL 60 OR _cpu_model EQUAL 69 OR _cpu_model EQUAL 70 OR _cpu_model EQUAL 63) - set(TARGET_ARCHITECTURE "haswell") - - elseif(_cpu_model EQUAL 58 OR _cpu_model EQUAL 62) - set(TARGET_ARCHITECTURE "ivybridge") - - elseif(_cpu_model EQUAL 42 OR _cpu_model EQUAL 45) - set(TARGET_ARCHITECTURE "sandybridge") - - elseif(_cpu_model EQUAL 37 OR _cpu_model EQUAL 44 OR _cpu_model EQUAL 47) - set(TARGET_ARCHITECTURE "westmere") - - elseif(_cpu_model EQUAL 26 OR _cpu_model EQUAL 30 OR _cpu_model EQUAL 31 OR _cpu_model EQUAL 46) - set(TARGET_ARCHITECTURE "nehalem") - - elseif(_cpu_model EQUAL 23 OR _cpu_model EQUAL 29) - set(TARGET_ARCHITECTURE "penryn") - - elseif(_cpu_model EQUAL 15 OR _cpu_model EQUAL 22) - set(TARGET_ARCHITECTURE "merom") - - elseif(_cpu_model EQUAL 28) - set(TARGET_ARCHITECTURE "atom") - - elseif(_cpu_model EQUAL 14) - set(TARGET_ARCHITECTURE "core") - - elseif(_cpu_model LESS 14) - message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. Auto-detection of optimization flags failed and will use the generic CPU settings with SSE2.") - set(TARGET_ARCHITECTURE "generic") - else() - message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. 
Auto-detection of optimization flags failed and will use the 65nm Core 2 CPU settings.") - set(TARGET_ARCHITECTURE "merom") - endif() - - elseif(_cpu_family EQUAL 7) # Itanium (not supported) - message(WARNING "Your CPU (Itanium: family ${_cpu_family}, model ${_cpu_model}) is not supported by OptimizeForArchitecture.cmake.") - - elseif(_cpu_family EQUAL 15) # NetBurst - list(APPEND _available_vector_units_list "sse" "sse2") - if(_cpu_model GREATER 2) # Not sure whether this must be 3 or even 4 instead - list(APPEND _available_vector_units_list "sse" "sse2" "sse3") - endif() - - endif() - - elseif(_vendor_id STREQUAL "AuthenticAMD") - # taken from the list of AMD CPU microarchitectures - # https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures - # CPUID Signature Values of Of Recent AMD Microarchitectures - # 05 05h | K6 - # 06 06h | K7 - # 15 0Fh | K8 / Hammer - # 16 10h | K10 - # 17 11h | K8 & K10 "hybrid" - # 18 12h | K10 (Llano) / K12 (ARM based AMD cpu) - # 20 14h | Bobcat - # 21 15h | Bulldozer / Piledriver / Steamroller / Excavator - # 22 16h | Jaguar / Puma - # 23 17h | Zen / Zen+ / Zen 2 - # 24 18h | Hygon Dhyana - # 25 19h | Zen 3 - - if(_cpu_family EQUAL 25) # 19h - set(TARGET_ARCHITECTURE "zen3") # planned - - elseif(_cpu_family EQUAL 24) # 18h - set(TARGET_ARCHITECTURE "zen") - - elseif(_cpu_family EQUAL 23) # 17h - if(_cpu_model LESS 64) - set(TARGET_ARCHITECTURE "zen") - else() - set(TARGET_ARCHITECTURE "zen2") - endif() - - elseif(_cpu_family EQUAL 22) # 16h - set(TARGET_ARCHITECTURE "amd16h") - - elseif(_cpu_family EQUAL 21) # 15h - if(_cpu_model LESS 16) - set(TARGET_ARCHITECTURE "bulldozer") - elseif(_cpu_model LESS 32) - set(TARGET_ARCHITECTURE "piledriver") - elseif(_cpu_model LESS 64) - set(TARGET_ARCHITECTURE "steamroller") - else() - set(TARGET_ARCHITECTURE "excavator") - endif() - - elseif(_cpu_family EQUAL 20) # 14h - set(TARGET_ARCHITECTURE "amd14h") - - elseif(_cpu_family EQUAL 18) # 12h (K10 / K12) - - elseif(_cpu_family EQUAL 
17) # 12h (K8 & K10 hybrid) - - elseif(_cpu_family EQUAL 16) # 10h (K10) - set(TARGET_ARCHITECTURE "barcelona") - - elseif(_cpu_family EQUAL 15) # 0Fh (K8 / Hammer) - if(_cpu_model LESS 39) - set(TARGET_ARCHITECTURE "k8") - else() - set(TARGET_ARCHITECTURE "k8-sse3") - endif() - - elseif(_cpu_family EQUAL 6) # 06h (K7) - elseif(_cpu_family EQUAL 5) # 05h (K6) - - endif() - endif() -endmacro(OFA_AutodetectX86) - -macro(OFA_AutodetectArm) - set(_cpu_implementer) - set(_cpu_architecture) - set(_cpu_variant) - set(_cpu_part) - set(_cpu_revision) - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - file(READ "/proc/cpuinfo" _cpuinfo) - string(REGEX REPLACE ".*CPU implementer[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_implementer "${_cpuinfo}") - string(REGEX REPLACE ".*CPU architecture[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_architecture "${_cpuinfo}") - string(REGEX REPLACE ".*CPU variant[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_variant "${_cpuinfo}") - string(REGEX REPLACE ".*CPU part[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_part "${_cpuinfo}") - string(REGEX REPLACE ".*CPU revision[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_revision "${_cpuinfo}") - string(REGEX REPLACE ".*Features[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") - elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - exec_program("/usr/sbin/sysctl -n -n hw.cputype hw.cputype hw.cpusubtype hw.cpufamily hw.cpusubfamily" OUTPUT_VARIABLE _sysctl_output_string) - string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) - list(GET _sysctl_output 0 _cpu_implementer) - list(GET _sysctl_output 1 _cpu_architecture) - list(GET _sysctl_output 2 _cpu_variant) - list(GET _sysctl_output 3 _cpu_part) - list(GET _sysctl_output 4 _cpu_revision) - elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") - endif(CMAKE_SYSTEM_NAME STREQUAL "Linux") - - # Taken from https://github.com/karelzak/util-linux/blob/master/sys-utils/lscpu-arm.c - # and https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html - if(_cpu_implementer STREQUAL 
"0x41") # ARM - if(_cpu_part STREQUAL "0x810") - set(TARGET_ARCHITECTURE "arm810") - elseif(_cpu_part STREQUAL "0x920") - set(TARGET_ARCHITECTURE "arm920t") - elseif(_cpu_part STREQUAL "0x922") - set(TARGET_ARCHITECTURE "arm922t") - elseif(_cpu_part STREQUAL "0x926") - set(TARGET_ARCHITECTURE "arm926ej-s") - elseif(_cpu_part STREQUAL "0x940") - set(TARGET_ARCHITECTURE "arm940t") - elseif(_cpu_part STREQUAL "0x946") - set(TARGET_ARCHITECTURE "arm946e-s") - elseif(_cpu_part STREQUAL "0x966") - set(TARGET_ARCHITECTURE "arm966e-s") - elseif(_cpu_part STREQUAL "0xa20") - set(TARGET_ARCHITECTURE "arm1020e") - elseif(_cpu_part STREQUAL "0xa22") - set(TARGET_ARCHITECTURE "arm1022e") - elseif(_cpu_part STREQUAL "0xa26") - set(TARGET_ARCHITECTURE "arm1026ej-s") - elseif(_cpu_part STREQUAL "0xb02") - set(TARGET_ARCHITECTURE "mpcore") - elseif(_cpu_part STREQUAL "0xb36") - set(TARGET_ARCHITECTURE "arm1136jf-s") - elseif(_cpu_part STREQUAL "0xb56") - set(TARGET_ARCHITECTURE "arm1156t2f-s") - elseif(_cpu_part STREQUAL "0xb76") - set(TARGET_ARCHITECTURE "arm1176jzf-s") - elseif(_cpu_part STREQUAL "0xc05") - set(TARGET_ARCHITECTURE "cortex-a5") - elseif(_cpu_part STREQUAL "0xc07") - set(TARGET_ARCHITECTURE "cortex-a7") - elseif(_cpu_part STREQUAL "0xc08") - set(TARGET_ARCHITECTURE "cortex-a8") - elseif(_cpu_part STREQUAL "0xc09") - set(TARGET_ARCHITECTURE "cortex-a9") - elseif(_cpu_part STREQUAL "0xc0d") - set(TARGET_ARCHITECTURE "cortex-a12") - elseif(_cpu_part STREQUAL "0xc0f") - set(TARGET_ARCHITECTURE "cortex-a15") - elseif(_cpu_part STREQUAL "0xc0e") - set(TARGET_ARCHITECTURE "cortex-a17") - elseif(_cpu_part STREQUAL "0xc14") - set(TARGET_ARCHITECTURE "cortex-r4f") - elseif(_cpu_part STREQUAL "0xc15") - set(TARGET_ARCHITECTURE "cortex-r5") - elseif(_cpu_part STREQUAL "0xc17") - set(TARGET_ARCHITECTURE "cortex-r7") - elseif(_cpu_part STREQUAL "0xc18") - set(TARGET_ARCHITECTURE "cortex-r8") - elseif(_cpu_part STREQUAL "0xc20") - set(TARGET_ARCHITECTURE "cortex-m0") - 
elseif(_cpu_part STREQUAL "0xc21") - set(TARGET_ARCHITECTURE "cortex-m1") - elseif(_cpu_part STREQUAL "0xc23") - set(TARGET_ARCHITECTURE "cortex-m3") - elseif(_cpu_part STREQUAL "0xc24") - set(TARGET_ARCHITECTURE "cortex-m4") - elseif(_cpu_part STREQUAL "0xc27") - set(TARGET_ARCHITECTURE "cortex-m7") - elseif(_cpu_part STREQUAL "0xc60") - set(TARGET_ARCHITECTURE "cortex-m0plus") - elseif(_cpu_part STREQUAL "0xd01") - set(TARGET_ARCHITECTURE "cortex-a32") - elseif(_cpu_part STREQUAL "0xd02") - set(TARGET_ARCHITECTURE "cortex-a34") - elseif(_cpu_part STREQUAL "0xd03") - set(TARGET_ARCHITECTURE "cortex-a53") - elseif(_cpu_part STREQUAL "0xd04") - set(TARGET_ARCHITECTURE "cortex-a35") - elseif(_cpu_part STREQUAL "0xd05") - set(TARGET_ARCHITECTURE "cortex-a55") - elseif(_cpu_part STREQUAL "0xd07") - set(TARGET_ARCHITECTURE "cortex-a57") - elseif(_cpu_part STREQUAL "0xd08") - set(TARGET_ARCHITECTURE "cortex-a72") - elseif(_cpu_part STREQUAL "0xd09") - set(TARGET_ARCHITECTURE "cortex-a73") - elseif(_cpu_part STREQUAL "0xd0a") - set(TARGET_ARCHITECTURE "cortex-a75") - elseif(_cpu_part STREQUAL "0xd0b") - set(TARGET_ARCHITECTURE "cortex-a76") - elseif(_cpu_part STREQUAL "0xd0c") - set(TARGET_ARCHITECTURE "neoverse-n1") - elseif(_cpu_part STREQUAL "0xd0d") - set(TARGET_ARCHITECTURE "cortex-a77") - elseif(_cpu_part STREQUAL "0xd0e") - set(TARGET_ARCHITECTURE "cortex-a76ae") - elseif(_cpu_part STREQUAL "0xd13") - set(TARGET_ARCHITECTURE "cortex-r52") - elseif(_cpu_part STREQUAL "0xd20") - set(TARGET_ARCHITECTURE "cortex-m23") - elseif(_cpu_part STREQUAL "0xd21") - set(TARGET_ARCHITECTURE "cortex-m33") - elseif(_cpu_part STREQUAL "0xd4a") - set(TARGET_ARCHITECTURE "neoverse-e1") - endif() - - elseif(_cpu_implementer STREQUAL "0x42") # Broadcom - if(_cpu_part STREQUAL "0x0f") - set(TARGET_ARCHITECTURE "brahma-b15") - elseif(_cpu_part STREQUAL "0x100") - set(TARGET_ARCHITECTURE "brahma-b53") - elseif(_cpu_part STREQUAL "0x516") - set(TARGET_ARCHITECTURE "thunderx2") - endif() - - 
elseif(_cpu_implementer STREQUAL "0x43") # Cavium - if(_cpu_part STREQUAL "0x0a0") - set(TARGET_ARCHITECTURE "thunderx") - elseif(_cpu_part STREQUAL "0x0a1") - set(TARGET_ARCHITECTURE "thunderxt88") - elseif(_cpu_part STREQUAL "0x0a2") - set(TARGET_ARCHITECTURE "thunderxt81") - elseif(_cpu_part STREQUAL "0x0a3") - set(TARGET_ARCHITECTURE "thunderxt83") - elseif(_cpu_part STREQUAL "0x0af") - set(TARGET_ARCHITECTURE "thunderx2t99") - endif() - - elseif(_cpu_implementer STREQUAL "0x44") # DEC - if(_cpu_part STREQUAL "0xa10") - set(TARGET_ARCHITECTURE "strongarm110") - elseif(_cpu_part STREQUAL "0xa11") - set(TARGET_ARCHITECTURE "strongarm1100") - endif() - - elseif(_cpu_implementer STREQUAL "0x46") # FUJITSU - if(_cpu_part STREQUAL "0x001") - set(TARGET_ARCHITECTURE "a64fx") - endif() - - elseif(_cpu_implementer STREQUAL "0x48") # HiSilicon - if(_cpu_part STREQUAL "0xd01") - set(TARGET_ARCHITECTURE "tsv110") - endif() - - elseif(_cpu_implementer STREQUAL "0x4e") # Nvidia - if(_cpu_part STREQUAL "0x000") - set(TARGET_ARCHITECTURE "denver") - elseif(_cpu_part STREQUAL "0x003") - set(TARGET_ARCHITECTURE "denver2") - elseif(_cpu_part STREQUAL "0x004") - set(TARGET_ARCHITECTURE "carmel") - endif() - - elseif(_cpu_implementer STREQUAL "0x50") # APM - if(_cpu_part STREQUAL "0x000") - set(TARGET_ARCHITECTURE "xgene1") - endif() - - elseif(_cpu_implementer STREQUAL "0x51") # Qualcomm - if(_cpu_part STREQUAL "0x00f") - set(TARGET_ARCHITECTURE "scorpion") - elseif(_cpu_part STREQUAL "0x02d") - set(TARGET_ARCHITECTURE "scorpion") - elseif(_cpu_part STREQUAL "0x04d") - set(TARGET_ARCHITECTURE "krait") - elseif(_cpu_part STREQUAL "0x06f") - set(TARGET_ARCHITECTURE "krait") - elseif(_cpu_part STREQUAL "0x201") - set(TARGET_ARCHITECTURE "kryo") - elseif(_cpu_part STREQUAL "0x205") - set(TARGET_ARCHITECTURE "kryo") - elseif(_cpu_part STREQUAL "0x211") - set(TARGET_ARCHITECTURE "kryo") - elseif(_cpu_part STREQUAL "0x800") - set(TARGET_ARCHITECTURE "falkor") - elseif(_cpu_part STREQUAL 
"0x801") - set(TARGET_ARCHITECTURE "kryo2") - elseif(_cpu_part STREQUAL "0xc00") - set(TARGET_ARCHITECTURE "falkor") - elseif(_cpu_part STREQUAL "0xc01") - set(TARGET_ARCHITECTURE "saphira") - endif() - - elseif(_cpu_implementer STREQUAL "0x53") # Samsung - if(_cpu_part STREQUAL "0x001") - set(TARGET_ARCHITECTURE "exynos-m1") - endif() - - elseif(_cpu_implementer STREQUAL "0x56") # Marvell - if(_cpu_part STREQUAL "0x131") - set(TARGET_ARCHITECTURE "marvell-f") - elseif(_cpu_part STREQUAL "0x581") - set(TARGET_ARCHITECTURE "marvell-pj4") - elseif(_cpu_part STREQUAL "0x584") - set(TARGET_ARCHITECTURE "marvell-pj4") - endif() - - elseif(_cpu_implementer STREQUAL "0x66") # Faraday - if(_cpu_part STREQUAL "0x526") - set(TARGET_ARCHITECTURE "fa526") - elseif(_cpu_part STREQUAL "0x626") - set(TARGET_ARCHITECTURE "fa626") - endif() - - elseif(_cpu_implementer STREQUAL "0x69") # Intel - if(_cpu_part STREQUAL "0x200") - set(TARGET_ARCHITECTURE "i80200") - elseif(_cpu_part STREQUAL "0x210") - set(TARGET_ARCHITECTURE "pxa250a") - elseif(_cpu_part STREQUAL "0x212") - set(TARGET_ARCHITECTURE "pxa210a") - elseif(_cpu_part STREQUAL "0x242") - set(TARGET_ARCHITECTURE "i80321-400") - elseif(_cpu_part STREQUAL "0x243") - set(TARGET_ARCHITECTURE "i80321-600") - elseif(_cpu_part STREQUAL "0x290") - set(TARGET_ARCHITECTURE "pxa250b") - elseif(_cpu_part STREQUAL "0x292") - set(TARGET_ARCHITECTURE "pxa210b") - elseif(_cpu_part STREQUAL "0x2c2") - set(TARGET_ARCHITECTURE "i80321-400-b0") - elseif(_cpu_part STREQUAL "0x2c3") - set(TARGET_ARCHITECTURE "i80321-600-b0") - elseif(_cpu_part STREQUAL "0x2d0") - set(TARGET_ARCHITECTURE "pxa250c") - elseif(_cpu_part STREQUAL "0x2d2") - set(TARGET_ARCHITECTURE "pxa210c") - elseif(_cpu_part STREQUAL "0x411") - set(TARGET_ARCHITECTURE "pxa27x") - elseif(_cpu_part STREQUAL "0x41c") - set(TARGET_ARCHITECTURE "ipx425-533") - elseif(_cpu_part STREQUAL "0x41d") - set(TARGET_ARCHITECTURE "ipx425-400") - elseif(_cpu_part STREQUAL "0x41f") - 
set(TARGET_ARCHITECTURE "ipx425-266") - elseif(_cpu_part STREQUAL "0x682") - set(TARGET_ARCHITECTURE "pxa32x") - elseif(_cpu_part STREQUAL "0x683") - set(TARGET_ARCHITECTURE "pxa930") - elseif(_cpu_part STREQUAL "0x688") - set(TARGET_ARCHITECTURE "pxa30x") - elseif(_cpu_part STREQUAL "0x689") - set(TARGET_ARCHITECTURE "pxa31x") - elseif(_cpu_part STREQUAL "0xb11") - set(TARGET_ARCHITECTURE "sa1110") - elseif(_cpu_part STREQUAL "0xc12") - set(TARGET_ARCHITECTURE "ipx1200") - endif() - - # Taken from /Library/Developer/CommandLineTools/SDKs/MacOSX11.1.sdk/System/Library/Frameworks/Kernel.framework/Versions/A/Headers/mach/machine.h - elseif(_cpu_implementer STREQUAL "16777228" OR _cpu_implementer STREQUAL "0x100000C") # Apple ARM64 - if(_cpu_part STREQUAL "0x1e2d6381" OR _cpu_part STREQUAL "506291073") # Swift (A6) - set(TARGET_ARCHITECTURE "apple-a6") - elseif(_cpu_part STREQUAL "0x37a09642" OR _cpu_part STREQUAL "933271106") # Cyclone (A7) - set(TARGET_ARCHITECTURE "apple-a7") - elseif(_cpu_part STREQUAL "0x2c91a47e" OR _cpu_part STREQUAL "747742334") # Typhoon (A8) - set(TARGET_ARCHITECTURE "apple-a8") - elseif(_cpu_part STREQUAL "0x92fb37c8" OR _cpu_part STREQUAL "2465937352") # Twister (A9) - set(TARGET_ARCHITECTURE "apple-a9") - elseif(_cpu_part STREQUAL "0x67ceee93" OR _cpu_part STREQUAL "1741614739") # Hurrican (A10) - set(TARGET_ARCHITECTURE "apple-a10") - elseif(_cpu_part STREQUAL "0xe81e7ef6" OR _cpu_part STREQUAL "3894312694") # Monsoon Mistral (A11) - set(TARGET_ARCHITECTURE "apple-a11") - elseif(_cpu_part STREQUAL "0x07d34b9f" OR _cpu_part STREQUAL "131287967") # Vortex Tempest (A12) - set(TARGET_ARCHITECTURE "apple-a12") - elseif(_cpu_part STREQUAL "0x462504d2" OR _cpu_part STREQUAL "1176831186") # Lightning Thunder (A13) - set(TARGET_ARCHITECTURE "apple-a13") - elseif(_cpu_part STREQUAL "0x1b588bb3" OR _cpu_part STREQUAL "458787763") # Firestorm Icestorm (M1) - set(TARGET_ARCHITECTURE "apple-m1") - endif() - endif() -endmacro(OFA_AutodetectArm) - 
-macro(OFA_AutodetectPpc) - set(_cpu) +#============================================================================= +# Autodetection of CPU +#============================================================================= - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - file(READ "/proc/cpuinfo" _cpuinfo) - string(REGEX REPLACE ".*cpu[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu "${_cpuinfo}") - if(_cpu STREQUAL "POWER8" OR _cpu STREQUAL "POWER8NVL") - set(TARGET_ARCHITECTURE "power8") - elseif(_cpu STREQUAL "POWER9" OR _cpu STREQUAL "POWER9NVL") - set(TARGET_ARCHITECTURE "power9") - elseif(_cpu STREQUAL "POWER10" OR _cpu STREQUAL "POWER10NVL") - set(TARGET_ARCHITECTURE "power10") - endif() - elseif(CMAKE_SYSTEM_NAME STREQUAL "AIX") - endif() -endmacro(OFA_AutodetectPpc) +include(ofa/AutodetectX86) +include(ofa/AutodetectArm) +include(ofa/AutodetectPpc) macro(OFA_AutodetectHostArchitecture) set(TARGET_ARCHITECTURE "generic") set(OFA_ARCHITECTURE_FLAGS) + if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "i686.*|i386.*|x86.*|amd64.*|x86_64.*|AMD64.*") OFA_AutodetectX86() elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(arm.*|ARM.*|aarch64.*|AARCH64.*)") @@ -655,1323 +86,36 @@ macro(OFA_AutodetectHostArchitecture) elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(powerpc|ppc)64.*") OFA_AutodetectPpc() else() - message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") + message(WARNING "The CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}' is not supported by OptimizeForArchitecture") endif() endmacro(OFA_AutodetectHostArchitecture) -macro(OFA_HandleX86Options) - set(_march_flag_list) - set(_available_vector_units_list) - macro(_nehalem) - list(APPEND _march_flag_list "nehalem") - list(APPEND _march_flag_list "corei7") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2") - endmacro() - macro(_westmere) - list(APPEND _march_flag_list 
"westmere") - _nehalem() - endmacro() - macro(_sandybridge) - list(APPEND _march_flag_list "sandybridge") - list(APPEND _march_flag_list "corei7-avx") - _westmere() - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "avx") - endmacro() - macro(_ivybridge) - list(APPEND _march_flag_list "ivybridge") - list(APPEND _march_flag_list "core-avx-i") - _sandybridge() - list(APPEND _available_vector_units_list "rdrnd" "f16c") - endmacro() - macro(_haswell) - list(APPEND _march_flag_list "haswell") - list(APPEND _march_flag_list "core-avx2") - _ivybridge() - list(APPEND _available_vector_units_list "avx2" "fma" "bmi" "bmi2") - endmacro() - macro(_broadwell) - list(APPEND _march_flag_list "broadwell") - _haswell() - list(APPEND _available_vector_units_list "rdseed") - endmacro() - macro(_skylake) - list(APPEND _march_flag_list "skylake") - _broadwell() - endmacro() - macro(_skylake_avx512) - list(APPEND _march_flag_list "skylake-avx512") - _skylake() - list(APPEND _available_vector_units_list "avx512f" "avx512cd" "avx512dq" "avx512bw" "avx512vl") - endmacro() - macro(_cascadelake) - list(APPEND _march_flag_list "cascadelake") - _skylake_avx512() - list(APPEND _available_vector_units_list "avx512vnni") - endmacro() - macro(_cooperlake) - list(APPEND _march_flag_list "cooperlake") - _skylake_avx512() - list(APPEND _available_vector_units_list "avx512bf16" "avx512vnni") - endmacro() - macro(_cannonlake) - list(APPEND _march_flag_list "cannonlake") - _skylake_avx512() - list(APPEND _available_vector_units_list "avx512ifma" "avx512vbmi") - endmacro() - macro(_icelake) - list(APPEND _march_flag_list "icelake-client") - _skylake_avx512() - list(APPEND _available_vector_units_list "avx512bitalg" "avx512ifma" "avx512vbmi2" "avx512vbmi" "avx512vnni" "avx512vpopcntdq" "rdpid") - endmacro() - macro(_icelake_avx512) - list(APPEND _march_flag_list "icelake-server") - _skylake_avx512() - list(APPEND _available_vector_units_list "avx512bitalg" 
"avx512ifma" "avx512vbmi2" "avx512vbmi" "avx512vnni" "avx512vpopcntdq" "rdpid") - endmacro() - macro(_tigerlake) - list(APPEND _march_flag_list "tigerlake") - _icelake() - list(APPEND _available_vector_units_list "avx512vp2intersect") - endmacro() - macro(_alderlake) - list(APPEND _march_flag_list "alderlake") - _tigerlake() - endmacro() - macro(_sapphirerapids) - list(APPEND _march_flag_list "sapphirerapids") - _icelake_avx512() - endmacro() - macro(_knightslanding) - list(APPEND _march_flag_list "knl") - _broadwell() - list(APPEND _available_vector_units_list "avx512f" "avx512pf" "avx512er" "avx512cd") - endmacro() - macro(_knightsmill) - list(APPEND _march_flag_list "knm") - _broadwell() - list(APPEND _available_vector_units_list "avx512f" "avx512pf" "avx512er" "avx512cd" "avx5124fmaps" "avx5124vnni" "avx512vpopcntdq") - endmacro() - macro(_silvermont) - list(APPEND _march_flag_list "silvermont") - _westmere() - list(APPEND _available_vector_units_list "rdrnd") - endmacro() - macro(_goldmont) - list(APPEND _march_flag_list "goldmont") - _silvermont() - list(APPEND _available_vector_units_list "rdseed") - endmacro() - macro(_goldmont_plus) - list(APPEND _march_flag_list "goldmont-plus") - _goldmont() - list(APPEND _available_vector_units_list "rdpid") - endmacro() - macro(_tremont) - list(APPEND _march_flag_list "tremont") - _goldmont_plus() - endmacro() - - if(TARGET_ARCHITECTURE STREQUAL "core") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3") - elseif(TARGET_ARCHITECTURE STREQUAL "merom") - list(APPEND _march_flag_list "merom") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") - elseif(TARGET_ARCHITECTURE STREQUAL "penryn") - list(APPEND _march_flag_list "penryn") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") - message(STATUS "Sadly the Penryn architecture exists in variants with 
SSE4.1 and without SSE4.1.") - if(_cpu_flags MATCHES "sse4_1") - message(STATUS "SSE4.1: enabled (auto-detected from this computer's CPU flags)") - list(APPEND _available_vector_units_list "sse4.1") - else() - message(STATUS "SSE4.1: disabled (auto-detected from this computer's CPU flags)") - endif() - elseif(TARGET_ARCHITECTURE STREQUAL "knm") - _knightsmill() - elseif(TARGET_ARCHITECTURE STREQUAL "knl") - _knightslanding() - elseif(TARGET_ARCHITECTURE STREQUAL "sapphirerapids") - _sapphirerapids() - elseif(TARGET_ARCHITECTURE STREQUAL "alderlake") - _alderlake() - elseif(TARGET_ARCHITECTURE STREQUAL "tigerlake") - _tigerlake() - elseif(TARGET_ARCHITECTURE STREQUAL "icelake") - _icelake() - elseif(TARGET_ARCHITECTURE STREQUAL "icelake-xeon" OR TARGET_ARCHITECTURE STREQUAL "icelake-avx512") - _icelake_avx512() - elseif(TARGET_ARCHITECTURE STREQUAL "cannonlake") - _cannonlake() - elseif(TARGET_ARCHITECTURE STREQUAL "cooperlake") - _cooperlake() - elseif(TARGET_ARCHITECTURE STREQUAL "cascadelake") - _cascadelake() - elseif(TARGET_ARCHITECTURE STREQUAL "kabylake") - _skylake() - elseif(TARGET_ARCHITECTURE STREQUAL "skylake-xeon" OR TARGET_ARCHITECTURE STREQUAL "skylake-avx512") - _skylake_avx512() - elseif(TARGET_ARCHITECTURE STREQUAL "skylake") - _skylake() - elseif(TARGET_ARCHITECTURE STREQUAL "broadwell") - _broadwell() - elseif(TARGET_ARCHITECTURE STREQUAL "haswell") - _haswell() - elseif(TARGET_ARCHITECTURE STREQUAL "ivybridge") - _ivybridge() - elseif(TARGET_ARCHITECTURE STREQUAL "sandybridge") - _sandybridge() - elseif(TARGET_ARCHITECTURE STREQUAL "westmere") - _westmere() - elseif(TARGET_ARCHITECTURE STREQUAL "nehalem") - _nehalem() - elseif(TARGET_ARCHITECTURE STREQUAL "tremont") - _tremont() - elseif(TARGET_ARCHITECTURE STREQUAL "goldmont-plus") - _goldmont_plus() - elseif(TARGET_ARCHITECTURE STREQUAL "goldmont") - _goldmont() - elseif(TARGET_ARCHITECTURE STREQUAL "silvermont") - _silvermont() - elseif(TARGET_ARCHITECTURE STREQUAL "bonnell") - list(APPEND 
_march_flag_list "bonnell") - list(APPEND _march_flag_list "atom") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") - elseif(TARGET_ARCHITECTURE STREQUAL "atom") - list(APPEND _march_flag_list "atom") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") - elseif(TARGET_ARCHITECTURE STREQUAL "k8") - list(APPEND _march_flag_list "k8") - list(APPEND _available_vector_units_list "sse" "sse2") - elseif(TARGET_ARCHITECTURE STREQUAL "k8-sse3") - list(APPEND _march_flag_list "k8-sse3") - list(APPEND _march_flag_list "k8") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3") - elseif(TARGET_ARCHITECTURE STREQUAL "amd16h") - list(APPEND _march_flag_list "btver2") - list(APPEND _march_flag_list "btver1") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "f16c") - elseif(TARGET_ARCHITECTURE STREQUAL "amd14h") - list(APPEND _march_flag_list "btver1") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "zen3") - list(APPEND _march_flag_list "znver2") - list(APPEND _march_flag_list "znver1") - _skylake() - list(APPEND _available_vector_units_list "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "zen2") - list(APPEND _march_flag_list "znver2") - list(APPEND _march_flag_list "znver1") - _skylake() - list(APPEND _available_vector_units_list "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "zen") - list(APPEND _march_flag_list "znver1") - _skylake() - list(APPEND _available_vector_units_list "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "excavator") - list(APPEND _march_flag_list "bdver4") - list(APPEND _march_flag_list "bdver3") - list(APPEND _march_flag_list "bdver2") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - 
list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "avx2" "xop" "fma4" "fma" "f16c" "bmi" "bmi2" "rdrnd") - elseif(TARGET_ARCHITECTURE STREQUAL "steamroller") - list(APPEND _march_flag_list "bdver3") - list(APPEND _march_flag_list "bdver2") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c" "bmi") - elseif(TARGET_ARCHITECTURE STREQUAL "piledriver") - list(APPEND _march_flag_list "bdver2") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c" "bmi") - elseif(TARGET_ARCHITECTURE STREQUAL "interlagos") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") - elseif(TARGET_ARCHITECTURE STREQUAL "bulldozer") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") - elseif(TARGET_ARCHITECTURE STREQUAL "barcelona") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "istanbul") - list(APPEND _march_flag_list "barcelona") - list(APPEND 
_march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "magny-cours") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "generic") - list(APPEND _march_flag_list "generic") - elseif(TARGET_ARCHITECTURE STREQUAL "native") - list(APPEND _march_flag_list "native") - elseif(TARGET_ARCHITECTURE STREQUAL "none") - # add this clause to remove it from the else clause - else() - message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.") - endif() - - if(TARGET_ARCHITECTURE STREQUAL "native") - if(MSVC) - message(FATAL_ERROR, "MSVC does not support \"native\" flag.") - elseif(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") - # ICC (on Linux) - AddCompilerFlag("-xHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(CMAKE_CXX_COMPILER MATCHES "/(icl\\.exe)$") - # ICC (on Windows) - AddCompilerFlag("/QxHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(CMAKE_CXX_COMPILER MATCHES "/(pgcc|pgc\\+\\+)$") - # PGI (on Linux) - AddCompilerFlag("-tp=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(CMAKE_CXX_COMPILER MATCHES "/(suncc|sunCC)$") - # Sun/Oracle Studio (on Linux/Sun OS) - AddCompilerFlag("-native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - else() - AddCompilerFlag("-march=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() - elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") - set(_disable_vector_unit_list) - set(_enable_vector_unit_list) - if(DEFINED OFA_SSE_INTRINSICS_BROKEN AND OFA_SSE_INTRINSICS_BROKEN) - message(STATUS "SSE disabled because of old/broken toolchain") - set(_sse_broken true) - set(_avx_broken true) - set(_avx2_broken true) - set(_avx512_broken true) - set(_fma4_broken true) - set(_xop_broken true) - elseif(DEFINED OFA_AVX_INTRINSICS_BROKEN AND OFA_AVX_INTRINSICS_BROKEN) - 
message(STATUS "AVX disabled because of old/broken toolchain") - set(_sse_broken false) - set(_avx_broken true) - set(_avx2_broken true) - set(_avx512_broken true) - set(_fma4_broken true) - set(_xop_broken true) - else() - set(_sse_broken false) - set(_avx_broken false) - if(DEFINED OFA_FMA4_INTRINSICS_BROKEN AND OFA_FMA4_INTRINSICS_BROKEN) - message(STATUS "FMA4 disabled because of old/broken toolchain") - set(_fma4_broken true) - else() - set(_fma4_broken false) - endif() - if(DEFINED OFA_XOP_INTRINSICS_BROKEN AND OFA_XOP_INTRINSICS_BROKEN) - message(STATUS "XOP disabled because of old/broken toolchain") - set(_xop_broken true) - else() - set(_xop_broken false) - endif() - if(DEFINED OFA_AVX2_INTRINSICS_BROKEN AND OFA_AVX2_INTRINSICS_BROKEN) - message(STATUS "AVX2 disabled because of old/broken toolchain") - set(_avx2_broken true) - else() - set(_avx2_broken false) - endif() - if(DEFINED OFA_AVX512_INTRINSICS_BROKEN AND OFA_AVX512_INTRINSICS_BROKEN) - message(STATUS "AVX512 disabled because of old/broken toolchain") - set(_avx512_broken true) - else() - set(_avx512_broken false) - endif() - endif() - - macro(_enable_or_disable _name _flag _documentation _broken) - if(_broken) - set(_found false) - else() - _my_find(_available_vector_units_list "${_flag}" _found) - endif() - set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) - mark_as_advanced(USE_${_name}) - if(USE_${_name}) - list(APPEND _enable_vector_unit_list "${_flag}") - else() - list(APPEND _disable_vector_unit_list "${_flag}") - endif() - endmacro() - _enable_or_disable(AVX "avx" "Use AVX. This will all floating-point vector sizes relative to SSE." _avx_broken) - _enable_or_disable(AVX2 "avx2" "Use AVX2. This will double all of the vector sizes relative to SSE." _avx2_broken) - _enable_or_disable(AVX512BF16 "avx512bf16" "Use AVX512BF16." _avx512_broken) - _enable_or_disable(AVX512BITALG "avx512bitalg" "Use AVX512BITALG." 
_avx512_broken) - _enable_or_disable(AVX512BW "avx512bw" "Use AVX512BW." _avx512_broken) - _enable_or_disable(AVX512CD "avx512cd" "Use AVX512CD." _avx512_broken) - _enable_or_disable(AVX512DQ "avx512dq" "Use AVX512DQ." _avx512_broken) - _enable_or_disable(AVX512ER "avx512er" "Use AVX512ER. This enables exponential and reciprocal instructions." _avx512_broken) - _enable_or_disable(AVX512F "avx512f" "Use AVX512F. This will double all floating-point vector sizes relative to AVX2." _avx512_broken) - _enable_or_disable(AVX512IFMA "avx512ifma" "Use AVX512IFMA." _avx512_broken) - _enable_or_disable(AVX512PF "avx512pf" "Use AVX512PF. This enables prefetch instructions for gathers and scatters." _avx512_broken) - _enable_or_disable(AVX512VBMI "avx512vbmi" "Use AVX512VBMI." _avx512_broken) - _enable_or_disable(AVX512VBMI2 "avx512vbmi2" "Use AVX512VBMI2." _avx512_broken) - _enable_or_disable(AVX512VL "avx512vl" "Use AVX512VL. This enables 128- and 256-bit vector length instructions with EVEX coding (improved write-masking & more vector registers)." _avx2_broken) - _enable_or_disable(AVX512VNNI "avx512vnni" "Use AVX512VNNI." _avx512_broken) - _enable_or_disable(AVX512VP2INTERSECT "avx512vp2intersect" "Use AVX512VP2INTERSECT." _avx512_broken) - _enable_or_disable(AVX512VPOPCNTDQ "avx512vpopcntdq" "Use AVX512VPOPCNTDQ." _avx512_broken) - _enable_or_disable(BMI "bmi2" "Use BMI." _avx_broken) - _enable_or_disable(BMI2 "bmi2" "Use BMI2." _avx_broken) - _enable_or_disable(FMA "fma" "Use FMA." _avx_broken) - _enable_or_disable(FMA4 "fma4" "Use FMA4." _fma4_broken) - _enable_or_disable(SSE2 "sse2" "Use SSE2. If SSE2 instructions are not enabled the SSE implementation will be disabled." _sse_broken) - _enable_or_disable(SSE3 "sse3" "Use SSE3. If SSE3 instructions are not enabled they will be emulated." _sse_broken) - _enable_or_disable(SSE4_1 "sse4.1" "Use SSE4.1. If SSE4.1 instructions are not enabled they will be emulated." 
_sse_broken) - _enable_or_disable(SSE4_2 "sse4.2" "Use SSE4.2. If SSE4.2 instructions are not enabled they will be emulated." _sse_broken) - _enable_or_disable(SSE4a "sse4a" "Use SSE4a. If SSE4a instructions are not enabled they will be emulated." _sse_broken) - _enable_or_disable(SSSE3 "ssse3" "Use SSSE3. If SSSE3 instructions are not enabled they will be emulated." _sse_broken) - _enable_or_disable(XOP "xop" "Use XOP." _xop_broken) - - if(MSVC AND MSVC_VERSION GREATER 1700) - # MSVC on 32 bit can select /arch:SSE2 (since 2010 also /arch:AVX) - # MSVC on 64 bit cannot select anything (should have changed with MSVC 2010) - _my_find(_enable_vector_unit_list "avx2" _found) - if(_found) - AddCompilerFlag("/arch:AVX2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) - endif() - if(NOT _found) - _my_find(_enable_vector_unit_list "avx" _found) - if(_found) - AddCompilerFlag("/arch:AVX" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) - endif() - endif() - if(NOT _found) - _my_find(_enable_vector_unit_list "sse2" _found) - if(_found) - AddCompilerFlag("/arch:SSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() - endif() - foreach(_flag ${_enable_vector_unit_list}) - string(TOUPPER "${_flag}" _flag) - string(REPLACE "." 
"_" _flag "__${_flag}__") - add_definitions("-D${_flag}") - endforeach(_flag) - elseif(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") # ICC (on Linux) - set(OFA_map_knl "-xMIC-AVX512") - set(OFA_map_knm "-xMIC-AVX512") - set(OFA_map_sapphirerapids "-xCORE-AVX512") - set(OFA_map_alderlake "-xCORE-AVX512") - set(OFA_map_tigerlake "-xCORE-AVX512") - set(OFA_map_icelake-avx512 "-xCORE-AVX512") - set(OFA_map_icelake "-xCORE-AVX512") - set(OFA_map_cannonlake "-xCORE-AVX512") - set(OFA_map_cooperlake "-xCORE-AVX512") - set(OFA_map_cascadelake "-xCORE-AVX512") - set(OFA_map_skylake-avx512 "-xCORE-AVX512") - set(OFA_map_skylake "-xCORE-AVX2") - set(OFA_map_broadwell "-xCORE-AVX2") - set(OFA_map_haswell "-xCORE-AVX2") - set(OFA_map_ivybridge "-xCORE-AVX-I") - set(OFA_map_sandybridge "-xAVX") - set(OFA_map_westmere "-xSSE4.2") - set(OFA_map_nehalem "-xSSE4.2") - set(OFA_map_penryn "-xSSSE3") - set(OFA_map_merom "-xSSSE3") - set(OFA_map_core2 "-xSSE3") - set(_ok FALSE) - foreach(arch ${_march_flag_list}) - if(DEFINED OFA_map_${arch}) - AddCompilerFlag(${OFA_map_${arch}} CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _ok) - if(_ok) - break() - endif() - endif() - endforeach() - if(NOT _ok) - # This is the Intel compiler, so SSE2 is a very reasonable baseline. 
- message(STATUS "Did not recognize the requested architecture flag, falling back to SSE2") - AddCompilerFlag("-xSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() - else() # not MSVC and not ICC => GCC, Clang, Open64 - foreach(_flag ${_march_flag_list}) - AddCompilerFlag("-march=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - if(_good) - break() - endif(_good) - endforeach(_flag) - foreach(_flag ${_enable_vector_unit_list}) - AddCompilerFlag("-m${_flag}" CXX_RESULT _result) - if(_result) - set(_header FALSE) - if(_flag STREQUAL "sse3") - set(_header "pmmintrin.h") - elseif(_flag STREQUAL "ssse3") - set(_header "tmmintrin.h") - elseif(_flag STREQUAL "sse4.1") - set(_header "smmintrin.h") - elseif(_flag STREQUAL "sse4.2") - set(_header "nmmintrin.h") - elseif(_flag STREQUAL "sse4a") - set(_header "ammintrin.h") - elseif(_flag STREQUAL "avx") - set(_header "immintrin.h") - elseif(_flag STREQUAL "avx2") - set(_header "immintrin.h") - elseif(_flag STREQUAL "avx512*") - set(_header "immintrin.h") - elseif(_flag STREQUAL "fma4") - set(_header "x86intrin.h") - elseif(_flag STREQUAL "xop") - set(_header "x86intrin.h") - elseif(_flag STREQUAL "bmi") - set(_header "ammintrin.h") - elseif(_flag STREQUAL "bmi2") - set(_header "ammintrin.h") - elseif(_flag STREQUAL "rdrnd") - set(_header "immintrin.h") - elseif(_flag STREQUAL "rdpid") - set(_header "immintrin.h") - elseif(_flag STREQUAL "rdseed") - set(_header "immintrin.h") - endif() - set(_resultVar "HAVE_${_header}") - string(REPLACE "." "_" _resultVar "${_resultVar}") - if(_header) - CHECK_INCLUDE_FILE_CXX("${_header}" ${_resultVar} "-m${_flag}") - if(NOT ${_resultVar}) - set(_useVar "USE_${_flag}") - string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "." 
"_" _useVar "${_useVar}") - message(STATUS "disabling ${_useVar} because ${_header} is missing") - set(${_useVar} FALSE) - list(APPEND _disable_vector_unit_list "${_flag}") - endif() - endif() - if(NOT _header OR ${_resultVar}) - list(APPEND OFA_ARCHITECTURE_FLAGS "-m${_flag}") - endif() - endif() - endforeach(_flag) - foreach(_flag ${_disable_vector_unit_list}) - AddCompilerFlag("-mno-${_flag}" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endforeach(_flag) - endif() - endif() - - # Compile code with profiling instrumentation - if(TARGET_PROFILER STREQUAL "gprof") - AddCompilerFlag("-pg" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(TARGET_PROFILER STREQUAL "vtune" AND CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") - AddCompilerFlag("-g" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-debug inline-debug-info" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-D TBB_USE_THREADING_TOOLS" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-parallel-source-info=2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-gline-tables-only" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-fdebug-info-for-profiling" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-Xsprofile" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() -endmacro(OFA_HandleX86Options) - -macro(OFA_HandleArmOptions) - set(_march_flag_list) - set(_mtune_flag_list) - set(_available_vector_units_list) - - if(TARGET_ARCHITECTURE STREQUAL "strongarm") # ARM - list(APPEND _mtune_flag_list "strongarm") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "arm8") - list(APPEND _mtune_flag_list "arm8") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "arm810") - list(APPEND _mtune_flag_list "arm810") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "fa526") - list(APPEND _mtune_flag_list "fa526") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "fa626") - list(APPEND _mtune_flag_list "fa626") - 
list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "arm7tdmi") - list(APPEND _mtune_flag_list "arm7tdmi") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm7tdmi-s") - list(APPEND _mtune_flag_list "arm7tdmi-s") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm710t") - list(APPEND _mtune_flag_list "arm710t") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm720t") - list(APPEND _mtune_flag_list "arm720t") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm740t") - list(APPEND _mtune_flag_list "arm740t") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm9") - list(APPEND _mtune_flag_list "arm9") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm9tdmi") - list(APPEND _mtune_flag_list "arm9tdmi") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm920") - list(APPEND _mtune_flag_list "arm920") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm920t") - list(APPEND _mtune_flag_list "arm920t") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm922t") - list(APPEND _mtune_flag_list "arm922t") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm940t") - list(APPEND _mtune_flag_list "arm940t") - list(APPEND _march_flag_list "armv4t") - - elseif(TARGET_ARCHITECTURE STREQUAL "arm1020t") - list(APPEND _mtune_flag_list "arm1020t") - list(APPEND _march_flag_list "armv5t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm10tdmi") - list(APPEND _mtune_flag_list "arm10tdmi") - list(APPEND _march_flag_list "armv5t") - - elseif(TARGET_ARCHITECTURE STREQUAL "arm9e") - list(APPEND _mtune_flag_list "arm9e") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm946e-s") - 
list(APPEND _mtune_flag_list "arm946e-s") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm966e-s") - list(APPEND _mtune_flag_list "arm966e-s") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm968e-s") - list(APPEND _mtune_flag_list "arm968e-s") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm10e") - list(APPEND _mtune_flag_list "arm10e") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1020e") - list(APPEND _mtune_flag_list "arm1020e") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1022e") - list(APPEND _mtune_flag_list "arm1022e") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "xscale") - list(APPEND _mtune_flag_list "xscale") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "iwmmxt") - list(APPEND _mtune_flag_list "iwmmxt") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "iwmmxt2") - list(APPEND _mtune_flag_list "iwmmxt2") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "fa606te") - list(APPEND _mtune_flag_list "fa606te") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "fa626te") - list(APPEND _mtune_flag_list "fa626te") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "fmp626") - list(APPEND _mtune_flag_list "fmp626") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "fa726te") - list(APPEND _mtune_flag_list "fa726te") - list(APPEND _march_flag_list "armv5te") - 
elseif(TARGET_ARCHITECTURE STREQUAL "arm926ej-s") - list(APPEND _mtune_flag_list "arm926ej-s") - list(APPEND _march_flag_list "armv5tej") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1026ej-s") - list(APPEND _mtune_flag_list "arm1026ej-s") - list(APPEND _march_flag_list "armv5tej") - list(APPEND _available_vector_units_list "fp") - - elseif(TARGET_ARCHITECTURE STREQUAL "mpcore") - list(APPEND _mtune_flag_list "mpcore") - list(APPEND _march_flag_list "armv6k") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1136j-s") - list(APPEND _mtune_flag_list "arm1136j-s") - list(APPEND _march_flag_list "armv6j") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1136jf-s") - list(APPEND _mtune_flag_list "arm1136jf-s") - list(APPEND _march_flag_list "armv6j") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1156t2-s") - list(APPEND _mtune_flag_list "arm1156t2-s") - list(APPEND _march_flag_list "armv6t2") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1156t2f-s") - list(APPEND _mtune_flag_list "arm1156t2f-s") - list(APPEND _march_flag_list "armv6t2") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1176jz-s") - list(APPEND _mtune_flag_list "arm1176jz-s") - list(APPEND _march_flag_list "armv6kz") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1176jzf-s") - list(APPEND _mtune_flag_list "arm1176jzf-s") - list(APPEND _march_flag_list "armv6kz") - list(APPEND _available_vector_units_list "fp") - - elseif(TARGET_ARCHITECTURE STREQUAL "generic-armv7-a") - list(APPEND _mtune_flag_list "generic-armv7-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "mp" "sec" "vfpv3-d16" "vfpv3" "vfpv3-d16-fp16" "vfpv3-fp16" "vfpv4-d16" "vfpv4" "simd" "neon-fp16" "neon-vfpv4") - - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a5") - list(APPEND _mtune_flag_list "cortex-a5") - list(APPEND _march_flag_list "armv7-a") 
- list(APPEND _available_vector_units_list "mp" "sec" "neon-fp16") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a7") - list(APPEND _mtune_flag_list "cortex-a7") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a8") - list(APPEND _mtune_flag_list "cortex-a8") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "sec" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a9") - list(APPEND _mtune_flag_list "cortex-a9") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "mp" "sec" "neon-fp16") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a12") - list(APPEND _mtune_flag_list "cortex-a12") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a15") - list(APPEND _mtune_flag_list "cortex-a15") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a15.cortex-a7") - list(APPEND _mtune_flag_list "cortex-a15.cortex-a7") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a17") - list(APPEND _mtune_flag_list "cortex-a17") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a17.cortex-a7") - list(APPEND _mtune_flag_list "cortex-a17.cortex-a7") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a32") - list(APPEND _mtune_flag_list "cortex-a32") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a34") - list(APPEND _mtune_flag_list "cortex-a34") - list(APPEND 
_march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a35") - list(APPEND _mtune_flag_list "cortex-a35") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a53") - list(APPEND _mtune_flag_list "cortex-a53") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a55") - list(APPEND _mtune_flag_list "cortex-a55") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a57") - list(APPEND _mtune_flag_list "cortex-a57") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a57.cortext-a53") - list(APPEND _mtune_flag_list "cortex-a57.cortext-a53") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a72") - list(APPEND _mtune_flag_list "cortex-a72") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a72.cortext-a53") - list(APPEND _mtune_flag_list "cortex-a72.cortext-a53") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE 
STREQUAL "cortex-a73") - list(APPEND _mtune_flag_list "cortex-a73") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortext-a35") - list(APPEND _mtune_flag_list "cortex-a73.cortext-a35") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortext-a53") - list(APPEND _mtune_flag_list "cortex-a73.cortext-a53") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a75") - list(APPEND _mtune_flag_list "cortex-a75") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a75.cortext-a55") - list(APPEND _mtune_flag_list "cortex-a75.cortext-a55") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76") - list(APPEND _mtune_flag_list "cortex-a76") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76.cortext-a55") - list(APPEND _mtune_flag_list "cortex-a76.cortext-a55") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list 
"armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76ae") - list(APPEND _mtune_flag_list "cortex-a76ae") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a77") - list(APPEND _mtune_flag_list "cortex-a77") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4") - list(APPEND _mtune_flag_list "cortex-r4") - list(APPEND _march_flag_list "armv7-r") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4f") - list(APPEND _mtune_flag_list "cortex-r4f") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r5") - list(APPEND _mtune_flag_list "cortex-r5") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "idiv" "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r7") - list(APPEND _mtune_flag_list "cortex-r7") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "idiv" "vfpv3-d16-fp16") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r8") - list(APPEND _mtune_flag_list "cortex-r8") - list(APPEND _march_flag_list "armv7-r") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r52") - list(APPEND _mtune_flag_list "cortex-r52") - list(APPEND _march_flag_list "armv8-r") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "crc" "simd" "idiv" "vfpv3-d16-fp16") - - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m0") - list(APPEND _mtune_flag_list "cortex-m0") - list(APPEND _march_flag_list "armv6s-m") 
- elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m0plus") - list(APPEND _mtune_flag_list "cortex-m0plus") - list(APPEND _march_flag_list "armv6s-m") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m1") - list(APPEND _mtune_flag_list "cortex-m1") - list(APPEND _march_flag_list "armv6s-m") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m3") - list(APPEND _mtune_flag_list "cortex-m3") - list(APPEND _march_flag_list "armv7-m") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m4") - list(APPEND _mtune_flag_list "cortex-m4") - list(APPEND _march_flag_list "armv7e-m") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m7") - list(APPEND _mtune_flag_list "cortex-m7") - list(APPEND _march_flag_list "armv7e-m") - list(APPEND _available_vector_units_list "fp.dp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m23") - list(APPEND _mtune_flag_list "cortex-m23") - list(APPEND _march_flag_list "armv8-m.base") - list(APPEND _march_flag_list "armv7-m") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m33") - list(APPEND _mtune_flag_list "cortex-m33") - list(APPEND _march_flag_list "armv8-m.main") - list(APPEND _march_flag_list "armv7-m") - list(APPEND _available_vector_units_list "dsp" "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m35p") - list(APPEND _mtune_flag_list "cortex-m35p") - list(APPEND _march_flag_list "armv8-m.main") - list(APPEND _march_flag_list "armv7-m") - list(APPEND _available_vector_units_list "dsp" "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m55") - list(APPEND _mtune_flag_list "cortex-m55") - list(APPEND _march_flag_list "armv8.1-m.main") - list(APPEND _march_flag_list "armv8-m") - list(APPEND _march_flag_list "armv7-m") - list(APPEND _available_vector_units_list "mve.fp" "fp.dp") - - elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n1") - list(APPEND _mtune_flag_list "neoverse-n1") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - 
list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-e1") - list(APPEND _mtune_flag_list "neoverse-e1") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dorprod") - - elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b15") # Broadcom - list(APPEND _mtune_flag_list "brahma-b15") - elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b53") - list(APPEND _mtune_flag_list "brahma-b53") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderx2") - list(APPEND _mtune_flag_list "thunderx2") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crypto") - - elseif(TARGET_ARCHITECTURE STREQUAL "thunderx") # Cavium - list(APPEND _mtune_flag_list "thunderx") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt88") - list(APPEND _mtune_flag_list "thunderxt88") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt81") - list(APPEND _mtune_flag_list "thunderxt81") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt83") - list(APPEND _mtune_flag_list "thunderxt83") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderx2t99") - list(APPEND _mtune_flag_list "thunderx2t99") - 
list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto") - - elseif(TARGET_ARCHITECTURE STREQUAL "strongarm110") # DEC - list(APPEND _mtune_flag_list "strongarm110") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "strongarm1100") - list(APPEND _mtune_flag_list "strongarm1100") - list(APPEND _march_flag_list "armv4") - - elseif(TARGET_ARCHITECTURE STREQUAL "a64fx") # FUJITSU - list(APPEND _mtune_flag_list "a64fx") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "sve") - - elseif(TARGET_ARCHITECTURE STREQUAL "tsv110") # HiSilicon - list(APPEND _mtune_flag_list "tsv110") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "aes" "crypto" "fp16" "sha2") - - elseif(TARGET_ARCHITECTURE STREQUAL "denver") # Nvidia - list(APPEND _mtune_flag_list "denver") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - elseif(TARGET_ARCHITECTURE STREQUAL "denver2") - list(APPEND _mtune_flag_list "denver2") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - elseif(TARGET_ARCHITECTURE STREQUAL "carmel") - list(APPEND _mtune_flag_list "denver") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" 
"simd" "vfpv3" "vfpv4") - - elseif(TARGET_ARCHITECTURE STREQUAL "xgene1") # APM - list(APPEND _mtune_flag_list "xgene1") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - - elseif(TARGET_ARCHITECTURE STREQUAL "scorpion") # Qualcomm - list(APPEND _mtune_flag_list "scorpion") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "krait") - list(APPEND _mtune_flag_list "krait") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "kryo") - list(APPEND _mtune_flag_list "kryo") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "kryo2") - list(APPEND _mtune_flag_list "kryo2") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "falkor") - list(APPEND _mtune_flag_list "falkor") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "saphira") - list(APPEND _mtune_flag_list "saphira") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - - elseif(TARGET_ARCHITECTURE STREQUAL "exynos-m1") # Samsung - list(APPEND _mtune_flag_list "exynos-m1") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crypto" "simd") - - elseif(TARGET_ARCHITECTURE STREQUAL "marvell-f") # Marvell - list(APPEND _mtune_flag_list "marvell-f") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL 
"marvell-pj4") - list(APPEND _mtune_flag_list "marvell-pj4") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "mp" "sec" "fp") - - elseif(TARGET_ARCHITECTURE STREQUAL "i80200") # Intel - list(APPEND _mtune_flag_list "i80200") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa250a") - list(APPEND _mtune_flag_list "pxa250a") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa210a") - list(APPEND _mtune_flag_list "pxa210a") - elseif(TARGET_ARCHITECTURE STREQUAL "i80321-400") - list(APPEND _mtune_flag_list "i80321-400") - elseif(TARGET_ARCHITECTURE STREQUAL "i80321-600") - list(APPEND _mtune_flag_list "i80321-600") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa250b") - list(APPEND _mtune_flag_list "pxa250b") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa210b") - list(APPEND _mtune_flag_list "pxa210b") - elseif(TARGET_ARCHITECTURE STREQUAL "i80321-400-b0") - list(APPEND _mtune_flag_list "i80321-400-b0") - elseif(TARGET_ARCHITECTURE STREQUAL "i80321-600-b0") - list(APPEND _mtune_flag_list "i80321-600-b0") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa250c") - list(APPEND _mtune_flag_list "pxa250c") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa210c") - list(APPEND _mtune_flag_list "pxa210c") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa27x") - list(APPEND _mtune_flag_list "pxa27x") - elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-533") - list(APPEND _mtune_flag_list "ipx425-533") - elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-400") - list(APPEND _mtune_flag_list "ipx425-400") - elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-266") - list(APPEND _mtune_flag_list "ipx425-266") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa32x") - list(APPEND _mtune_flag_list "pxa32x") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa930") - list(APPEND _mtune_flag_list "pxa930") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa30x") - list(APPEND _mtune_flag_list "pxa30x") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa31x") - list(APPEND _mtune_flag_list "pxa31x") - elseif(TARGET_ARCHITECTURE STREQUAL "sa1110") - 
list(APPEND _mtune_flag_list "sa1110") - elseif(TARGET_ARCHITECTURE STREQUAL "ipx1200") - list(APPEND _mtune_flag_list "ipx1200") - - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a6") # Apple - list(APPEND _mtune_flag_list "apple-a6") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a7") - list(APPEND _mtune_flag_list "apple-a7") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a8") - list(APPEND _mtune_flag_list "apple-a8") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a9") - list(APPEND _mtune_flag_list "apple-a9") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a10") - list(APPEND _mtune_flag_list "apple-a10") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "neon" "rdm" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a11") - list(APPEND _mtune_flag_list "apple-a11") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "lse" "neon" "ras" "rdm" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a12") - list(APPEND _mtune_flag_list "apple-a12") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE 
STREQUAL "apple-a13") - list(APPEND _mtune_flag_list "apple-a13") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "dotprod" "fp" "fp16" "fp16fml" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "sha3" "sm4" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-m1") - list(APPEND _mtune_flag_list "vortex") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") - - elseif(TARGET_ARCHITECTURE STREQUAL "native") - list(APPEND _march_flag_list "native") - - elseif(TARGET_ARCHITECTURE STREQUAL "none") - # add this clause to remove it from the else clause - - else() - message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.") - endif() - - # Following the recommendation from - # https://community.arm.com/developer/tools-software/tools/b/tools-software-ides-blog/posts/compiler-flags-across-architectures-march-mtune-and-mcpu - # we first try to only use the -mcpu flag. If that fails, e.g., of - # the compiler does not yet support the specified target, we try to - # set the -march and -mtune flags as fallback option. 
- - if(TARGET_ARCHITECTURE STREQUAL "native") - AddCompilerFlag("-mcpu=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") - foreach(_flag ${_mtune_flag_list}) - - AddCompilerFlag("-mcpu=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - if(_good) - break() - endif(_good) - endforeach(_flag) - - if(NOT _good) - set(_disable_vector_unit_list) - set(_enable_vector_unit_list) - - set(_aes_broken false) - set(_bf16_broken false) - set(_crc_broken false) - set(_crypto_broken false) - set(_dotprod_broken false) - set(_dsp_broken false) - set(_fp16_broken false) - set(_fp16fml_broken false) - set(_fp_broken false) - set(_fp_dp_broken false) - set(_fp_sp_broken false) - set(_i8mm_broken false) - set(_idiv_broken false) - set(_lse_broken false) - set(_mve_broken false) - set(_mve_fp_broken false) - set(_neon_broken false) - set(_neon_fp16_broken false) - set(_neon_vfpv4_broken false) - set(_ras_broken false) - set(_rcpc_broken false) - set(_rdm_broken false) - set(_rdma_broken false) - set(_sec_broken false) - set(_sha2_broken false) - set(_sha3_broken false) - set(_simd_broken false) - set(_sm4_broken false) - set(_sve_broken false) - set(_vfpv3_broken false) - set(_vfpv3_d16_broken false) - set(_vfpv3_d16_fp16_broken false) - set(_vfpv3_fp16_broken false) - set(_vfpv4_broken false) - set(_vfpv4_d16_broken false) - set(_zcm_broken false) - set(_zcz_broken false) - - macro(_enable_or_disable _name _flag _documentation _broken) - if(_broken) - set(_found false) - else() - _my_find(_available_vector_units_list "${_flag}" _found) - endif() - set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) - mark_as_advanced(USE_${_name}) - if(USE_${_name}) - list(APPEND _enable_vector_unit_list "${_flag}") - else() - list(APPEND _disable_vector_unit_list "${_flag}") - endif() - endmacro() - - _enable_or_disable(AES "aes" "Use AES. This will enable the aes and pmull crypto extension." 
_aes_broken) - _enable_or_disable(BF16 "bf16" "Use BF16. This will enable the brain half-precision floating-point instructions." _bf16_broken) - _enable_or_disable(CRC "crc" "Use CRC. This will enable the Cyclic Redundancy Check (CRC) instructions." _crc_broken) - _enable_or_disable(CRYPTO "crypto" "Use CRYPTO. This will enable the cryptographic instructions." _crypto_broken) - _enable_or_disable(DOTPROD "dotprod" "Use DOTPROD. This will enable the Dot Product extension. This also enables Advanced SIMD instructions." _dotprod_broken) - _enable_or_disable(DSP "dsp" "Use DSP. This will enable the DSP instruction." _dsp_broken) - _enable_or_disable(FP "fp" "Use FP. This will enable the floating-point data processing instructions." _fp_broken) - _enable_or_disable(FP16 "fp16" "Use FP16. This will enable the half-precision floating-point data processing instructions." _fp16_broken) - _enable_or_disable(FP16FML "fp16fml" "Use FP16FML. This will enable the FP16 fmla extension." _fp16fml_broken) - _enable_or_disable(FP_DP "fp.dp" "Use FP.DP. This will enable the single- and double-precision floating-point instructions." _fp_dp_broken) - _enable_or_disable(FP_SP "fp.sp" "Use FP.SP. This will enable the single-precision floating-point instructions." _fp_sp_broken) - _enable_or_disable(I8MM "i8mm" "Use I8MM. This will enable the 8-bit Integer Matrix Multiply instructions." _i8mm_broken) - _enable_or_disable(IDIV "idiv" "Use IDIV. This will enable the ARM-state integer division instructions." _idiv_broken) - _enable_or_disable(LSE "lse" "Use LSE. This will enable the Large System Extension instructions." _lse_broken) - _enable_or_disable(MVE "mve" "Use MVE. This will enable the M-Profile Vector Extension (MVE) integer instructions." _mve_broken) - _enable_or_disable(MVE_FP "mve.fp" "Use MVE.FP. This will enable the M-Profile Vector Extension (MVE) integer and single precision floating-point instructions." _mve_fp_broken) - _enable_or_disable(NEON "neon" "Use NEON. 
This will enable the Advanced SIMD (Neon) v1." _neon_broken) - _enable_or_disable(NEON_FP16 "neon-fp16" "Use NEON-FP16. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions, with the half-precision floating-point conversion operations." _neon_fp16_broken) - _enable_or_disable(NEON_VFPV4 "neon-vfpv4" "Use NEON-VFPV4. This will enable the Advanced SIMD (Neon) v2 and the VFPv4 floating-point instructions." _neon_vfpv4_broken) - _enable_or_disable(RAS "ras" "Use RAS. This will enable the Reliability, Availability, and Serviceability extension." _ras_broken) - _enable_or_disable(RCPC "rcpc" "Use RCPC. This will enable the RcPc extension." _rcpc_broken) - _enable_or_disable(RDM "rdm" "Use RDM. This will enable the RDM extension." _rdm_broken) - _enable_or_disable(RDMA "rdma" "Use RDMA. This will enable the Round Double Multiply Accumulate instructions." _rdma_broken) - _enable_or_disable(SEC "sec" "Use SEC. This will enable the security extension." _sec_broken) - _enable_or_disable(SHA2 "sha2" "Use SHA2. This will enable the sha2 crypto extension." _sha2_broken) - _enable_or_disable(SHA3 "sha3" "Use SHA3. This will enable the sha512 and sha3 crypto extension." _sha3_broken) - _enable_or_disable(SIMD "simd" "Use SIMD. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions." _simd_broken) - _enable_or_disable(SM4 "sm4" "Use SM4. This will enable the the sm3 and sm4 crypto extension." _sm4_broken) - _enable_or_disable(SVE "sve" "Use SVE. This will enable the Scalable Vector Extension instructions." _sve_broken) - _enable_or_disable(VFPV3 "vfpv3" "Use VPFV3. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers." _vfpv3_broken) - _enable_or_disable(VFPV3_D16 "vfpv3-d16" "Use VPFV3-16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." 
_vfpv3_d16_broken) - _enable_or_disable(VFPV3_D16_FP16 "vfpv3-d16-fp16" "Use VPFV3-D16-FP16. This will enable VFPv3 floating-point instructions, with 16 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_fp16_broken) - _enable_or_disable(VFPV3_FP16 "vfpv3-fp16" "Use VPFV3-FP16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_fp16_broken) - _enable_or_disable(VFPV4 "vfpv4" "Use VPFV4. This will enable the VFPv4 floating-point instructions, with 32 double-precision registers." _vfpv4_broken) - _enable_or_disable(VFPV4_D16 "vfpv4-d16" "Use VPFV4-D16. This will enable the VFPv4 floating-point instructions, with 16 double-precision registers." _vfpv4_dp16_broken) - _enable_or_disable(ZCM "zcm" "Use ZCM. This will enable the ZCM extension." _zcm_broken) - _enable_or_disable(ZCZ "zcz" "Use ZCZ. This will enable the ZCZ extension." _zcz_broken) - foreach(_march ${_march_flag_list}) - - AddCompilerFlag("-march=${_march}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) - if(_good) - set(_march_plus_extensions "${_march}") - foreach(_flag ${_enable_vector_unit_list}) - AddCompilerFlag("-march=${_march_plus_extensions}+${_flag}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) - if(_good) - set(_march_plus_extensions "${_march_plus_extensions}+${_flag}") - endif(_good) - endforeach(_flag) - AddCompilerFlag("-march=${_march_plus_extensions}" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - break() - endif(_good) - endforeach(_march) - - foreach(_mtune ${_mtune_flag_list}) - AddCompilerFlag("-mtune=${_mtune}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - if(_good) - break() - endif(_good) - endforeach(_mtune) - endif(NOT _good) - endif() - - # Compile code with profiling instrumentation - if(TARGET_PROFILER STREQUAL "gprof") - AddCompilerFlag("-pg" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() -endmacro(OFA_HandleArmOptions) - 
-macro(OFA_HandlePpcOptions) - set(_march_flag_list) - if(TARGET_ARCHITECTURE STREQUAL "power8") - list(APPEND _march_flag_list "power8") - list(APPEND _march_flag_list "pwr8") - elseif(TARGET_ARCHITECTURE STREQUAL "power9") - list(APPEND _march_flag_list "power9") - list(APPEND _march_flag_list "power8") - list(APPEND _march_flag_list "pwr9") - list(APPEND _march_flag_list "pwr8") - elseif(TARGET_ARCHITECTURE STREQUAL "power10") - list(APPEND _march_flag_list "power10") - list(APPEND _march_flag_list "power9") - list(APPEND _march_flag_list "power8") - list(APPEND _march_flag_list "pwr10") - list(APPEND _march_flag_list "pwr9") - list(APPEND _march_flag_list "pwr8") - endif() +#============================================================================= +# Handling of CPU options +#============================================================================= - foreach(_flag ${_march_flag_list}) - if(CMAKE_CXX_COMPILER MATCHES "/(pgcc|pgc\\+\\+)$") - # PGI (on Linux) - AddCompilerFlag("-tp=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - else() - AddCompilerFlag("-mcpu=${_mcpu}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() - if(_good) - break() - endif(_good) - endforeach(_flag) - - if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") - AddCompilerFlag("-target powerpcle-unknown-linux-gnu" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() -endmacro(OFA_HandlePpcOptions) +include(ofa/HandleX86Options) +include(ofa/HandleArmOptions) +include(ofa/HandlePpcOptions) macro(OptimizeForArchitecture) if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "i686.*|i386.*|x86.*|amd64.*|x86_64.*|AMD64.*") - set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. 
Setting the value to \"native\" bypasses all checks and uses \"-march=native\" or the compiler equivalent flag. Other supported values are: \"none\", \"generic\", \"core\", \"merom\" (65nm Core2), \"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandybridge\", \"ivybridge\", \"haswell\", \"broadwell\", \"skylake\", \"skylake-xeon\", \"kabylake\", \"cannonlake\", \"cascadelake\", \"cooperlake\", \"icelake\", \"icelake-xeon\", \"tigerlake\", \"alderlake\", \"sapphirerapids\", \"bonnell\", \"silvermont\", \"goldmont\", \"goldmont-plus\", \"tremont\", \"knl\" (Knights Landing), \"knm\" (Knights Mill), \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\", \"piledriver\", \"steamroller\", \"excavator\", \"amd14h\", \"amd16h\", \"zen\", \"zen2\", \"zen3\"." ) + set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. Setting the value to \"native\" bypasses all checks and uses \"-march=native\" or the compiler equivalent flag. Other supported values are: \"none\", \"generic\", \"core\", \"core2\", \"merom\" (65nm Core2), \"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandybridge\", \"ivybridge\", \"haswell\", \"broadwell\", \"skylake\", \"skylake-xeon\", \"kabylake\", \"cannonlake\", \"cascadelake\", \"cooperlake\", \"icelake\", \"icelake-xeon\", \"tigerlake\", \"alderlake\", \"sapphirerapids\", \"bonnell\", \"silvermont\", \"goldmont\", \"goldmont-plus\", \"tremont\", \"knl\" (Knights Landing), \"knm\" (Knights Mill), \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\", \"piledriver\", \"steamroller\", \"excavator\", \"amd14h\", \"amd16h\", \"zen\", \"zen2\", \"zen3\"." 
) elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(arm.*|ARM.*|aarch64.*|AARCH64.*)") - set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. Setting the value to \"native\" bypasses all checks and uses \"-march=native\" or the compiler equivalent flag. Other supported values are: \"none\", \"generic\", \"a64fx\", \"apple-a6\", \"apple-a7\", \"apple-a8\", \"apple-a9\", \"apple-a10\", \"apple-a11\", \"apple-a12\", \"apple-a13\", \"apple-m1\", \"arm1020e\", \"arm1020t\", \"arm1022e\", \"arm1026ej-s\", \"arm10e\", \"arm10tdmi\", \"arm1136j-s\", \"arm1136jf-s\", \"arm1156t2-s\", \"arm1156t2f-s\", \"arm1176jz-s\", \"arm1176jzf-s\", \"arm710t\", \"arm720t\", \"arm740t\", \"arm7tdmi-s\", \"arm7tdmi\", \"arm810\", \"arm8\", \"arm920\", \"arm920t\", \"arm922t\", \"arm926ej-s\", \"arm940t\", \"arm946e-s\", \"arm966e-s\", \"arm968e-s\", \"arm9\", \"arm9e\", \"arm9tdmi\", \"brahma-b15\", \"brahma-b53\", \"carmel\", \"cortex-a12\", \"cortex-a15.cortex-a7\", \"cortex-a15\", \"cortex-a17.cortex-a7\", \"cortex-a17\", \"cortex-a32\", \"cortex-a34\", \"cortex-a35\", \"cortex-a53\", \"cortex-a55\", \"cortex-a57.cortext-a53\", \"cortex-a57\", \"cortex-a5\", \"cortex-a72.cortext-a53\", \"cortex-a72\", \"cortex-a73.cortext-a35\", \"cortex-a73.cortext-a53\", \"cortex-a73\", \"cortex-a75.cortext-a55\", \"cortex-a75\", \"cortex-a76.cortext-a55\", \"cortex-a76\", \"cortex-a76ae\", \"cortex-a77\", \"cortex-a7\", \"cortex-a8\", \"cortex-a9\", \"cortex-m0\", \"cortex-m0plus\", \"cortex-m1\", \"cortex-m23\", \"cortex-m33\", \"cortex-m35p\", \"cortex-m3\", \"cortex-m4\", \"cortex-m55\", \"cortex-m7\", \"cortex-r4\", \"cortex-r4f\", \"cortex-r52\", \"cortex-r5\", \"cortex-r7\", \"cortex-r8\", \"denver2\", \"denver\", \"exynos-m1\", \"fa526\", \"fa606te\", \"fa626\", 
\"fa626te\", \"fa726te\", \"falkor\", \"fmp626\", \"generic-armv7-a\", \"i80200\", \"i80321-400-b0\", \"i80321-400\", \"i80321-600-b0\", \"i80321-600\", \"ipx1200\", \"ipx425-266\", \"ipx425-400\", \"ipx425-533\", \"iwmmxt2\", \"iwmmxt\", \"krait\", \"kryo2\", \"kryo\", \"marvell-f\", \"marvell-pj4\", \"mpcore\", \"neoverse-e1\", \"neoverse-n1\", \"pxa210a\", \"pxa210b\", \"pxa210c\", \"pxa250a\", \"pxa250b\", \"pxa250c\", \"pxa27x\", \"pxa30x\", \"pxa31x\", \"pxa32x\", \"pxa930\", \"sa1110\", \"saphira\", \"scorpion\", \"strongarm1100\", \"strongarm110\", \"strongarm\", \"thunderx2\", \"thunderx2t99\", \"thunderx\", \"thunderxt81\", \"thunderxt83\", \"thunderxt88\", \"tsv110\", \"xgene1\", \"xscale\".") + set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. Setting the value to \"native\" bypasses all checks and uses \"-march=native\" or the compiler equivalent flag. 
Other supported values are: \"none\", \"generic\", \"a64fx\", \"apple-a6\", \"apple-a7\", \"apple-a8\", \"apple-a9\", \"apple-a10\", \"apple-a11\", \"apple-a12\", \"apple-a13\", \"apple-m1\", \"arm1020e\", \"arm1020t\", \"arm1022e\", \"arm1026ej-s\", \"arm10e\", \"arm10tdmi\", \"arm1136j-s\", \"arm1136jf-s\", \"arm1156t2-s\", \"arm1156t2f-s\", \"arm1176jz-s\", \"arm1176jzf-s\", \"arm710t\", \"arm720t\", \"arm740t\", \"arm7tdmi-s\", \"arm7tdmi\", \"arm810\", \"arm8\", \"arm920\", \"arm920t\", \"arm922t\", \"arm926ej-s\", \"arm940t\", \"arm946e-s\", \"arm966e-s\", \"arm968e-s\", \"arm9\", \"arm9e\", \"arm9tdmi\", \"brahma-b15\", \"brahma-b53\", \"carmel\", \"cortex-a7\", \"cortex-a8\", \"cortex-a9\", \"cortex-a12\", \"cortex-a15.cortex-a7\", \"cortex-a15\", \"cortex-a17.cortex-a7\", \"cortex-a17\", \"cortex-a32\", \"cortex-a34\", \"cortex-a35\", \"cortex-a53\", \"cortex-a55\", \"cortex-a57.cortext-a53\", \"cortex-a57\", \"cortex-a5\", \"cortex-a72.cortext-a53\", \"cortex-a72\", \"cortex-a73.cortext-a35\", \"cortex-a73.cortext-a53\", \"cortex-a73\", \"cortex-a75.cortext-a55\", \"cortex-a75\", \"cortex-a76.cortext-a55\", \"cortex-a76\", \"cortex-a76ae\", \"cortex-a77\", \"cortex-a78\", \"cortex-a78ae\", \"cortex-a76c\", \"cortex-a510\", \"cortex-a710\", \"cortex-m0\", \"cortex-m0plus\", \"cortex-m1\", \"cortex-m23\", \"cortex-m33\", \"cortex-m35p\", \"cortex-m3\", \"cortex-m4\", \"cortex-m55\", \"cortex-m7\", \"cortex-r4\", \"cortex-r4f\", \"cortex-r52\", \"cortex-r5\", \"cortex-r7\", \"cortex-r8\", \"cortex-x1\", \"cortex-x2\", \"denver2\", \"denver\", \"exynos-m1\", \"fa526\", \"fa606te\", \"fa626\", \"fa626te\", \"fa726te\", \"falkor\", \"fmp626\", \"generic-armv7-a\", \"i80200\", \"i80321-400-b0\", \"i80321-400\", \"i80321-600-b0\", \"i80321-600\", \"ipx1200\", \"ipx425-266\", \"ipx425-400\", \"ipx425-533\", \"iwmmxt2\", \"iwmmxt\", \"krait\", \"kryo2\", \"kryo\", \"marvell-f\", \"marvell-pj4\", \"mpcore\", \"neoverse-e1\", \"neoverse-n1\", \"neoverse-n2\", 
\"neoverse-v1\", \"pxa210a\", \"pxa210b\", \"pxa210c\", \"pxa250a\", \"pxa250b\", \"pxa250c\", \"pxa27x\", \"pxa30x\", \"pxa31x\", \"pxa32x\", \"pxa930\", \"sa1110\", \"saphira\", \"scorpion\", \"strongarm1100\", \"strongarm110\", \"strongarm\", \"thunderx2\", \"thunderx2t99\", \"thunderx\", \"thunderxt81\", \"thunderxt83\", \"thunderxt88\", \"tsv110\", \"xgene1\", \"xscale\".") elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(powerpc|ppc)64.*") set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. Other supported values are: \"none\", \"generic\", \"power8\", \"power9\", \"power10\".") else() - message(WARNING "The CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}' is not supported by OptimizeForArchitecture.cmake") + message(WARNING "The CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}' is not supported by OptimizeForArchitecture") + endif() + + if(NOT OFA_VERBOSE) + set(CMAKE_REQUIRED_QUIET true) endif() + set(_force) if(NOT _last_target_arch STREQUAL "${TARGET_ARCHITECTURE}") - message(STATUS "target changed from \"${_last_target_arch}\" to \"${TARGET_ARCHITECTURE}\"") + message(STATUS "Target architecture changed from \"${_last_target_arch}\" to \"${TARGET_ARCHITECTURE}\"") set(_force FORCE) endif() set(_last_target_arch "${TARGET_ARCHITECTURE}" CACHE STRING "" FORCE) @@ -1983,6 +127,7 @@ macro(OptimizeForArchitecture) message(STATUS "Detected Host CPU: ${TARGET_ARCHITECTURE}") endif() + message(STATUS "Checking Host CPU features. 
This can take some time ...") if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "i686.*|i386.*|x86.*|amd64.*|x86_64.*|AMD64.*") OFA_HandleX86Options() elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(arm.*|ARM.*|aarch64.*|AARCH64.*)") diff --git a/cmake/gsConfig.cmake b/cmake/gsConfig.cmake index 396c47153e..356393825f 100644 --- a/cmake/gsConfig.cmake +++ b/cmake/gsConfig.cmake @@ -50,18 +50,18 @@ if(NOT GISMO_INDEX_TYPE) set (GISMO_INDEX_TYPE "int" CACHE STRING #math(EXPR BITSZ_VOID_P "8*${CMAKE_SIZEOF_VOID_P}") #set (GISMO_INDEX_TYPE "int${BITSZ_VOID_P}_t" CACHE STRING - "Index type(int, int32_t, int64_t, long, long long)" FORCE) + "Index type(int, int8_t, int16_t, int32_t, int64_t, long, long long)" FORCE) set_property(CACHE GISMO_INDEX_TYPE PROPERTY STRINGS - "int" "int32_t" "int64_t" "long" "long long" ) + "int" "int8_t" "int16_t" "int32_t" "int64_t" "long" "long long" ) endif() -# Set a default build type if none was specified -if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING - "Type of build (None Debug Release RelWithDebInfo MinSizeRel)" FORCE) - # Set the possible values of build type for cmake-gui - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" - "RelWithDebInfo" "MinSizeRel") +if(NOT GISMO_SHORT_TYPE) + set (GISMO_SHORT_TYPE "int" CACHE STRING + #math(EXPR BITSZ_VOID_P "8*${CMAKE_SIZEOF_VOID_P}") + #set (GISMO_SHORT_TYPE "int${BITSZ_VOID_P}_t" CACHE STRING + "Short type(int, int8_t, int16_t, int32_t, int64_t, long, long long)" FORCE) + set_property(CACHE GISMO_SHORT_TYPE PROPERTY STRINGS + "int" "int8_t" "int16_t" "int32_t" "int64_t" "long" "long long" ) endif() set(${PROJECT_NAME}_ARCHIVE_OUTPUT_DIRECTORY lib) @@ -104,7 +104,7 @@ if(GISMO_BUILD_COVERAGE AND CMAKE_COMPILER_IS_GNUCXX) APPEND_COVERAGE_COMPILER_FLAGS() #set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -ftest-coverage -fprofile-arcs") #set(CMAKE_EXE_LINKER_FLAGS "-fprofile-arcs -ftest-coverage")
-endif(GISMO_BUILD_COVERAGE AND CMAKE_COMPILER_IS_GNUCXX) +endif() if("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xMSVC") @@ -140,7 +140,7 @@ endif() if(GISMO_EXTRA_DEBUG) include(gsDebugExtra) -endif(GISMO_EXTRA_DEBUG) +endif() if("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xMSVC") # Force to always compile with W4 @@ -266,12 +266,11 @@ endif() #string(TOUPPER ${CMAKE_BUILD_TYPE} TEMP) #message(STATUS "Using compilation flags: ${CMAKE_CXX_FLAGS}, ${CMAKE_CXX_FLAGS_${TEMP}}") -if("x${CMAKE_BUILD_TYPE}" STREQUAL "xRelease") - #https://github.com/VcDevel/Vc/blob/master/cmake/OptimizeForArchitecture.cmake +if("x${CMAKE_BUILD_TYPE}" STREQUAL "xRelease" AND ${CMAKE_VERSION} VERSION_GREATER "3.1.0") include( OptimizeForArchitecture ) OptimizeForArchitecture() foreach (flag ${OFA_ARCHITECTURE_FLAGS}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${flag}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${flag}") endforeach() -endif("x${CMAKE_BUILD_TYPE}" STREQUAL "xRelease") +endif() diff --git a/cmake/gsOptions.cmake b/cmake/gsOptions.cmake index 8f932ea896..5bbdc6dfab 100644 --- a/cmake/gsOptions.cmake +++ b/cmake/gsOptions.cmake @@ -19,12 +19,12 @@ if(EXISTS "${CMAKE_SOURCE_DIR}/.git") endif() endif() message (" CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}") -message (" CMAKE_C_COMPILER ${CMAKE_C_COMPILER}") message (" CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER}") message (" CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD}") message (" GISMO_COEFF_TYPE ${GISMO_COEFF_TYPE}") message (" GISMO_INDEX_TYPE ${GISMO_INDEX_TYPE}") +message (" GISMO_SHORT_TYPE ${GISMO_SHORT_TYPE}") ## ################################################################# ## Options list: Standard options @@ -178,6 +178,11 @@ if (${GISMO_WITH_UNUM}) message (" GISMO_WITH_UNUM ${GISMO_WITH_UNUM}") endif() +option(GISMO_WITH_XBRAID "With XBraid" false ) +if (${GISMO_WITH_XBRAID}) +message (" GISMO_WITH_XBRAID ${GISMO_WITH_XBRAID}") +endif() + ## ################################################################# ## Options list: Extra options ## 
################################################################# diff --git a/cmake/ofa/AddCXXCompilerFlag.cmake b/cmake/ofa/AddCXXCompilerFlag.cmake new file mode 100644 index 0000000000..e0b1537c6d --- /dev/null +++ b/cmake/ofa/AddCXXCompilerFlag.cmake @@ -0,0 +1,201 @@ +# Add a given compiler flag to flag variables. +# +# Usage: +# AddCXXCompilerFlag(<flag> +# [CODE <code>] +# [EXTRA_FLAGS <extra_flags>] +# [FLAGS <flags>] +# [HEADERS <headers>] +# [RESULT <result>] +# [TESTS <tests>]) +# +# Input argument: +# <flag> flag to be added after successful completion of all tests +# +# Optional input arguments: +# CODE variable holding the test code; this overrides the +# automatic generation of the test code +# EXTRA_FLAGS variable holding the list of extra compiler flags that +# are used without checks +# FLAGS variable holding the list of flags to which <flag> is +# added after successful completion of all tests +# HEADERS variable holding the list of header files prepended to +# the C++ test code's main function +# TESTS variable holding the list of tests to be included in +# the C++ test code's main function body +# +# Output argument: +# RESULT variable holding the result of all tests + +#============================================================================= +# This code is largely inspired by +# +# AddCompilerFlag.cmake +# Copyright 2010-2015 Matthias Kretz +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution.
+# +# * Neither the names of contributing organizations nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# and +# +# CheckCXXCompilerFlag.cmake +# Copyright 2006-2009 Kitware, Inc. +# Copyright 2006 Alexander Neundorf +# Copyright 2011-2013 Matthias Kretz +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * The names of Kitware, Inc., the Insight Consortium, or the names of +# any consortium members, or of any contributors, may not be used to +# endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# =============================================================================
+
+# AddCXXCompilerFlag(_flag [CODE ...] [EXTRA_FLAGS ...] [FLAGS ...]
+#                    [HEADERS ...] [RESULT ...] [TESTS ...])
+#
+# Checks that every header given via HEADERS can be included and that a small
+# C++ test program compiles when `_flag` (followed by the EXTRA_FLAGS value) is
+# passed to the compiler. If all checks pass, `_flag` is appended to the list
+# variable named by FLAGS; the overall outcome (TRUE/FALSE) is written to the
+# variable named by RESULT.
+#
+# This is a macro, not a function: every helper variable set below (`state`,
+# `_code`, `_resultVar`, `_cxx_code`, ...) is set in the caller's scope.
+macro(AddCXXCompilerFlag _flag)
+  # Parser state: 0 = no keyword seen yet, 1..6 = value expected for
+  # CODE / EXTRA_FLAGS / FLAGS / HEADERS / RESULT / TESTS respectively.
+  set(state 0)
+  # Clear values left over from a previous invocation in the same scope.
+  unset(_code)
+  unset(_extra_flags)
+  unset(_flags)
+  unset(_headers)
+  unset(_result)
+  unset(_tests)
+
+  # Keyword/value parser. A keyword only switches the state; every other
+  # argument is stored with set(), which replaces any value stored earlier
+  # for the same keyword.
+  # NOTE(review): because of the overwrite, only the last of several values
+  # following one keyword survives -- confirm callers pass a single value.
+  foreach(_arg ${ARGN})
+    if("x${_arg}" STREQUAL "xCODE")
+      set(state 1)
+    elseif("x${_arg}" STREQUAL "xEXTRA_FLAGS")
+      set(state 2)
+    elseif("x${_arg}" STREQUAL "xFLAGS")
+      set(state 3)
+    elseif("x${_arg}" STREQUAL "xHEADERS")
+      set(state 4)
+    elseif("x${_arg}" STREQUAL "xRESULT")
+      set(state 5)
+    elseif("x${_arg}" STREQUAL "xTESTS")
+      set(state 6)
+
+    elseif(state EQUAL 1)
+      set(_code ${_arg})
+    elseif(state EQUAL 2)
+      set(_extra_flags ${_arg})
+    elseif(state EQUAL 3)
+      set(_flags ${_arg})
+    elseif(state EQUAL 4)
+      set(_headers ${_arg})
+    elseif(state EQUAL 5)
+      set(_result ${_arg})
+    elseif(state EQUAL 6)
+      set(_tests ${_arg})
+    else()
+      # Reached only while state is still 0, i.e. a value before any keyword.
+      message(FATAL_ERROR "The argument ${_arg} is not supported by AddCXXCompilerFlag")
+    endif()
+  endforeach()
+
+  # Both stages start out as passed and are cleared on the first failure.
+  set(_check_include_file_cxx TRUE)
+  set(_check_cxx_source_compiles TRUE)
+
+  # Check availability of header file(s)
+  foreach(_header ${_headers})
+    # Result variable is HAVE_<header> with each of "-.+/:= " replaced by "_".
+    set(_resultVar "HAVE_${_header}")
+    string(REGEX REPLACE "[-.+/:= ]" "_" _resultVar "${_resultVar}")
+    # NOTE(review): `_flag` and `_extra_flags` are concatenated without a
+    # separator -- presumably callers supply any needed leading space; confirm.
+    check_include_file_cxx(${_header} ${_resultVar} "${_flag}${_extra_flags}")
+
+    if(NOT ${_resultVar})
+      set(_check_include_file_cxx FALSE)
+    endif()
+  endforeach()
+
+  # Check if compiler supports flag and can compile code
+  # The test source starts with one #include line per requested header.
+  set(_cxx_code)
+  foreach(_header ${_headers})
+    set(_cxx_code "${_cxx_code}\n#include<${_header}>")
+  endforeach()
+
+  # Body of the test source, in order of precedence: the CODE value verbatim,
+  # else a main() wrapping the TESTS value(s), else an empty main().
+  if(_code)
+    set(_cxx_code "${_cxx_code}\n${_code}")
+  elseif(_tests)
+    set(_cxx_code "${_cxx_code}\nint main() {")
+    foreach(_test ${_tests})
+      set(_cxx_code "${_cxx_code}\n${_test}")
+    endforeach()
+    set(_cxx_code "${_cxx_code}\nreturn 0; }")
+  else()
+    set(_cxx_code "${_cxx_code}\nint main() { return 0; }")
+  endif()
+
+  # Save the caller's CMAKE_REQUIRED_FLAGS, replace it for this one compile
+  # check, and restore it right after the check.
+  set(_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+  set(CMAKE_REQUIRED_FLAGS "${_flag}${_extra_flags}")
+  # Result variable is HAVE_<flag>, sanitised the same way as for headers; it
+  # depends on `_flag` only, not on the headers or test code.
+  set(_resultVar "HAVE_${_flag}")
+  string(REGEX REPLACE "[-.+/:= ]" "_" _resultVar "${_resultVar}")
+  check_cxx_source_compiles("${_cxx_code}" ${_resultVar}
+    # Some compilers do not fail with a bad flag
+    FAIL_REGEX "error: bad value (.*) for .* switch" # GNU
+    FAIL_REGEX "argument unused during compilation" # clang
+    FAIL_REGEX "warning: the flag .* has been deprecated" # clang
+    FAIL_REGEX "is valid for .* but not for C\\\\+\\\\+" # GNU
+    FAIL_REGEX "unrecognized .*option" # GNU
+    FAIL_REGEX "ignored for target" # GNU
+    FAIL_REGEX "ignoring unknown option" # MSVC
+    FAIL_REGEX "warning D9002" # MSVC
+    FAIL_REGEX "[Uu]nknown option" # HP
+    FAIL_REGEX "[Ww]arning: [Oo]ption" # SunPro
+    FAIL_REGEX "[Ww]arning: illegal use of -xarch option" # SunPro
+    FAIL_REGEX "command option .* is not recognized" # XL
+    FAIL_REGEX "WARNING: unknown flag:" # Open64
+    FAIL_REGEX "command line error" # ICC
+    FAIL_REGEX "command line warning" # ICC
+    FAIL_REGEX "#10236:" # ICC: File not found
+    FAIL_REGEX " #10159: " # ICC
+    FAIL_REGEX " #10353: " # ICC: option '-mfma' ignored, suggest using '-march=core-avx2'
+    FAIL_REGEX " #10006: " # ICC: ignoring unknown option '-mavx512fp16'
+    )
+  set(CMAKE_REQUIRED_FLAGS "${_CMAKE_REQUIRED_FLAGS}")
+
+  if(NOT ${_resultVar})
+    set(_check_cxx_source_compiles FALSE)
+  endif()
+
+  # Report the combined outcome only if the caller passed RESULT.
+  if (DEFINED _result)
+    if (${_check_include_file_cxx} AND ${_check_cxx_source_compiles})
+      set(${_result} TRUE)
+    else()
+      set(${_result} FALSE)
+    endif()
+  endif()
+
+  # Append the flag to the caller's list only if FLAGS was given and both the
+  # header checks and the compile check passed.
+  if(DEFINED _flags AND ${_check_include_file_cxx} AND ${_check_cxx_source_compiles})
+    list(APPEND ${_flags} "${_flag}")
+  endif()
+endmacro(AddCXXCompilerFlag)
diff --git a/cmake/AddCompilerFlag.cmake b/cmake/ofa/AddCompilerFlag.cmake
similarity index 96%
rename from cmake/AddCompilerFlag.cmake
rename to cmake/ofa/AddCompilerFlag.cmake
index a2cf7b5c79..5d989b16e6 100644
--- a/cmake/AddCompilerFlag.cmake
+++ b/cmake/ofa/AddCompilerFlag.cmake
@@ -34,11 +34,10 @@
 # OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #============================================================================= -get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) -include("${_currentDir}/CheckCCompilerFlag.cmake") -include("${_currentDir}/CheckCXXCompilerFlag.cmake") -include("${_currentDir}/CheckMicCCompilerFlag.cmake") -include("${_currentDir}/CheckMicCXXCompilerFlag.cmake") +include(ofa/CheckCCompilerFlag) +include(ofa/CheckCXXCompilerFlag) +include(ofa/CheckMicCCompilerFlag) +include(ofa/CheckMicCXXCompilerFlag) macro(AddCompilerFlag _flag) string(REGEX REPLACE "[-.+/:= ]" "_" _flag_esc "${_flag}") diff --git a/cmake/ofa/AutodetectArm.cmake b/cmake/ofa/AutodetectArm.cmake new file mode 100644 index 0000000000..9285d3ab1e --- /dev/null +++ b/cmake/ofa/AutodetectArm.cmake @@ -0,0 +1,390 @@ +#============================================================================= +# Autodetection of ARM / ARM64 CPUs +# +# This is a two-step process: +# +# 1. Get the CPUID from the system by reading /proc/cpuinfo (on +# Linux), the system registry (on Windows), or executing an +# OS-specific command (macOS, BSD, SunOS, ...) +# +# 2.
Determine the specific CPU from the CPUID +#============================================================================= + +macro(OFA_AutodetectArm) + set(_cpu_implementer) + set(_cpu_architecture) + set(_cpu_variant) + set(_cpu_part) + set(_cpu_revision) + + # Get CPUID from system + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + + # Linux + file(READ "/proc/cpuinfo" _cpuinfo) + string(REGEX REPLACE ".*CPU implementer[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_implementer "${_cpuinfo}") + string(REGEX REPLACE ".*CPU architecture[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_architecture "${_cpuinfo}") + string(REGEX REPLACE ".*CPU variant[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_variant "${_cpuinfo}") + string(REGEX REPLACE ".*CPU part[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_part "${_cpuinfo}") + string(REGEX REPLACE ".*CPU revision[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_revision "${_cpuinfo}") + string(REGEX REPLACE ".*Features[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") + elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + exec_program("/usr/sbin/sysctl -n hw.cputype hw.cputype hw.cpusubtype hw.cpufamily hw.cpusubfamily" + OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) + if(NOT _error) + string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) + list(GET _sysctl_output 0 _cpu_implementer) + list(GET _sysctl_output 1 _cpu_architecture) + list(GET _sysctl_output 2 _cpu_variant) + list(GET _sysctl_output 3 _cpu_part) + list(GET _sysctl_output 4 _cpu_revision) + endif() + if(_error) + message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") + endif() + + # TODO: Windows, FreeBSD, ... 
+ + else() + message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") + endif() + + # Determine CPU from CPUID + # Taken from https://github.com/karelzak/util-linux/blob/master/sys-utils/lscpu-arm.c + # and https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html + + # ARM + if(_cpu_implementer STREQUAL "0x41") + if(_cpu_part STREQUAL "0x810") + set(TARGET_ARCHITECTURE "arm810") + elseif(_cpu_part STREQUAL "0x920") + set(TARGET_ARCHITECTURE "arm920t") + elseif(_cpu_part STREQUAL "0x922") + set(TARGET_ARCHITECTURE "arm922t") + elseif(_cpu_part STREQUAL "0x926") + set(TARGET_ARCHITECTURE "arm926ej-s") + elseif(_cpu_part STREQUAL "0x940") + set(TARGET_ARCHITECTURE "arm940t") + elseif(_cpu_part STREQUAL "0x946") + set(TARGET_ARCHITECTURE "arm946e-s") + elseif(_cpu_part STREQUAL "0x966") + set(TARGET_ARCHITECTURE "arm966e-s") + elseif(_cpu_part STREQUAL "0xa20") + set(TARGET_ARCHITECTURE "arm1020e") + elseif(_cpu_part STREQUAL "0xa22") + set(TARGET_ARCHITECTURE "arm1022e") + elseif(_cpu_part STREQUAL "0xa26") + set(TARGET_ARCHITECTURE "arm1026ej-s") + elseif(_cpu_part STREQUAL "0xb02") + set(TARGET_ARCHITECTURE "mpcore") + elseif(_cpu_part STREQUAL "0xb36") + set(TARGET_ARCHITECTURE "arm1136jf-s") + elseif(_cpu_part STREQUAL "0xb56") + set(TARGET_ARCHITECTURE "arm1156t2f-s") + elseif(_cpu_part STREQUAL "0xb76") + set(TARGET_ARCHITECTURE "arm1176jzf-s") + elseif(_cpu_part STREQUAL "0xc05") + set(TARGET_ARCHITECTURE "cortex-a5") + elseif(_cpu_part STREQUAL "0xc07") + set(TARGET_ARCHITECTURE "cortex-a7") + elseif(_cpu_part STREQUAL "0xc08") + set(TARGET_ARCHITECTURE "cortex-a8") + elseif(_cpu_part STREQUAL "0xc09") + set(TARGET_ARCHITECTURE "cortex-a9") + elseif(_cpu_part STREQUAL "0xc0d") + set(TARGET_ARCHITECTURE "cortex-a12") + elseif(_cpu_part STREQUAL "0xc0f") + set(TARGET_ARCHITECTURE "cortex-a15") + elseif(_cpu_part STREQUAL "0xc0e") + set(TARGET_ARCHITECTURE "cortex-a17") + elseif(_cpu_part STREQUAL 
"0xc14") + set(TARGET_ARCHITECTURE "cortex-r4f") + elseif(_cpu_part STREQUAL "0xc15") + set(TARGET_ARCHITECTURE "cortex-r5") + elseif(_cpu_part STREQUAL "0xc17") + set(TARGET_ARCHITECTURE "cortex-r7") + elseif(_cpu_part STREQUAL "0xc18") + set(TARGET_ARCHITECTURE "cortex-r8") + elseif(_cpu_part STREQUAL "0xc20") + set(TARGET_ARCHITECTURE "cortex-m0") + elseif(_cpu_part STREQUAL "0xc21") + set(TARGET_ARCHITECTURE "cortex-m1") + elseif(_cpu_part STREQUAL "0xc23") + set(TARGET_ARCHITECTURE "cortex-m3") + elseif(_cpu_part STREQUAL "0xc24") + set(TARGET_ARCHITECTURE "cortex-m4") + elseif(_cpu_part STREQUAL "0xc27") + set(TARGET_ARCHITECTURE "cortex-m7") + elseif(_cpu_part STREQUAL "0xc60") + set(TARGET_ARCHITECTURE "cortex-m0plus") + elseif(_cpu_part STREQUAL "0xd01") + set(TARGET_ARCHITECTURE "cortex-a32") + elseif(_cpu_part STREQUAL "0xd02") + set(TARGET_ARCHITECTURE "cortex-a34") + elseif(_cpu_part STREQUAL "0xd03") + set(TARGET_ARCHITECTURE "cortex-a53") + elseif(_cpu_part STREQUAL "0xd04") + set(TARGET_ARCHITECTURE "cortex-a35") + elseif(_cpu_part STREQUAL "0xd05") + set(TARGET_ARCHITECTURE "cortex-a55") + elseif(_cpu_part STREQUAL "0xd07") + set(TARGET_ARCHITECTURE "cortex-a57") + elseif(_cpu_part STREQUAL "0xd08") + set(TARGET_ARCHITECTURE "cortex-a72") + elseif(_cpu_part STREQUAL "0xd09") + set(TARGET_ARCHITECTURE "cortex-a73") + elseif(_cpu_part STREQUAL "0xd0a") + set(TARGET_ARCHITECTURE "cortex-a75") + elseif(_cpu_part STREQUAL "0xd0b") + set(TARGET_ARCHITECTURE "cortex-a76") + elseif(_cpu_part STREQUAL "0xd0c") + set(TARGET_ARCHITECTURE "neoverse-n1") + elseif(_cpu_part STREQUAL "0xd0d") + set(TARGET_ARCHITECTURE "cortex-a77") + elseif(_cpu_part STREQUAL "0xd0e") + set(TARGET_ARCHITECTURE "cortex-a76ae") + elseif(_cpu_part STREQUAL "0xd13") + set(TARGET_ARCHITECTURE "cortex-r52") + elseif(_cpu_part STREQUAL "0xd20") + set(TARGET_ARCHITECTURE "cortex-m23") + elseif(_cpu_part STREQUAL "0xd21") + set(TARGET_ARCHITECTURE "cortex-m33") + elseif(_cpu_part STREQUAL 
"0xd40") + set(TARGET_ARCHITECTURE "neoverse-v1") + elseif(_cpu_part STREQUAL "0xd41") + set(TARGET_ARCHITECTURE "cortex-a78") + elseif(_cpu_part STREQUAL "0xd42") + set(TARGET_ARCHITECTURE "cortex-a78ae") + elseif(_cpu_part STREQUAL "0xd44") + set(TARGET_ARCHITECTURE "cortex-x1") + elseif(_cpu_part STREQUAL "0xd46") + set(TARGET_ARCHITECTURE "cortex-a510") + elseif(_cpu_part STREQUAL "0xd47") + set(TARGET_ARCHITECTURE "cortex-a710") + elseif(_cpu_part STREQUAL "0xd48") + set(TARGET_ARCHITECTURE "cortex-x2") + elseif(_cpu_part STREQUAL "0xd49") + set(TARGET_ARCHITECTURE "neoverse-n2") + elseif(_cpu_part STREQUAL "0xd4a") + set(TARGET_ARCHITECTURE "neoverse-e1") + elseif(_cpu_part STREQUAL "0xd4b") + set(TARGET_ARCHITECTURE "cortex-a78c") + endif() + + # Broadcom + elseif(_cpu_implementer STREQUAL "0x42") + if(_cpu_part STREQUAL "0x0f") + set(TARGET_ARCHITECTURE "brahma-b15") + elseif(_cpu_part STREQUAL "0x100") + set(TARGET_ARCHITECTURE "brahma-b53") + elseif(_cpu_part STREQUAL "0x516") + set(TARGET_ARCHITECTURE "thunderx2") + endif() + + # Cavium + elseif(_cpu_implementer STREQUAL "0x43") + if(_cpu_part STREQUAL "0x0a0") + set(TARGET_ARCHITECTURE "thunderx") + elseif(_cpu_part STREQUAL "0x0a1") + set(TARGET_ARCHITECTURE "thunderxt88") + elseif(_cpu_part STREQUAL "0x0a2") + set(TARGET_ARCHITECTURE "thunderxt81") + elseif(_cpu_part STREQUAL "0x0a3") + set(TARGET_ARCHITECTURE "thunderxt83") + elseif(_cpu_part STREQUAL "0x0af") + set(TARGET_ARCHITECTURE "thunderx2t99") + endif() + + # DEC + elseif(_cpu_implementer STREQUAL "0x44") + if(_cpu_part STREQUAL "0xa10") + set(TARGET_ARCHITECTURE "strongarm110") + elseif(_cpu_part STREQUAL "0xa11") + set(TARGET_ARCHITECTURE "strongarm1100") + endif() + + # FUJITSU + elseif(_cpu_implementer STREQUAL "0x46") + if(_cpu_part STREQUAL "0x001") + set(TARGET_ARCHITECTURE "a64fx") + endif() + + # HiSilicon + elseif(_cpu_implementer STREQUAL "0x48") + if(_cpu_part STREQUAL "0xd01") + set(TARGET_ARCHITECTURE "tsv110") + endif() + + # 
Infineon + elseif(_cpu_implementer STREQUAL "0x49") + + # Motorola/Freescale + elseif(_cpu_implementer STREQUAL "0x4d") + + # Nvidia + elseif(_cpu_implementer STREQUAL "0x4e") + if(_cpu_part STREQUAL "0x000") + set(TARGET_ARCHITECTURE "denver") + elseif(_cpu_part STREQUAL "0x003") + set(TARGET_ARCHITECTURE "denver2") + elseif(_cpu_part STREQUAL "0x004") + set(TARGET_ARCHITECTURE "carmel") + endif() + + # APM + elseif(_cpu_implementer STREQUAL "0x50") + if(_cpu_part STREQUAL "0x000") + set(TARGET_ARCHITECTURE "xgene1") + endif() + + # Qualcomm + elseif(_cpu_implementer STREQUAL "0x51") + if(_cpu_part STREQUAL "0x00f") + set(TARGET_ARCHITECTURE "scorpion") + elseif(_cpu_part STREQUAL "0x02d") + set(TARGET_ARCHITECTURE "scorpion") + elseif(_cpu_part STREQUAL "0x04d") + set(TARGET_ARCHITECTURE "krait") + elseif(_cpu_part STREQUAL "0x06f") + set(TARGET_ARCHITECTURE "krait") + elseif(_cpu_part STREQUAL "0x201") + set(TARGET_ARCHITECTURE "kryo") + elseif(_cpu_part STREQUAL "0x205") + set(TARGET_ARCHITECTURE "kryo") + elseif(_cpu_part STREQUAL "0x211") + set(TARGET_ARCHITECTURE "kryo") + elseif(_cpu_part STREQUAL "0x800") + set(TARGET_ARCHITECTURE "falkor") + elseif(_cpu_part STREQUAL "0x801") + set(TARGET_ARCHITECTURE "kryo2") + elseif(_cpu_part STREQUAL "0xc00") + set(TARGET_ARCHITECTURE "falkor") + elseif(_cpu_part STREQUAL "0xc01") + set(TARGET_ARCHITECTURE "saphira") + endif() + + # Samsung + elseif(_cpu_implementer STREQUAL "0x53") + if(_cpu_part STREQUAL "0x001") + set(TARGET_ARCHITECTURE "exynos-m1") + endif() + + # Marvell + elseif(_cpu_implementer STREQUAL "0x56") + if(_cpu_part STREQUAL "0x131") + set(TARGET_ARCHITECTURE "marvell-f") + elseif(_cpu_part STREQUAL "0x581") + set(TARGET_ARCHITECTURE "marvell-pj4") + elseif(_cpu_part STREQUAL "0x584") + set(TARGET_ARCHITECTURE "marvell-pj4") + endif() + + # Apple + elseif(_cpu_implementer STREQUAL "0x61") + if(_cpu_part STREQUAL "0x022") + set(TARGET_ARCHITECTURE "icestorm") + elseif(_cpu_part STREQUAL "0x023") + 
set(TARGET_ARCHITECTURE "firestorm") + endif() + + # Faraday + elseif(_cpu_implementer STREQUAL "0x66") + if(_cpu_part STREQUAL "0x526") + set(TARGET_ARCHITECTURE "fa526") + elseif(_cpu_part STREQUAL "0x626") + set(TARGET_ARCHITECTURE "fa626") + endif() + + # Intel + elseif(_cpu_implementer STREQUAL "0x69") + if(_cpu_part STREQUAL "0x200") + set(TARGET_ARCHITECTURE "i80200") + elseif(_cpu_part STREQUAL "0x210") + set(TARGET_ARCHITECTURE "pxa250a") + elseif(_cpu_part STREQUAL "0x212") + set(TARGET_ARCHITECTURE "pxa210a") + elseif(_cpu_part STREQUAL "0x242") + set(TARGET_ARCHITECTURE "i80321-400") + elseif(_cpu_part STREQUAL "0x243") + set(TARGET_ARCHITECTURE "i80321-600") + elseif(_cpu_part STREQUAL "0x290") + set(TARGET_ARCHITECTURE "pxa250b") + elseif(_cpu_part STREQUAL "0x292") + set(TARGET_ARCHITECTURE "pxa210b") + elseif(_cpu_part STREQUAL "0x2c2") + set(TARGET_ARCHITECTURE "i80321-400-b0") + elseif(_cpu_part STREQUAL "0x2c3") + set(TARGET_ARCHITECTURE "i80321-600-b0") + elseif(_cpu_part STREQUAL "0x2d0") + set(TARGET_ARCHITECTURE "pxa250c") + elseif(_cpu_part STREQUAL "0x2d2") + set(TARGET_ARCHITECTURE "pxa210c") + elseif(_cpu_part STREQUAL "0x411") + set(TARGET_ARCHITECTURE "pxa27x") + elseif(_cpu_part STREQUAL "0x41c") + set(TARGET_ARCHITECTURE "ipx425-533") + elseif(_cpu_part STREQUAL "0x41d") + set(TARGET_ARCHITECTURE "ipx425-400") + elseif(_cpu_part STREQUAL "0x41f") + set(TARGET_ARCHITECTURE "ipx425-266") + elseif(_cpu_part STREQUAL "0x682") + set(TARGET_ARCHITECTURE "pxa32x") + elseif(_cpu_part STREQUAL "0x683") + set(TARGET_ARCHITECTURE "pxa930") + elseif(_cpu_part STREQUAL "0x688") + set(TARGET_ARCHITECTURE "pxa30x") + elseif(_cpu_part STREQUAL "0x689") + set(TARGET_ARCHITECTURE "pxa31x") + elseif(_cpu_part STREQUAL "0xb11") + set(TARGET_ARCHITECTURE "sa1110") + elseif(_cpu_part STREQUAL "0xc12") + set(TARGET_ARCHITECTURE "ipx1200") + endif() + + # Phytium + elseif(_cpu_implementer STREQUAL "0x70") + if(_cpu_part STREQUAL "0x662") + 
set(TARGET_ARCHITECTURE "ftc662") + elseif(_cpu_part STREQUAL "0x663") + set(TARGET_ARCHITECTURE "ftc663") + endif() + + # Ampere + elseif(_cpu_implementer STREQUAL "0xc0") + + # Taken from /Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/System/Library/Frameworks/Kernel.framework/Versions/A/Headers/mach/machine.h + elseif(_cpu_implementer STREQUAL "16777228" OR _cpu_implementer STREQUAL "0x100000C") # Apple ARM64 + if( _cpu_part STREQUAL "0x1e2d6381" OR _cpu_part STREQUAL "506291073") # Swift (A6) + set(TARGET_ARCHITECTURE "apple-a6") + elseif(_cpu_part STREQUAL "0x37a09642" OR _cpu_part STREQUAL "933271106") # Cyclone (A7) + set(TARGET_ARCHITECTURE "apple-a7") + elseif(_cpu_part STREQUAL "0x2c91a47e" OR _cpu_part STREQUAL "747742334") # Typhoon (A8) + set(TARGET_ARCHITECTURE "apple-a8") + elseif(_cpu_part STREQUAL "0x92fb37c8" OR _cpu_part STREQUAL "2465937352") # Twister (A9) + set(TARGET_ARCHITECTURE "apple-a9") + elseif(_cpu_part STREQUAL "0x67ceee93" OR _cpu_part STREQUAL "1741614739") # Hurricane (A10) + set(TARGET_ARCHITECTURE "apple-a10") + elseif(_cpu_part STREQUAL "0xe81e7ef6" OR _cpu_part STREQUAL "3894312694") # Monsoon Mistral (A11) + set(TARGET_ARCHITECTURE "apple-a11") + elseif(_cpu_part STREQUAL "0x07d34b9f" OR _cpu_part STREQUAL "131287967") # Vortex Tempest (A12) + set(TARGET_ARCHITECTURE "apple-a12") + elseif(_cpu_part STREQUAL "0x462504d2" OR _cpu_part STREQUAL "1176831186") # Lightning Thunder (A13) + set(TARGET_ARCHITECTURE "apple-a13") + elseif(_cpu_part STREQUAL "0x1b588bb3" OR _cpu_part STREQUAL "458787763") # Firestorm Icestorm (A14 / M1 / M1 Pro / M1 Max) + set(TARGET_ARCHITECTURE "apple-m1") + elseif(_cpu_part STREQUAL "0xda33d83d" OR _cpu_part STREQUAL "3660830781") # Blizzard Avalanche (A15) + endif() + + else() + message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") + return() + endif() + + if(OFA_VERBOSE) + message(STATUS "CPU implementer: ${_cpu_implementer}") + message(STATUS
"CPU architecture: ${_cpu_architecture}") + message(STATUS "CPU variant: ${_cpu_variant}") + message(STATUS "CPU part: ${_cpu_part}") + message(STATUS "CPU revision: ${_cpu_revision}") + endif() +endmacro(OFA_AutodetectArm) diff --git a/cmake/ofa/AutodetectPpc.cmake b/cmake/ofa/AutodetectPpc.cmake new file mode 100644 index 0000000000..4e66d1e7e8 --- /dev/null +++ b/cmake/ofa/AutodetectPpc.cmake @@ -0,0 +1,55 @@ +#============================================================================= +# Autodetection of PPC / PPC64 CPUs +# +# This is a two-step process: +# +# 1. Get the CPUID from the system by reading /proc/cpuinfo (on +# Linux), the system registry (on Windows), or executing an +# OS-specific command (macOS, BSD, SunOS, ...) +# +# 2. Determine the specific CPU from the CPUID +#============================================================================= + +macro(OFA_AutodetectPpc) + set(_cpu) + + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + + # Linux + file(READ "/proc/cpuinfo" _cpuinfo) + string(REGEX REPLACE ".*cpu[ \t]*:[ \t]+([a-zA-Z0-9_+-]+).*" "\\1" _cpu "${_cpuinfo}") # '+' must be captured so that names such as POWER5+ reach their branch + if(_cpu STREQUAL "POWER3") + set(TARGET_ARCHITECTURE "power3") + elseif(_cpu STREQUAL "POWER4") + set(TARGET_ARCHITECTURE "power4") + elseif(_cpu STREQUAL "POWER5") + set(TARGET_ARCHITECTURE "power5") + elseif(_cpu STREQUAL "POWER5+") + set(TARGET_ARCHITECTURE "power5+") + elseif(_cpu STREQUAL "POWER6") + set(TARGET_ARCHITECTURE "power6") + elseif(_cpu STREQUAL "POWER6X") + set(TARGET_ARCHITECTURE "power6x") + elseif(_cpu STREQUAL "POWER7" OR _cpu STREQUAL "POWER7+") + set(TARGET_ARCHITECTURE "power7") + elseif(_cpu STREQUAL "POWER8" OR _cpu STREQUAL "POWER8E" OR _cpu STREQUAL "POWER8NVL") + set(TARGET_ARCHITECTURE "power8") + elseif(_cpu STREQUAL "POWER9" OR _cpu STREQUAL "POWER9NVL") + set(TARGET_ARCHITECTURE "power9") + elseif(_cpu STREQUAL "POWER10" OR _cpu STREQUAL "POWER10NVL") + set(TARGET_ARCHITECTURE "power10") + else() + message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") +
endif() + + # TODO: AIX, FreeBSD, ... + + else() + message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") + return() + endif() + + if(OFA_VERBOSE) + message(STATUS "CPU: ${_cpu}") + endif() +endmacro(OFA_AutodetectPpc) diff --git a/cmake/ofa/AutodetectX86.cmake b/cmake/ofa/AutodetectX86.cmake new file mode 100644 index 0000000000..814fede324 --- /dev/null +++ b/cmake/ofa/AutodetectX86.cmake @@ -0,0 +1,368 @@ +#============================================================================= +# Autodetection of X86 / X86_64 CPUs +# +# This is a two-step process: +# +# 1. Get the CPUID from the system by reading /proc/cpuinfo (on +# Linux), the system registry (on Windows), or executing an +# OS-specific command (macOS, BSD, SunOS, ...) +# +# 2. Determine the specific CPU from the CPUID +#============================================================================= + +macro(OFA_AutodetectX86) + set(_vendor_id) + set(_cpu_family) + set(_cpu_model) + set(_cpu_stepping) + + # Get CPUID from system + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + + # Linux + file(READ "/proc/cpuinfo" _cpuinfo) + string(REGEX REPLACE ".*vendor_id[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _vendor_id "${_cpuinfo}") + string(REGEX REPLACE ".*cpu family[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_family "${_cpuinfo}") + string(REGEX REPLACE ".*model[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_model "${_cpuinfo}") + string(REGEX REPLACE ".*stepping[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_stepping "${_cpuinfo}") + string(REGEX REPLACE ".*flags[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") + + elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + + # macOS + exec_program("/usr/sbin/sysctl -n machdep.cpu.vendor machdep.cpu.family machdep.cpu.model machdep.cpu.stepping machdep.cpu.features" + OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) + if(NOT _error) + string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) +
list(GET _sysctl_output 0 _vendor_id) + list(GET _sysctl_output 1 _cpu_family) + list(GET _sysctl_output 2 _cpu_model) + list(GET _sysctl_output 3 _cpu_stepping) + list(GET _sysctl_output 4 _cpu_flags) + string(TOLOWER "${_cpu_flags}" _cpu_flags) + string(REPLACE "." "_" _cpu_flags "${_cpu_flags}") + else() + # Apple Silicon (ARM64) running in Rosetta 2 mode + # + # The regular detection mechanism for macOS-x86_64 does not work + # because the emulated CPU does not provide the required + # information via the sysctl command. We therefore generate fake + # vendor, model, and stepping information based on the + # macOS-specific CPU codes. + exec_program("/usr/sbin/sysctl -n hw.cputype machdep.cpu.family hw.cpufamily machdep.cpu.features" + OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) + if(NOT _error) + string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) + list(GET _sysctl_output 0 _cpu_implementer) + list(GET _sysctl_output 1 _cpu_family) + list(GET _sysctl_output 2 _cpu_model) + list(GET _sysctl_output 3 _cpu_flags) + string(TOLOWER "${_cpu_flags}" _cpu_flags) + string(REPLACE "."
"_" _cpu_flags "${_cpu_flags}") + + # Fake vendor + if(_cpu_implementer STREQUAL "0x7" OR _cpu_implementer STREQUAL "7") + set(_vendor_id "GenuineIntel") + else() + set(_vendor_id "Unknown") + endif() + + # Fake stepping + set(_cpu_stepping "Unknown") + + # Fake model + # Taken from /Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/System/Library/Frameworks/Kernel.framework/Versions/A/Headers/mach/machine.h + if( _cpu_model STREQUAL "0x78ea4fbc" OR _cpu_model STREQUAL "2028621756") # Penryn + set(_cpu_model "23") + elseif(_cpu_model STREQUAL "0x6b5a4cd2" OR _cpu_model STREQUAL "1801080018") # Nehalem + set(_cpu_model "26") + elseif(_cpu_model STREQUAL "0x573b5eec" OR _cpu_model STREQUAL "1463508716") # Westmere + set(_cpu_model "37") + elseif(_cpu_model STREQUAL "0x5490b78c" OR _cpu_model STREQUAL "1418770316") # Sandybridge + set(_cpu_model "42") + elseif(_cpu_model STREQUAL "0x1f65e835" OR _cpu_model STREQUAL "526772277") # Ivybridge + set(_cpu_model "58") + elseif(_cpu_model STREQUAL "0x10b282dc" OR _cpu_model STREQUAL "280134364") # Haswell + set(_cpu_model "60") + elseif(_cpu_model STREQUAL "0x582ed09c" OR _cpu_model STREQUAL "1479463068") # Broadwell + set(_cpu_model "61") + elseif(_cpu_model STREQUAL "0x37fc219f" OR _cpu_model STREQUAL "939270559") # Skylake + set(_cpu_model "78") + elseif(_cpu_model STREQUAL "0x0f817246" OR _cpu_model STREQUAL "260141638") # Kabylake + set(_cpu_model "142") + elseif(_cpu_model STREQUAL "0x38435547" OR _cpu_model STREQUAL "943936839") # Icelake + set(_cpu_model "125") + elseif(_cpu_model STREQUAL "0x1cf8a03e" OR _cpu_model STREQUAL "486055998") # Cometlake + set(_cpu_model "142") + else() + set(_cpu_model "Unknown") + endif() + endif() + endif() + if(_error) + message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") + endif() + + elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") + + # Windows + get_filename_component(_vendor_id
"[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE) + get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE) + mark_as_advanced(_vendor_id _cpu_id) + string(REGEX REPLACE ".* Family ([0-9]+) .*" "\\1" _cpu_family "${_cpu_id}") + string(REGEX REPLACE ".* Model ([0-9]+) .*" "\\1" _cpu_model "${_cpu_id}") + string(REGEX REPLACE ".* Stepping ([0-9]+).*" "\\1" _cpu_stepping "${_cpu_id}") # stepping is the last field of the Identifier string, so no trailing blank is required + + else() + # Try to retrieve CPUID directly + try_run(_exit _ok + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/cmake/ofa/cpuinfo_x86.cxx + RUN_OUTPUT_VARIABLE _cpuinfo) + + if(_ok AND ${_exit} EQUAL 0) + string(REGEX REPLACE ".*vendor_id[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _vendor_id "${_cpuinfo}") + string(REGEX REPLACE ".*cpu family[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_family "${_cpuinfo}") + string(REGEX REPLACE ".*model[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_model "${_cpuinfo}") + string(REGEX REPLACE ".*stepping[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_stepping "${_cpuinfo}") + string(REGEX REPLACE ".*flags[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") + else() + message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") + endif() + endif() + + # Determine CPU from CPUID + if(_vendor_id STREQUAL "GenuineIntel") + if(_cpu_family EQUAL 6) + # taken from the Intel ORM + # http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html + # CPUID Signature Values of Recent Intel Microarchitectures + # 4E 5E | Skylake microarchitecture + # 3D 47 56 | Broadwell microarchitecture + # 3C 45 46 3F | Haswell microarchitecture + # 3A 3E | Ivy Bridge microarchitecture + # 2A 2D | Sandy Bridge microarchitecture + # 25 2C 2F | Intel microarchitecture Westmere + # 1A 1E 1F 2E | Intel microarchitecture Nehalem + # 17 1D | Enhanced Intel Core
microarchitecture + # 0F | Intel Core microarchitecture + # + # Intel SDM Vol. 3C 35-1 / December 2016: + # 57 | Xeon Phi 3200, 5200, 7200 [Knights Landing] + # 85 | Future Xeon Phi + # 8E 9E | 7th gen. Core [Kaby Lake] + # 55 | Future Xeon [Skylake w/ AVX512] + # 4E 5E | 6th gen. Core / E3 v5 [Skylake w/o AVX512] + # 56 | Xeon D-1500 [Broadwell] + # 4F | Xeon E5 v4, E7 v4, i7-69xx [Broadwell] + # 47 | 5th gen. Core / Xeon E3 v4 [Broadwell] + # 3D | M-5xxx / 5th gen. [Broadwell] + # 3F | Xeon E5 v3, E7 v3, i7-59xx [Haswell-E] + # 3C 45 46 | 4th gen. Core, Xeon E3 v3 [Haswell] + # 3E | Xeon E5 v2, E7 v2, i7-49xx [Ivy Bridge-E] + # 3A | 3rd gen. Core, Xeon E3 v2 [Ivy Bridge] + # 2D | Xeon E5, i7-39xx [Sandy Bridge] + # 2F | Xeon E7 + # 2A | Xeon E3, 2nd gen. Core [Sandy Bridge] + # 2E | Xeon 7500, 6500 series + # 25 2C | Xeon 3600, 5600 series, Core i7, i5 and i3 + # + # Values from the Intel SDE: + # 5C | Goldmont + # 5A | Silvermont + # 57 | Knights Landing + # 66 | Cannonlake + # 55 | Skylake Server + # 4E | Skylake Client + # 3C | Broadwell (likely a bug in the SDE) + # 3C | Haswell + # + # Latest updates taken from https://en.wikichip.org/wiki/intel/cpuid + + # MIC architecture + if(_cpu_model EQUAL 133) + set(TARGET_ARCHITECTURE "knm") # Knights Mill + + elseif(_cpu_model EQUAL 87) + set(TARGET_ARCHITECTURE "knl") # Knights Landing + + # Small cores + elseif(_cpu_model EQUAL 134) + set(TARGET_ARCHITECTURE "tremont") + + elseif(_cpu_model EQUAL 122) + set(TARGET_ARCHITECTURE "goldmont-plus") + + elseif(_cpu_model EQUAL 92 OR _cpu_model EQUAL 95) + set(TARGET_ARCHITECTURE "goldmont") + + elseif(_cpu_model EQUAL 90 OR _cpu_model EQUAL 93 OR _cpu_model EQUAL 74 OR _cpu_model EQUAL 76 OR _cpu_model EQUAL 77 OR _cpu_model EQUAL 55) + set(TARGET_ARCHITECTURE "silvermont") + + elseif(_cpu_model EQUAL 28 OR _cpu_model EQUAL 38 OR _cpu_model EQUAL 39 OR _cpu_model EQUAL 53 OR _cpu_model EQUAL 54) + set(TARGET_ARCHITECTURE "bonnell") + + # Big cores + elseif(_cpu_model
EQUAL 167) + set(TARGET_ARCHITECTURE "rocketlake") + + elseif(_cpu_model EQUAL 151 OR _cpu_model EQUAL 154) + set(TARGET_ARCHITECTURE "alderlake") + + elseif(_cpu_model EQUAL 143) + set(TARGET_ARCHITECTURE "sapphirerapids") + + elseif(_cpu_model EQUAL 142 OR _cpu_model EQUAL 158 OR _cpu_model EQUAL 165) + set(TARGET_ARCHITECTURE "kabylake") + + elseif(_cpu_model EQUAL 140) + set(TARGET_ARCHITECTURE "tigerlake") + + elseif(_cpu_model EQUAL 125 OR _cpu_model EQUAL 126) + set(TARGET_ARCHITECTURE "icelake") + + elseif(_cpu_model EQUAL 106 OR _cpu_model EQUAL 108) + set(TARGET_ARCHITECTURE "icelake-avx512") + + elseif(_cpu_model EQUAL 102) + set(TARGET_ARCHITECTURE "cannonlake") + + elseif(_cpu_model EQUAL 85) + if(_cpu_stepping LESS 5) + set(TARGET_ARCHITECTURE "skylake-avx512") + elseif(_cpu_stepping LESS 8) + set(TARGET_ARCHITECTURE "cascadelake") + else() + set(TARGET_ARCHITECTURE "cooperlake") + endif() + + elseif(_cpu_model EQUAL 78 OR _cpu_model EQUAL 94) + set(TARGET_ARCHITECTURE "skylake") + + elseif(_cpu_model EQUAL 61 OR _cpu_model EQUAL 71 OR _cpu_model EQUAL 79 OR _cpu_model EQUAL 86) + set(TARGET_ARCHITECTURE "broadwell") + + elseif(_cpu_model EQUAL 60 OR _cpu_model EQUAL 69 OR _cpu_model EQUAL 70 OR _cpu_model EQUAL 63) + set(TARGET_ARCHITECTURE "haswell") + + elseif(_cpu_model EQUAL 58 OR _cpu_model EQUAL 62) + set(TARGET_ARCHITECTURE "ivybridge") + + elseif(_cpu_model EQUAL 42 OR _cpu_model EQUAL 45) + set(TARGET_ARCHITECTURE "sandybridge") + + elseif(_cpu_model EQUAL 37 OR _cpu_model EQUAL 44 OR _cpu_model EQUAL 47) + set(TARGET_ARCHITECTURE "westmere") + + elseif(_cpu_model EQUAL 26 OR _cpu_model EQUAL 30 OR _cpu_model EQUAL 31 OR _cpu_model EQUAL 46) + set(TARGET_ARCHITECTURE "nehalem") + + elseif(_cpu_model EQUAL 23 OR _cpu_model EQUAL 29) + set(TARGET_ARCHITECTURE "penryn") + + elseif(_cpu_model EQUAL 15 OR _cpu_model EQUAL 22) + set(TARGET_ARCHITECTURE "merom") + + elseif(_cpu_model EQUAL 28) # NOTE: unreachable, model 28 is already matched by the "bonnell" branch above + set(TARGET_ARCHITECTURE "atom") + + elseif(_cpu_model
EQUAL 14) + set(TARGET_ARCHITECTURE "core") + + elseif(_cpu_model LESS 14) + message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. Auto-detection of optimization flags failed and will use the generic CPU settings with SSE2.") + set(TARGET_ARCHITECTURE "generic") + else() + message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. Auto-detection of optimization flags failed and will use the 65nm Core 2 CPU settings.") + set(TARGET_ARCHITECTURE "merom") + endif() + + elseif(_cpu_family EQUAL 7) # Itanium (not supported) + message(WARNING "Your CPU (Itanium: family ${_cpu_family}, model ${_cpu_model}) is not supported by OptimizeForArchitecture.cmake.") + + elseif(_cpu_family EQUAL 15) # NetBurst + list(APPEND _available_vector_units_list "sse" "sse2") + if(_cpu_model GREATER 2) # Not sure whether this must be 3 or even 4 instead + list(APPEND _available_vector_units_list "sse3") # "sse" and "sse2" were already appended above + endif() + + endif() + + elseif(_vendor_id STREQUAL "AuthenticAMD") + # taken from the list of AMD CPU microarchitectures + # https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures + # CPUID Signature Values of Recent AMD Microarchitectures + # 05 05h | K6 + # 06 06h | K7 + # 15 0Fh | K8 / Hammer + # 16 10h | K10 + # 17 11h | K8 & K10 "hybrid" + # 18 12h | K10 (Llano) / K12 (ARM based AMD cpu) + # 20 14h | Bobcat + # 21 15h | Bulldozer / Piledriver / Steamroller / Excavator + # 22 16h | Jaguar / Puma + # 23 17h | Zen / Zen+ / Zen 2 + # 24 18h | Hygon Dhyana + # 25 19h | Zen 3 + + if(_cpu_family EQUAL 25) # 19h + set(TARGET_ARCHITECTURE "zen3") + + elseif(_cpu_family EQUAL 24) # 18h + set(TARGET_ARCHITECTURE "zen") + + elseif(_cpu_family EQUAL 23) # 17h + if(_cpu_model LESS 49) + set(TARGET_ARCHITECTURE "zen") + else() + set(TARGET_ARCHITECTURE "zen2") + endif() + + elseif(_cpu_family EQUAL 22) # 16h + set(TARGET_ARCHITECTURE "amd16h") + + elseif(_cpu_family EQUAL 21) # 15h + if(_cpu_model LESS 16) +
set(TARGET_ARCHITECTURE "bulldozer") + elseif(_cpu_model LESS 32) + set(TARGET_ARCHITECTURE "piledriver") + elseif(_cpu_model LESS 64) + set(TARGET_ARCHITECTURE "steamroller") + else() + set(TARGET_ARCHITECTURE "excavator") + endif() + + elseif(_cpu_family EQUAL 20) # 14h + set(TARGET_ARCHITECTURE "amd14h") + + elseif(_cpu_family EQUAL 18) # 12h (K10 / K12) + + elseif(_cpu_family EQUAL 17) # 11h (K8 & K10 hybrid) + + elseif(_cpu_family EQUAL 16) # 10h (K10) + set(TARGET_ARCHITECTURE "barcelona") + + elseif(_cpu_family EQUAL 15) # 0Fh (K8 / Hammer) + if(_cpu_model LESS 39) + set(TARGET_ARCHITECTURE "k8") + else() + set(TARGET_ARCHITECTURE "k8-sse3") + endif() + + elseif(_cpu_family EQUAL 6) # 06h (K7) + elseif(_cpu_family EQUAL 5) # 05h (K6) + + endif() + + else() + message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") + return() + endif() + + if(OFA_VERBOSE) + message(STATUS "Vendor id: ${_vendor_id}") + message(STATUS "CPU family: ${_cpu_family}") + message(STATUS "CPU model: ${_cpu_model}") + message(STATUS "CPU stepping: ${_cpu_stepping}") + endif() +endmacro(OFA_AutodetectX86) diff --git a/cmake/CheckCCompilerFlag.cmake b/cmake/ofa/CheckCCompilerFlag.cmake similarity index 100% rename from cmake/CheckCCompilerFlag.cmake rename to cmake/ofa/CheckCCompilerFlag.cmake diff --git a/cmake/CheckCXXCompilerFlag.cmake b/cmake/ofa/CheckCXXCompilerFlag.cmake similarity index 98% rename from cmake/CheckCXXCompilerFlag.cmake rename to cmake/ofa/CheckCXXCompilerFlag.cmake index e3b0188a44..1df1559700 100644 --- a/cmake/CheckCXXCompilerFlag.cmake +++ b/cmake/ofa/CheckCXXCompilerFlag.cmake @@ -47,8 +47,8 @@ MACRO (CHECK_CXX_COMPILER_FLAG _FLAG _RESULT) if(${ARGC} GREATER 2) SET(TEST_SOURCE "${ARGV2}") else() - SET(TEST_SOURCE "int main() { return 0;}") - endif() + SET(TEST_SOURCE "int main() { return 0; }") + endif() CHECK_CXX_SOURCE_COMPILES("${TEST_SOURCE}" ${_RESULT} # Some compilers do not fail with a bad flag FAIL_REGEX
"error: bad value (.*) for .* switch" # GNU diff --git a/cmake/CheckMicCCompilerFlag.cmake b/cmake/ofa/CheckMicCCompilerFlag.cmake similarity index 100% rename from cmake/CheckMicCCompilerFlag.cmake rename to cmake/ofa/CheckMicCCompilerFlag.cmake diff --git a/cmake/CheckMicCXXCompilerFlag.cmake b/cmake/ofa/CheckMicCXXCompilerFlag.cmake similarity index 100% rename from cmake/CheckMicCXXCompilerFlag.cmake rename to cmake/ofa/CheckMicCXXCompilerFlag.cmake diff --git a/cmake/ofa/ChecksArm.txt b/cmake/ofa/ChecksArm.txt new file mode 100644 index 0000000000..cd68fa3294 --- /dev/null +++ b/cmake/ofa/ChecksArm.txt @@ -0,0 +1,175 @@ +# List of arm/arm64 checks + +# FORMAT: +# [,];;;;[] +# +# lines starting with # are comments +# lines starting with push_enable: start a block of tests enabled for the given compilers only +# lines starting with pop_enable: ends a block of tests enabled for the given compilers only +# lines starting with push_disable: start a block of tests disabled for the given compilers +# lines starting with pop_disable: ends a block of tests disabled for the given compilers + +# DESCRIPTION: +# For each line of this file, HandleArmOptions generates the code snipped +# +# #include +# #include +# ... +# int main { +# name(parameter0, parameter1, ...); +# return 0; +# } +# +# and compiles it with, e.g. +# +# gcc -m -m +# +# if the extension should be enabled and +# +# gcc -m-no -m-no +# +# if the extension should be disabled. In the above example, the +# compiler name 'gcc' and the flag prefixes '-m' and '-mno-' will be +# set properly by HandleX86Options. +# +# EXTENSION ALIAS: +# By default, it is assumed that the name of the extension, e.g., +# avx512f coinsides with the name of the compiler flag to be used to +# enable/disable it, e.g., -mno-avx512f. Some compilers like Oracle's +# SunPro have non-canonical naming conventions, +# cf. https://docs.oracle.com/cd/E77782_01/html/E77792/gqexw.html. 
+# +# In this case, the optional parameter can be used +# to specify the name of the extension as reported by the system, +# whereas the compiler-specific extension flag(s) are given in +# and [], respectively. +# +# ENABLING/DISABLING OF CHECKS: +# Checks can be explicitly disabled for particular compilers by placing +# them inside a push_disable/pop_disable block, e.g. +# +# push_disable:SunPro,IntelLLVM +# +# pop_disable:SunPro,IntelLLVM +# +# Similarly, checks can be explicitly enabled for particular compilers +# by placing them inside a push_enable/pop_enable block, e.g. +# +# push_enable:SunPro +# +# pop_enable:SunPro + +# ARM (aarch32) 32-bit + +# armv4 : no options +# armv4t : no options + +# armv5t : no options +# armv5te : no options +# armv5tej : no options + +# armv6 : fp nofp vfpv2 +# armv6j : fp nofp vfpv2 +# armv6k : fp nofp vfpv2 +# armv6z : fp nofp vfpv2 +# armv6kz : fp nofp vfpv2 +# armv6zk : fp nofp vfpv2 +# armv6t2 : fp nofp vfpv2 +# armv6-m : no options +# armv6s-m : no options +fp;arm_neon.h;vcvt_f16_f32;float32x4_t() +vfpv2;cstdlib;exit;0 + +# armv7 : fp nofp vfpv3-d16 +vfpv3-d16;cstdlib;exit;0;vfpv3_d16 + +# armv7-a : mp sec fp vfpv3 vfpv3-d16-fp16 vfpv3-fp16 vfpv4-d16 vfpv4 simd +# neon-fp16 neon-vfpv4 nosimd nofp vfpv3-d16 neon neon-vfpv3 +# armv7ve : vfpv3-d16 vfpv3 vfpv3-d16-fp16 vfpv3-fp16 fp vfpv4 neon neon-fp16 +# simd nosimd nofp vfpv4-d16 neon-vfpv3 neon-vfpv4 +mp;cstdlib;exit;0 +neon;cstdlib;exit;0 +neon-fp16;cstdlib;exit;0;neon_fp16 +neon-vfpv3;cstdlib;exit;0;neon_vfpv3 +neon-vfpv4;cstdlib;exit;0;neon_vfpv4 +sec;cstdlib;exit;0 +simd;cstdlib;exit;0 +vfpv3;cstdlib;exit;0 +vfpv3-d16-fp16;cstdlib;exit;0;vfpv3_d16_fp16 +vfpv3-fp16;cstdlib;exit;0;vfpv3_fp16 +vfpv4;cstdlib;exit;0 +vfpv4-d16;cstdlib;exit;0;vfpv4_d16 + +# armv7-r : fp.sp fp vfpv3xd-fp16 vfpv3-d16-fp16 idiv nofp noidiv vfpv3xd vfpv3-d16 +fp.sp;cstdlib;exit;0;fp_sp +fp.dp;cstdlib;exit;0;fp_dp +idiv;cstdlib;exit;0 +vfpv3dx;cstdlib;exit;0 +vfpv3dx-fp16;cstdlib;exit;0;vfpv3dx_fp16 + +#
armv7-m : no options +# armv7e-m : fp fpv5 fp.dp nofp vfpv4-sp-d16 fpv5-d16 +fpv5;cstdlib;exit;0 +fpv5_d16;cstdlib;exit;0 +vfpv4-sp-d16;cstdlib;exit;0;vfpv4_sp_d16 + +# armv8-a : crc simd crypto nocrypto nofp sb predres +crc;arm_acle.h;__crc32b;(uint32_t)0,(uint8_t)0 +crypto;arm_neon.h;vaesdq_u8;uint8x16_t(), uint8x16_t() +sb;cstdlib;exit;0 +predres;cstdlib;exit;0 + +# armv8-r : crc fp.sp simd crypto nocrypto nofp +# armv8.1-a : simd crypto nocrypto nofp sb predres +# armv8.2-a : simd fp16 fp16fml crypto nocrypto nofp dotprod sb predres i8mm bf16 +bf16,sve;arm_sve.h;svbfdot;svfloat32_t(),svbfloat16_t(),svbfloat16_t() +dotprod;arm_neon.h;svdot;svint32_t(),svint8_t(),svint8_t() +fp16;arm_neon.h;vabdq_f16;float16x8_t(),float16x8_t() +fp16fml;arm_neon.h;vfmlalq_high_f16;float32x4_t(),float16x8_t(),float16x8_t() +i8mm,sve;arm_sve.h;svmmla;svint32_t(),svint8_t(),svint8_t() +simd;arm_neon.h;vaddq_u32;uint32x4_t(),uint32x4_t() + +# armv8.3-a : simd fp16 fp16fml crypto nocrypto nofp dotprod sb predres i8mm bf16 +# armv8.4-a : simd fp16 crypto nocrypto nofp sb predres i8mm bf16 +# armv8.5-a : simd fp16 crypto nocrypto nofp i8mm bf16 +# armv8.6-a : simd fp16 crypto nocrypto nofp i8mm bf16 + +# ARM64 (aarch64) 64-bit + +# armv8.x-a : fp simd crypto crc lse fp16 rcpc rdma dotprod aes sha2 sha3 sm4 fp16fml sve profile rng memtag sb ssbs predres sve2 sve2-sm4 sve2-aes sve2-sha3 sve2-bitperm tme i8mm f32mm f64mm bf16 flagm pauth asimd crc32 +crc32;arm_acle.h;__crc32b;(uint32_t)0,(uint8_t)0 +simd;cstdlib;exit;0;asimd +aes,crypto;arm_neon.h;vaesdq_u8;uint8x16_t(), uint8x16_t() +dsp,sve;arm_sve.h;svqadd_z;svbool_t(),svint8_t(),svint8_t() +f32mm,sve;arm_sve.h;svmmla;svfloat32_t(),svfloat32_t(),svfloat32_t() +f64mm,sve;arm_sve.h;svmmla;svfloat64_t(),svfloat64_t(),svfloat64_t() +flagm;cstdlib;exit;0 +lse;cstdlib;exit;0 +memtag;cstdlib;exit;0 +mve;cstdlib;exit;0 +mve_fp;cstdlib;exit;0 +pauth;cstdlib;exit;0 +profile;cstdlib;exit;0 +ras;cstdlib;exit;0 +rcpc;cstdlib;exit;0 
+rdm;cstdlib;exit;0 +rdma;cstdlib;exit;0 +rng;cstdlib;exit;0 +sec;cstdlib;exit;0 +sha2,crypto;arm_neon.h;vsha256hq_u32;uint32x4_t(),uint32x4_t(),uint32x4_t() +sha3;arm_neon.h;vsha512hq_u64;uint64x2_t(),uint64x2_t(),uint64x2_t() +sm4;arm_neon.h;vsm4eq_u32;uint32x4_t(), uint32x4_t() +ssbs;cstdlib;exit;0 +tme;cstdlib;exit;0 +zcm;cstdlib;exit;0 +zcz;cstdlib;exit;0 + +# SVE +sve;arm_sve.h;svwhilelt_b64;0,1 + +# SVE2 +sve2;arm_sve.h;svaba;svint8_t(),svint8_t(),svint8_t() +sve2-aes;arm_sve.h;svaesd;svuint8_t(),svuint8_t() +sve2-bitperm;arm_sve.h;svbdep;svuint8_t(),svuint8_t() +sve2-sha3;arm_sve.h;svrax1;svint64_t(),svint64_t() +sve2-sm4;arm_sve.h;svsm4e;svuint32_t(),svuint32_t() diff --git a/cmake/ofa/ChecksX86.txt b/cmake/ofa/ChecksX86.txt new file mode 100644 index 0000000000..44467c4b28 --- /dev/null +++ b/cmake/ofa/ChecksX86.txt @@ -0,0 +1,217 @@ +# List of x86/x86_64 checks + +# FORMAT: +# [,];;;;[] +# +# lines starting with # are comments +# lines starting with push_enable: start a block of tests enabled for the given compilers only +# lines starting with pop_enable: ends a block of tests enabled for the given compilers only +# lines starting with push_disable: start a block of tests disabled for the given compilers +# lines starting with pop_disable: ends a block of tests disabled for the given compilers + +# DESCRIPTION: +# For each line of this file, HandleX86Options generates the code snipped +# +# #include +# #include +# ... +# int main { +# name(parameter0, parameter1, ...); +# return 0; +# } +# +# and compiles it with, e.g. +# +# gcc -m -m +# +# if the extension should be enabled and +# +# gcc -m-no -m-no +# +# if the extension should be disabled. In the above example, the +# compiler name 'gcc' and the flag prefixes '-m' and '-mno-' will be +# set properly by HandleX86Options. 
+# +# EXTENSION ALIAS: +# By default, it is assumed that the name of the extension, e.g., +# avx512f coinsides with the name of the compiler flag to be used to +# enable/disable it, e.g., -mno-avx512f. Some compilers like Oracle's +# SunPro have non-canonical naming conventions, +# cf. https://docs.oracle.com/cd/E77782_01/html/E77792/gqexw.html. +# +# In this case, the optional parameter can be used +# to specify the name of the extension as reported by the system, +# whereas the compiler-specific extension flag(s) are given in +# and [], respectively. +# +# ENABLING/DISABLING OF CHECKS: +# Checks can be explicitly disabled for particular compilers by placing +# them inside a push_disable/pop_disable block, e.g. +# +# push_disable:MSVC,SunPro +# +# pop_disable:MSVC,SunPro +# +# Similarly, checks can be explicitly enabled for particular compilers +# by placing them inside a push_disable/pop_disable block, e.g. +# +# push_enable:SunPro +# +# pop_enable:SunPro + +# MSVC and Oracle's SunPro compiler fail these checks +push_disable:MSVC,SunPro + +# MMX +mmx;mmintrin.h;_mm_add_pi16;__m64(),__m64() + +# SSE/SSE2/SSE3/SSE4.1/SSE4.2/SSE4A/AVX/AVX2/FMA +avx;immintrin.h;_mm256_add_pd;_mm256_setzero_pd(),_mm256_setzero_pd() +avx2;immintrin.h;_mm256_hadd_epi16;_mm256_setzero_si256(),_mm256_setzero_si256() +fma;immintrin.h;_mm_fmadd_pd;_mm_setzero_pd(),_mm_setzero_pd(),_mm_setzero_pd() +sse2;emmintrin.h;_mm_add_epi16;_mm_setzero_si128(),_mm_setzero_si128() +sse3;pmmintrin.h;_mm_addsub_pd;_mm_setzero_pd(),_mm_setzero_pd() +sse4.1;smmintrin.h;_mm_max_epi32;_mm_setzero_si128(),_mm_setzero_si128() +sse4.2;nmmintrin.h;_mm_cmpgt_epi64;_mm_setzero_si128(),_mm_setzero_si128() +sse4a;ammintrin.h;_mm_extract_si64;_mm_setzero_si128(),_mm_setzero_si128() +sse;xmmintrin.h;_mm_add_ps;_mm_setzero_ps(),_mm_setzero_ps() +ssse3;tmmintrin.h;_mm_hadd_epi16;_mm_setzero_si128(),_mm_setzero_si128() + +# AVX-VNNI 
+avxvnni;immintrin.h;_mm_dpbusd_avx_epi32;_mm_setzero_si128(),_mm_setzero_si128(),_mm_setzero_si128() + +# AVX-512 +avx5124fmaps;immintrin.h;_mm_4fmadd_ss;_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),new __m128[1] +avx5124vnniw;immintrin.h;_mm512_4dpwssd_epi32;_mm512_setzero_si512(),_mm512_setzero_si512(),_mm512_setzero_si512(),_mm512_setzero_si512(),_mm512_setzero_si512(),new __m128i[1] +avx512bf16,avx512vl;immintrin.h;_mm_cvtne2ps_pbh;_mm_setzero_ps(),_mm_setzero_ps() +avx512bitalg,avx512vl;immintrin.h;_mm_popcnt_epi16;_mm_setzero_si128() +avx512bw;immintrin.h;_mm512_abs_epi16;_mm512_setzero_si512() +avx512cd;immintrin.h;_mm512_broadcastmb_epi64;__mmask8() +avx512dq;immintrin.h;_mm512_and_pd;_mm512_setzero_pd(),_mm512_setzero_pd() +avx512er;immintrin.h;_mm512_exp2a23_pd;_mm512_setzero_pd() +avx512f;immintrin.h;_mm512_abs_epi32;_mm512_setzero_si512() +avx512fp16,avx512vl;immintrin.h;_mm_add_ph;_mm_setzero_ph(),_mm_setzero_ph() +avx512ifma;immintrin.h;_mm512_maskz_madd52hi_epu64;__mmask8(),_mm512_setzero_si512(),_mm512_setzero_si512(),_mm512_setzero_si512() +avx512pf;immintrin.h;_mm512_prefetch_i32scatter_pd;NULL,_mm256_setzero_si256(),(int)1,_MM_HINT_T0 +avx512vbmi2,avx512vl;immintrin.h;_mm_mask_compress_epi16;_mm_setzero_si128(),__mmask8(),_mm_setzero_si128() +avx512vbmi;immintrin.h;_mm512_permutex2var_epi8;_mm512_setzero_si512(),_mm512_setzero_si512(),_mm512_setzero_si512() +avx512vl,avx512f;immintrin.h;_mm_abs_epi64;_mm_setzero_si128() +avx512vnni,avx512vl;immintrin.h;_mm_dpbusd_epi32;_mm_setzero_si128(),_mm_setzero_si128(),_mm_setzero_si128() +avx512vp2intersect,avx512vl;immintrin.h;_mm_2intersect_epi32;_mm_setzero_si128(),_mm_setzero_si128(),new __mmask8[1],new __mmask8[1] +avx512vpopcntdq,avx512vl;immintrin.h;_mm_popcnt_epi64;_mm_setzero_si128() + +# AMX +amx-bf16;immintrin.h;_tile_dpbf16ps;0,1,2 +amx-int8;immintrin.h;_tile_dpbssd;0,1,2 +amx-tile;immintrin.h;_tile_zero;0 + +# Other 
+adx;immintrin.h;_addcarryx_u32;(unsigned char)0,(unsigned int)1,(unsigned int)1,new unsigned int[1] +aes;wmmintrin.h;_mm_aesdec_si128;_mm_setzero_si128(),_mm_setzero_si128() +bmi2;immintrin.h;_bzhi_u32;(unsigned int)1,(unsigned int)1 +enqcmd;immintrin.h;_enqcmd;(void*)NULL,(void const*)NULL +f16c;immintrin.h;_mm_cvtph_ps;_mm_setzero_si128() +fsgsbase;immintrin.h;_readfsbase_u32; +fxsr;immintrin.h;_fxrstor;(void*)NULL +gfni,avx512vl;immintrin.h;_mm_gf2p8mul_epi8;_mm_setzero_si128(),_mm_setzero_si128() +hreset;immintrin.h;_hreset;1 +invpcid;immintrin.h;_invpcid;(unsigned int)1,(void*)NULL +keylocker;immintrin.h;_mm_aesdec128kl_u8;new __m128i[1],_mm_setzero_si128(),(const void*)NULL +keylocker_wide;immintrin.h;_mm_aesdecwide128kl_u8;new __m128i[1],(const __m128i*)new __m128i[1], (const void*)NULL +lzcnt;immintrin.h;_lzcnt_u32;(unsigned int)1 +monitor;pmmintrin.h;_mm_monitor;(void const*)NULL,(unsigned)1,(unsigned)1 +movbe;immintrin.h;_loadbe_i16;(void const*)NULL +movdir64b;immintrin.h;_movdir64b;(void*)NULL,(const void*)NULL +movdiri;immintrin.h;_directstoreu_u32;(void*)NULL,(unsigned int)1 +mpx;immintrin.h;_bnd_chk_ptr_lbounds;(const void*)NULL +pclmul;wmmintrin.h;_mm_clmulepi64_si128;_mm_setzero_si128(),_mm_setzero_si128(),(const int)0;pclmul +pconfig;immintrin.h;_pconfig_u32;(const int)1,new size_t[1] +pku;cstdlib;exit;0 +popcnt;immintrin.h;_mm_popcnt_u32;(unsigned int)1 +prfchw;immintrin.h;_m_prefetchw;(void*)NULL +prefetchwt1;xmmintrin.h;_mm_prefetch;(char const*)NULL,(int)1 +ptwrite;immintrin.h;_ptwrite32;(unsigned int)0 +rdpid;immintrin.h;_rdpid_u32; +rdrnd;immintrin.h;_rdrand16_step;(unsigned short*)new unsigned short[1] +rdseed;immintrin.h;_rdseed16_step;(unsigned short*)new unsigned short[1] +rdtscp;immintrin.h;__rdtscp;(unsigned int*)NULL +rtm;immintrin.h;_xend; +serialize;immintrin.h;_serialize; +sha;immintrin.h;_mm_sha1msg1_epu32;_mm_setzero_si128(),_mm_setzero_si128() +tsc;immintrin.h;_rdtsc; +tsxldtrk;immintrin.h;_xresldtrk; +uintr;immintrin.h;_clui; 
+vaes,avx512vl;immintrin.h;_mm256_aesdec_epi128;_mm256_setzero_si256(),_mm256_setzero_si256() +vpclmulqdq,avx512vl;immintrin.h;_mm256_clmulepi64_epi128;_mm256_setzero_si256(),_mm256_setzero_si256(),(const int)1 +waitpkg;immintrin.h;_umonitor;(void*)NULL +wbnoinvd;immintrin.h;_wbnoinvd; +xsavec,xsave;immintrin.h;_xsavec;(void*)NULL,(unsigned long long)0 +xsaveopt,xsave;immintrin.h;_xsaveopt;(void*)NULL,(unsigned long long)0 +xsaves;immintrin.h;_xgetbv;(unsigned int)1 +xss,xsave;immintrin.h;_xrstors;(const void*)NULL,(unsigned long long)0 + +# GNU GCC fails the following tests ... +push_disable:GNU +abm;x86intrin.h;_bextri_u32;(unsigned int)0,(unsigned int)0 +bmi;immintrin.h;_andn_u32;(unsigned int)1,(unsigned int)1 +cldemote;immintrin.h;_mm_cldemote;(void const*)NULL +clflushopt;immintrin.h;_mm_clflushopt;(void const*)NULL +clwb;immintrin.h;_mm_clwb;(void const*)NULL +pop_disable:GNU + +# ... and needs a slightly modified implementation +push_enable:GNU +abm;x86intrin.h;__bextri_u32;(unsigned int)0,(unsigned int)0 +bmi;immintrin.h;__andn_u32;(unsigned int)1,(unsigned int)1 +cldemote;immintrin.h;_cldemote;(void*)NULL +clflushopt;immintrin.h;_mm_clflushopt;(void*)NULL +clwb;immintrin.h;_mm_clwb;(void*)NULL +pop_enable:GNU + +pop_disable:MSVC,SunPro + + +# Special checks for the MSVC compiler +push_enable:MSVC + +# SSE/SSE2/SSE3/SSE4.1/SSE4.2/SSE4A/AVX/AVX2/FMA +SSE;xmmintrin.h;_mm_add_ps;_mm_setzero_ps(),_mm_setzero_ps();sse +SSE2;emmintrin.h;_mm_add_epi16;_mm_setzero_si128(),_mm_setzero_si128();sse2 +AVX;immintrin.h;_mm256_add_pd;_mm256_setzero_pd(),_mm256_setzero_pd();avx +AVX2;immintrin.h;_mm256_hadd_epi16;_mm256_setzero_si256(),_mm256_setzero_si256();avx2 + +# AVX-512 +AVX512;immintrin.h;_mm512_abs_epi32;_mm512_setzero_si512();avx512f + +pop_enable:MSVC + + +# Special checks for Oracle's SunPro compiler +# https://docs.oracle.com/cd/E77782_01/html/E77792/gqexw.html +push_enable:SunPro + +# SSE/SSE2/SSE3/SSE4.1/SSE4.2/SSE4A/AVX/AVX2/FMA 
+avx;immintrin.h;_mm256_add_pd;_mm256_setzero_pd(),_mm256_setzero_pd()
+avx2;immintrin.h;_mm256_hadd_epi16;_mm256_setzero_si256(),_mm256_setzero_si256()
+sse2;emmintrin.h;_mm_add_epi16;_mm_setzero_si128(),_mm_setzero_si128()
+sse3;pmmintrin.h;_mm_addsub_pd;_mm_setzero_pd(),_mm_setzero_pd()
+sse4_1;smmintrin.h;_mm_max_epi32;_mm_setzero_si128(),_mm_setzero_si128();sse4.1
+sse4_2;nmmintrin.h;_mm_cmpgt_epi64;_mm_setzero_si128(),_mm_setzero_si128();sse4.2
+sse;xmmintrin.h;_mm_add_ps;_mm_setzero_ps(),_mm_setzero_ps()
+ssse3;tmmintrin.h;_mm_hadd_epi16;_mm_setzero_si128(),_mm_setzero_si128()
+
+# AVX-512
+avx512;immintrin.h;_mm512_abs_epi32;_mm512_setzero_si512();avx512f
+avx512;xmmintrin.h;_mm_prefetch;(char const*)NULL,(int)1;prefetchwt1
+
+# Other
+avx_i;emmintrin.h;_mm_cvtph_ps;_mm_setzero_si128();f16c
+aes;wmmintrin.h;_mm_aesdec_si128;_mm_setzero_si128(),_mm_setzero_si128();aes
+aes;wmmintrin.h;_mm_clmulepi64_si128;_mm_setzero_si128(),_mm_setzero_si128(),(const int)0;pclmul
+avx2;immintrin.h;_lzcnt_u32;(unsigned int)1;lzcnt
+sse4_2;immintrin.h;_mm_popcnt_u32;(unsigned int)1;popcnt
+avx_i;immintrin.h;_andn_u32;(unsigned int)1,(unsigned int)1;bmi
+avx_i;immintrin.h;_bzhi_u32;(unsigned int)1,(unsigned int)1;bmi2
+avx_i;immintrin.h;_readfsbase_u32;;fsgsbase
+avx_i;immintrin.h;_rdrand16_step;(unsigned short*)new unsigned short[1];rdrnd
+pop_enable:SunPro
diff --git a/cmake/ofa/CommonMacros.cmake b/cmake/ofa/CommonMacros.cmake
new file mode 100644
index 0000000000..72259c73e1
--- /dev/null
+++ b/cmake/ofa/CommonMacros.cmake
@@ -0,0 +1,8 @@
+macro(_ofa_find _list _value _ret)         # <_ret> := TRUE if <_value> is an element of the list variable named <_list>, else FALSE
+  list(FIND ${_list} "${_value}" _found)   # _found := index of the first match, or -1 when there is none
+  if(NOT _found EQUAL -1)
+    set(${_ret} TRUE)
+  else()
+    set(${_ret} FALSE)
+  endif()
+endmacro()                                 # being a macro, both _found and <_ret> are set in the caller's scope
diff --git a/cmake/ofa/HandleArmOptions.cmake b/cmake/ofa/HandleArmOptions.cmake
new file mode 100644
index 0000000000..e7f23e095a
--- /dev/null
+++ b/cmake/ofa/HandleArmOptions.cmake
@@ -0,0 +1,1078 @@
+#============================================================================= +# Handling of ARM / ARM64 options +# +# This is a three-step process: +# +# 1. Generate a list of available compiler flags for the specific CPU +# +# 2. Enable/disable feature flags based on available CPU features, +# used-defined USE_ variables and the capabilities of the +# host system's compiler and linker +# +# 3. Set compiler-specific flags (e.g., -m/-mno-) +#============================================================================= + +include(ofa/AddCXXCompilerFlag) +include(ofa/CommonMacros) +include(CheckIncludeFileCXX) + +macro(OFA_HandleArmOptions) + + # Special treatment for "native" flag + if(TARGET_ARCHITECTURE STREQUAL "native") + if(MSVC) + # MSVC (on Windows) + message(FATAL_ERROR "[OFA] MSVC does not support \"native\" flag.") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC" + OR CMAKE_CXX_COMPILER_ID MATCHES "PGI") + # NVidia HPC / PGI (on Linux/Windows) + AddCompilerFlag("-tp=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") + # Cray (on Linux) + message(FATAL_ERROR, "[OFA] Cray compiler does not support \"native\" flag.") + else() + # Others: GNU, Clang and variants + AddCXXCompilerFlag("-mcpu=native" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + endif() + + if(NOT _ok) + message(FATAL_ERROR "[OFA] An error occured while setting the \"native\" flag.") + endif() + + elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") + + # Step 1: Generate a list of compiler flags for the specific CPU + set(_march_flag_list) + set(_mtune_flag_list) + set(_available_extension_list) + + # ARM + if(TARGET_ARCHITECTURE STREQUAL "strongarm") + list(APPEND _mtune_flag_list "strongarm") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "arm8") + list(APPEND _mtune_flag_list "arm8") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "arm810") + list(APPEND _mtune_flag_list "arm810") + 
list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "fa526") + list(APPEND _mtune_flag_list "fa526") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "fa626") + list(APPEND _mtune_flag_list "fa626") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "arm7tdmi") + list(APPEND _mtune_flag_list "arm7tdmi") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm7tdmi-s") + list(APPEND _mtune_flag_list "arm7tdmi-s") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm710t") + list(APPEND _mtune_flag_list "arm710t") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm720t") + list(APPEND _mtune_flag_list "arm720t") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm740t") + list(APPEND _mtune_flag_list "arm740t") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm9") + list(APPEND _mtune_flag_list "arm9") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm9tdmi") + list(APPEND _mtune_flag_list "arm9tdmi") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm920") + list(APPEND _mtune_flag_list "arm920") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm920t") + list(APPEND _mtune_flag_list "arm920t") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm922t") + list(APPEND _mtune_flag_list "arm922t") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm940t") + list(APPEND _mtune_flag_list "arm940t") + list(APPEND _march_flag_list "armv4t") + + elseif(TARGET_ARCHITECTURE STREQUAL "arm1020t") + list(APPEND _mtune_flag_list "arm1020t") + list(APPEND _march_flag_list "armv5t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm10tdmi") + list(APPEND _mtune_flag_list "arm10tdmi") + list(APPEND 
_march_flag_list "armv5t") + + elseif(TARGET_ARCHITECTURE STREQUAL "arm9e") + list(APPEND _mtune_flag_list "arm9e") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm946e-s") + list(APPEND _mtune_flag_list "arm946e-s") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm966e-s") + list(APPEND _mtune_flag_list "arm966e-s") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm968e-s") + list(APPEND _mtune_flag_list "arm968e-s") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm10e") + list(APPEND _mtune_flag_list "arm10e") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1020e") + list(APPEND _mtune_flag_list "arm1020e") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1022e") + list(APPEND _mtune_flag_list "arm1022e") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "xscale") + list(APPEND _mtune_flag_list "xscale") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "iwmmxt") + list(APPEND _mtune_flag_list "iwmmxt") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "iwmmxt2") + list(APPEND _mtune_flag_list "iwmmxt2") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "fa606te") + list(APPEND _mtune_flag_list "fa606te") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "fa626te") + list(APPEND _mtune_flag_list "fa626te") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL 
"fmp626") + list(APPEND _mtune_flag_list "fmp626") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "fa726te") + list(APPEND _mtune_flag_list "fa726te") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "arm926ej-s") + list(APPEND _mtune_flag_list "arm926ej-s") + list(APPEND _march_flag_list "armv5tej") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1026ej-s") + list(APPEND _mtune_flag_list "arm1026ej-s") + list(APPEND _march_flag_list "armv5tej") + list(APPEND _available_extension_list "fp") + + elseif(TARGET_ARCHITECTURE STREQUAL "mpcore") + list(APPEND _mtune_flag_list "mpcore") + list(APPEND _march_flag_list "armv6k") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1136j-s") + list(APPEND _mtune_flag_list "arm1136j-s") + list(APPEND _march_flag_list "armv6j") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1136jf-s") + list(APPEND _mtune_flag_list "arm1136jf-s") + list(APPEND _march_flag_list "armv6j") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1156t2-s") + list(APPEND _mtune_flag_list "arm1156t2-s") + list(APPEND _march_flag_list "armv6t2") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1156t2f-s") + list(APPEND _mtune_flag_list "arm1156t2f-s") + list(APPEND _march_flag_list "armv6t2") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1176jz-s") + list(APPEND _mtune_flag_list "arm1176jz-s") + list(APPEND _march_flag_list "armv6kz") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1176jzf-s") + list(APPEND _mtune_flag_list "arm1176jzf-s") + list(APPEND _march_flag_list "armv6kz") + list(APPEND _available_extension_list "fp") + + elseif(TARGET_ARCHITECTURE STREQUAL "generic-armv7-a") + list(APPEND _mtune_flag_list "generic-armv7-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "mp" "sec" "vfpv3-d16" "vfpv3" 
"vfpv3-d16-fp16" "vfpv3-fp16" "vfpv4-d16" "vfpv4" "simd" "neon-fp16" "neon-vfpv4") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78") + list(APPEND _mtune_flag_list "cortex-a78") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a5") + list(APPEND _mtune_flag_list "cortex-a5") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "mp" "sec" "neon-fp16") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a7") + list(APPEND _mtune_flag_list "cortex-a7") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_extension_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a8") + list(APPEND _mtune_flag_list "cortex-a8") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "sec" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a9") + list(APPEND _mtune_flag_list "cortex-a9") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "mp" "sec" "neon-fp16") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a12") + list(APPEND _mtune_flag_list "cortex-a12") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_extension_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a15") + list(APPEND _mtune_flag_list "cortex-a15") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_extension_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a15.cortex-a7") + list(APPEND _mtune_flag_list "cortex-a15.cortex-a7") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_extension_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a17") + list(APPEND _mtune_flag_list "cortex-a17") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_extension_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL 
"cortex-a17.cortex-a7") + list(APPEND _mtune_flag_list "cortex-a17.cortex-a7") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_extension_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a32") + list(APPEND _mtune_flag_list "cortex-a32") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a34") + list(APPEND _mtune_flag_list "cortex-a34") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a35") + list(APPEND _mtune_flag_list "cortex-a35") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a53") + list(APPEND _mtune_flag_list "cortex-a53") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a55") + list(APPEND _mtune_flag_list "cortex-a55") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a57") + list(APPEND _mtune_flag_list "cortex-a57") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a57.cortext-a53") + list(APPEND _mtune_flag_list "cortex-a57.cortext-a53") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL 
"cortex-a72") + list(APPEND _mtune_flag_list "cortex-a72") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a72.cortext-a53") + list(APPEND _mtune_flag_list "cortex-a72.cortext-a53") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73") + list(APPEND _mtune_flag_list "cortex-a73") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortext-a35") + list(APPEND _mtune_flag_list "cortex-a73.cortext-a35") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortext-a53") + list(APPEND _mtune_flag_list "cortex-a73.cortext-a53") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a75") + list(APPEND _mtune_flag_list "cortex-a75") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a75.cortext-a55") + list(APPEND _mtune_flag_list "cortex-a75.cortext-a55") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76") + list(APPEND _mtune_flag_list "cortex-a76") + list(APPEND _march_flag_list 
"armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76.cortext-a55") + list(APPEND _mtune_flag_list "cortex-a76.cortext-a55") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76ae") + list(APPEND _mtune_flag_list "cortex-a76ae") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a77") + list(APPEND _mtune_flag_list "cortex-a77") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78") + list(APPEND _mtune_flag_list "cortex-a78") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78ae") + list(APPEND _mtune_flag_list "cortex-a78ae") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78c") + list(APPEND _mtune_flag_list "cortex-a78c") + list(APPEND _march_flag_list 
"armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a710") + list(APPEND _mtune_flag_list "cortex-a510") + list(APPEND _march_flag_list "armv9-a") + list(APPEND _march_flag_list "armv8.6-a") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "bf16" "fp16" "i8mm") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a710") + list(APPEND _mtune_flag_list "cortex-a710") + list(APPEND _march_flag_list "armv9-a") + list(APPEND _march_flag_list "armv8.6-a") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "bf16" "fp16" "i8mm") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m0") + list(APPEND _mtune_flag_list "cortex-m0") + list(APPEND _march_flag_list "armv6s-m") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m0plus") + list(APPEND _mtune_flag_list "cortex-m0plus") + list(APPEND _march_flag_list "armv6s-m") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m1") + list(APPEND _mtune_flag_list "cortex-m1") + list(APPEND _march_flag_list "armv6s-m") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m3") + list(APPEND _mtune_flag_list "cortex-m3") + list(APPEND _march_flag_list "armv7-m") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m4") + list(APPEND _mtune_flag_list "cortex-m4") + list(APPEND _march_flag_list "armv7e-m") 
+ list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m7") + list(APPEND _mtune_flag_list "cortex-m7") + list(APPEND _march_flag_list "armv7e-m") + list(APPEND _available_extension_list "fp.dp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m23") + list(APPEND _mtune_flag_list "cortex-m23") + list(APPEND _march_flag_list "armv8-m.base") + list(APPEND _march_flag_list "armv7-m") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m33") + list(APPEND _mtune_flag_list "cortex-m33") + list(APPEND _march_flag_list "armv8-m.main") + list(APPEND _march_flag_list "armv7-m") + list(APPEND _available_extension_list "dsp" "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m35p") + list(APPEND _mtune_flag_list "cortex-m35p") + list(APPEND _march_flag_list "armv8-m.main") + list(APPEND _march_flag_list "armv7-m") + list(APPEND _available_extension_list "dsp" "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m55") + list(APPEND _mtune_flag_list "cortex-m55") + list(APPEND _march_flag_list "armv8.1-m.main") + list(APPEND _march_flag_list "armv8-m") + list(APPEND _march_flag_list "armv7-m") + list(APPEND _available_extension_list "mve.fp" "fp.dp") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4") + list(APPEND _mtune_flag_list "cortex-r4") + list(APPEND _march_flag_list "armv7-r") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4f") + list(APPEND _mtune_flag_list "cortex-r4f") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r5") + list(APPEND _mtune_flag_list "cortex-r5") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_extension_list "idiv" "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r7") + list(APPEND _mtune_flag_list "cortex-r7") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_extension_list "idiv" "vfpv3-d16-fp16") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r8") + list(APPEND _mtune_flag_list "cortex-r8") + 
list(APPEND _march_flag_list "armv7-r") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r52") + list(APPEND _mtune_flag_list "cortex-r52") + list(APPEND _march_flag_list "armv8-r") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_extension_list "crc" "simd" "idiv" "vfpv3-d16-fp16") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-x1") + list(APPEND _mtune_flag_list "cortex-x1") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-x2") + list(APPEND _march_flag_list "armv9-a") + list(APPEND _march_flag_list "armv8.6-a") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "bf16" "fp16" "i8mm") + + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-e1") + list(APPEND _mtune_flag_list "neoverse-e1") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n1") + list(APPEND _mtune_flag_list "neoverse-n1") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n2") + list(APPEND _mtune_flag_list "neoverse-n2") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND 
_march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "bf16" "fp16" "i8mm") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-v1") + list(APPEND _mtune_flag_list "neoverse-v1") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "bf16" "fp16" "i8mm") + + # Broadcom + elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b15") + list(APPEND _mtune_flag_list "brahma-b15") + elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b53") + list(APPEND _mtune_flag_list "brahma-b53") + elseif(TARGET_ARCHITECTURE STREQUAL "thunderx2") + list(APPEND _mtune_flag_list "thunderx2") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crypto") + + # Cavium + elseif(TARGET_ARCHITECTURE STREQUAL "thunderx") + list(APPEND _mtune_flag_list "thunderx") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto") + elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt88") + list(APPEND _mtune_flag_list "thunderxt88") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto") + elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt81") + list(APPEND _mtune_flag_list "thunderxt81") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto") + elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt83") + list(APPEND _mtune_flag_list "thunderxt83") + 
list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto") + elseif(TARGET_ARCHITECTURE STREQUAL "thunderx2t99") + list(APPEND _mtune_flag_list "thunderx2t99") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto") + + # DEC + elseif(TARGET_ARCHITECTURE STREQUAL "strongarm110") + list(APPEND _mtune_flag_list "strongarm110") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "strongarm1100") + list(APPEND _mtune_flag_list "strongarm1100") + list(APPEND _march_flag_list "armv4") + + # FUJITSU + elseif(TARGET_ARCHITECTURE STREQUAL "a64fx") + list(APPEND _mtune_flag_list "a64fx") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "sve") + + # HiSilicon + elseif(TARGET_ARCHITECTURE STREQUAL "tsv110") + list(APPEND _mtune_flag_list "tsv110") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "aes" "crypto" "fp16" "sha2") + + # Nvidia + elseif(TARGET_ARCHITECTURE STREQUAL "denver") + list(APPEND _mtune_flag_list "denver") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd" "vfpv3" "vfpv4") + elseif(TARGET_ARCHITECTURE STREQUAL "denver2") + list(APPEND _mtune_flag_list "denver2") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd" "vfpv3" "vfpv4") + elseif(TARGET_ARCHITECTURE STREQUAL "carmel") + list(APPEND _mtune_flag_list "denver") 
+ list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd" "vfpv3" "vfpv4") + + # APM + elseif(TARGET_ARCHITECTURE STREQUAL "xgene1") + list(APPEND _mtune_flag_list "xgene1") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + + # Qualcomm + elseif(TARGET_ARCHITECTURE STREQUAL "scorpion") + list(APPEND _mtune_flag_list "scorpion") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "krait") + list(APPEND _mtune_flag_list "krait") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "kryo") + list(APPEND _mtune_flag_list "kryo") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "kryo2") + list(APPEND _mtune_flag_list "kryo2") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "falkor") + list(APPEND _mtune_flag_list "falkor") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "saphira") + list(APPEND _mtune_flag_list "saphira") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd" "vfpv3" "vfpv4") + + # Samsung + elseif(TARGET_ARCHITECTURE STREQUAL "exynos-m1") + list(APPEND _mtune_flag_list "exynos-m1") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND 
_available_extension_list "crypto" "simd") + + # Marvell + elseif(TARGET_ARCHITECTURE STREQUAL "marvell-f") + list(APPEND _mtune_flag_list "marvell-f") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "marvell-pj4") + list(APPEND _mtune_flag_list "marvell-pj4") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "mp" "sec" "fp") + + # Intel + elseif(TARGET_ARCHITECTURE STREQUAL "i80200") + list(APPEND _mtune_flag_list "i80200") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa250a") + list(APPEND _mtune_flag_list "pxa250a") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa210a") + list(APPEND _mtune_flag_list "pxa210a") + elseif(TARGET_ARCHITECTURE STREQUAL "i80321-400") + list(APPEND _mtune_flag_list "i80321-400") + elseif(TARGET_ARCHITECTURE STREQUAL "i80321-600") + list(APPEND _mtune_flag_list "i80321-600") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa250b") + list(APPEND _mtune_flag_list "pxa250b") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa210b") + list(APPEND _mtune_flag_list "pxa210b") + elseif(TARGET_ARCHITECTURE STREQUAL "i80321-400-b0") + list(APPEND _mtune_flag_list "i80321-400-b0") + elseif(TARGET_ARCHITECTURE STREQUAL "i80321-600-b0") + list(APPEND _mtune_flag_list "i80321-600-b0") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa250c") + list(APPEND _mtune_flag_list "pxa250c") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa210c") + list(APPEND _mtune_flag_list "pxa210c") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa27x") + list(APPEND _mtune_flag_list "pxa27x") + elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-533") + list(APPEND _mtune_flag_list "ipx425-533") + elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-400") + list(APPEND _mtune_flag_list "ipx425-400") + elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-266") + list(APPEND _mtune_flag_list "ipx425-266") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa32x") + list(APPEND _mtune_flag_list "pxa32x") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa930") + list(APPEND _mtune_flag_list "pxa930") + 
elseif(TARGET_ARCHITECTURE STREQUAL "pxa30x") + list(APPEND _mtune_flag_list "pxa30x") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa31x") + list(APPEND _mtune_flag_list "pxa31x") + elseif(TARGET_ARCHITECTURE STREQUAL "sa1110") + list(APPEND _mtune_flag_list "sa1110") + elseif(TARGET_ARCHITECTURE STREQUAL "ipx1200") + list(APPEND _mtune_flag_list "ipx1200") + + # Apple + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a6") + list(APPEND _mtune_flag_list "apple-a6") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a7") + list(APPEND _mtune_flag_list "apple-a7") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crypto" "fp" "simd" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a8") + list(APPEND _mtune_flag_list "apple-a8") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crypto" "fp" "simd" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a9") + list(APPEND _mtune_flag_list "apple-a9") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crypto" "fp" "simd" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a10") + list(APPEND _mtune_flag_list "apple-a10") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "simd" "rdm" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a11") + list(APPEND _mtune_flag_list "apple-a11") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "lse" "simd" "ras" "rdm" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a12") + list(APPEND _mtune_flag_list "apple-a12") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list 
"armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "fp16" "lse" "simd" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a13") + list(APPEND _mtune_flag_list "apple-a13") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crc" "crypto" "dotprod" "fp" "fp16" "fp16fml" "lse" "simd" "ras" "rcpc" "rdm" "sha2" "sha3" "sm4" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-m1") + list(APPEND _mtune_flag_list "vortex") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "fp16" "lse" "simd" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") + + # Others + elseif(TARGET_ARCHITECTURE STREQUAL "generic") + list(APPEND _march_flag_list "generic") + elseif(TARGET_ARCHITECTURE STREQUAL "none") + # add this clause to remove it from the else clause + + else() + message(FATAL_ERROR "[OFA] Unknown target architecture: \"${TARGET_ARCHITECTURE}\". 
Please set TARGET_ARCHITECTURE to a supported value.") + endif() + + # Clean list of available extensions + list(SORT _available_extension_list) + list(REMOVE_DUPLICATES _available_extension_list) + + if(OFA_VERBOSE) + if(_march_flag_list) + string(REPLACE ";" ", " _str "${_march_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] CPU architectures (-march): " ${_str}) + endif() + if(_mtune_flag_list) + string(REPLACE ";" ", " _str "${_mtune_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] CPU microarchitectures (-mtune): " ${_str}) + endif() + if(_available_extension_list) + list(LENGTH _available_extension_list _len) + string(REPLACE ";" ", " _str "${_available_extension_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (${_len} available): ${_str}") + endif() + endif() + + # Following the recommendation from + # https://community.arm.com/developer/tools-software/ + # tools/b/tools-software-ides-blog/posts/ + # compiler-flags-across-architectures-march-mtune-and-mcpu we + # first try to use the -mcpu flag and set it a value from the + # list of -mtune flags. If that fails, e.g., if the compiler + # does not yet support the specified target, we try to set the + # -march and -mtune flags as fallback option. 
+ + # Set compiler-specific option names + set(_mcpu_flag "-mcpu=") + set(_march_flag "-march=") + set(_mtune_flag "-mtune") + + # foreach(_flag ${_mtune_flag_list}) + # AddCXXCompilerFlag("${_mcpu_flag}${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + # if(_ok) + # break() + # endif() + # endforeach() + + if(NOT _ok) + # Fallback: set -march and -mtune flags + set(_check_extension_list) + set(_check_extension_flag_list) + set(_disable_extension_flag_list) + set(_enable_extension_flag_list) + set(_ignore_extension_flag_list) + + foreach(_flag ${_march_flag_list}) + AddCXXCompilerFlag("${_march_flag}${_flag}" RESULT _ok) + if(_ok) + set(_march ${_flag}) + break() + endif() + endforeach() + + # Step 2: Enable/disable feature flags based on available CPU + # features, used-defined USE_ variables and + # the capabilities of the host system's compiler and linker + file(READ ${CMAKE_SOURCE_DIR}/cmake/ofa/ChecksArm.txt _checks) + string(REGEX REPLACE "[:;]" "|" _checks "${_checks}") + string(REPLACE "\n" ";" _checks "${_checks}") + + set(_skip_check FALSE) + + # Iterate over the list of checks line by line + foreach (_check ${_checks}) + string(REPLACE "|" ";" _check "${_check}") + + # Parse for special lines + if ("${_check}" MATCHES "^#" ) # Skip comment + continue() + + elseif ("${_check}" MATCHES "^push_enable" ) # Start enable block + list(GET _check 1 _push_enable_list) + string(REPLACE "," ";" _push_enable_list "${_push_enable_list}") + _ofa_find(_push_enable_list "${CMAKE_CXX_COMPILER_ID}" _found) + if(_found) + list(INSERT _skip_check 0 FALSE) + else() + list(INSERT _skip_check 0 TRUE) + endif() + continue() + + elseif ("${_check}" MATCHES "^pop_enable" ) # End enable block + list(REMOVE_AT _skip_check 0) + continue() + + elseif ("${_check}" MATCHES "^push_disable" ) # Start disable block + list(GET _check 1 _push_disable_list) + string(REPLACE "," ";" _push_disable_list "${_push_disable_list}") + _ofa_find(_push_disable_list "${CMAKE_CXX_COMPILER_ID}" _found) 
+ if(_found) + list(INSERT _skip_check 0 TRUE) + else() + # Compiler was not found in the list, so we keep its previous status + list(GET _skip_check 0 _skip) + list(INSERT _skip_check 0 ${_skip}) + endif() + continue() + + elseif ("${_check}" MATCHES "^pop_disable" ) # End disable block + list(REMOVE_AT _skip_check 0) + continue() + endif() + + # Skip test? + list(GET _skip_check 0 _skip) + if(_skip) + continue() + endif() + + # Extract extra CPU extensions, header files, function name, and parameters + list(GET _check 0 _check_extension_flags) + list(GET _check 1 _check_headers) + list(GET _check 2 _check_function) + list(GET _check 3 _check_params) + + # Convert list of extensions into compiler flags + string(REPLACE "," ";" _check_extension_flags "${_check_extension_flags}") + list(GET _check_extension_flags 0 _extension_flag) + list(APPEND _check_extension_flag_list "${_extension_flag}") + string(REPLACE ";" "+" _check_extra_flags "+${_check_extension_flags}") + + # Extract optional extension alias + list(LENGTH _check _len) + if(${_len} EQUAL 5) + list(GET _check 4 _extension) + else() + set(_extension "${_extension_flag}") + endif() + + list(APPEND _check_extension_list "${_extension}") + + # Define USE_<_extension_flag> variable + set(_useVar "USE_${_extension_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "[-.+/:= ]" "_" _useVar "${_useVar}") + + # If not specified externally, set the value of the + # USE_<_extension_flag> variable to TRUE if it is found in the list + # of available extensions and FALSE otherwise + if(NOT DEFINED ${_useVar}) + _ofa_find(_available_extension_list "${_extension}" _found) + set(${_useVar} ${_found}) + endif() + + if(${_useVar}) + # Check if the compiler supports the -march=<_march>+<_extension_flag> + # flag and can compile the provided test code with it + set(_code "\nint main() { ${_check_function}(${_check_params})\; return 0\; }") + AddCXXCompilerFlag("${_march_flag}${_march}+${_extension_flag}" + 
EXTRA_FLAGS ${_check_extra_flags} + HEADERS ${_check_headers} + CODE "${_code}" + RESULT _ok) + if(NOT ${_ok}) + # Test failed + set(${_useVar} FALSE CACHE BOOL "Use ${_extension} extension.") + else() + # Test succeeded + set(${_useVar} TRUE CACHE BOOL "Use ${_extension} extension.") + endif() + else() + # Disable extension without running tests + set(${_useVar} FALSE CACHE BOOL "Use ${_extension} extension.") + endif() + mark_as_advanced(${_useVar}) + endforeach() + + # Generate lists of enabled/disabled flags + list(REMOVE_DUPLICATES _check_extension_flag_list) + foreach(_extension_flag ${_check_extension_flag_list}) + _ofa_find(_available_extension_list "${_extension_flag}" _found) + set(_useVar "USE_${_extension_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "[-.+/:= ]" "_" _useVar "${_useVar}") + + if(${_useVar}) + # Add <_extension_flag> to list of enabled extensions (if supported) + set(_haveVar "HAVE_${_march_flag}${_march}+${_extension_flag}") + string(REGEX REPLACE "[-.+/:= ]" "_" _haveVar "${_haveVar}") + if(NOT ${_haveVar}) + if(OFA_VERBOSE) + message(STATUS "[OFA] Ignoring flag ${_march_flag}${_march}+${_extension_flag} because checks failed") + endif() + list(APPEND _ignore_extension_flag_list "${_extension_flag}") + continue() + endif() + list(APPEND _enable_extension_flag_list "${_extension_flag}") + else() + # Add <_extension_flag> to list of disabled extensions (if supported) + AddCXXCompilerFlag("${_march_flag}${_march}+no${_extension_flag}") + set(_haveVar "HAVE_${_march_flag}${_march}+no${_extension_flag}") + string(REGEX REPLACE "[-.+/:= ]" "_" _haveVar "${_haveVar}") + if(NOT ${_haveVar}) + if(OFA_VERBOSE) + message(STATUS "[OFA] Ignoring flag ${_march_flag}${_march}+no${_extension_flag} because checks failed") + endif() + list(APPEND _ignore_extension_flag_list "${_extension_flag}") + continue() + endif() + list(APPEND _disable_extension_flag_list "${_extension_flag}") + endif() + endforeach() + + if(OFA_VERBOSE) + # Print 
checked extension flags + if(_check_extension_flag_list) + list(LENGTH _check_extension_flag_list _len) + list(SORT _check_extension_flag_list) + string(REPLACE ";" ", " _str "${_check_extension_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (${_len} checked): ${_str}") + endif() + # Print enabled extension flags + if(_enable_extension_flag_list) + list(LENGTH _enable_extension_flag_list _len) + list(SORT _enable_extension_flag_list) + string(REPLACE ";" ", " _str "${_enable_extension_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (${_len} enabled): ${_str}") + endif() + # Print disabled extension flags + if(_disable_extension_flag_list) + list(LENGTH _disable_extension_flag_list _len) + list(SORT _disable_extension_flag_list) + string(REPLACE ";" ", " _str "${_disable_extension_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (${_len} disabled): ${_str}") + endif() + # Print ignored extension flags + if(_ignore_extension_flag_list) + list(LENGTH _ignore_extension_flag_list _len) + list(SORT _ignore_extension_flag_list) + string(REPLACE ";" ", " _str "${_ignore_extension_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (${_len} ignored): ${_str}") + endif() + # Print unhandled extension flags + set(_unhandled_extension_list) + foreach(_extension ${_available_extension_list}) + _ofa_find(_check_extension_list "${_extension}" _found) + if(NOT _found) + list(APPEND _unhandled_extension_list ${_extension}) + endif() + endforeach() + if(_unhandled_extension_list) + list(LENGTH _unhandled_extension_list _len) + list(SORT _unhandled_extension_list) + string(REPLACE ";" ", " _str "${_unhandled_extension_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (${_len} unhandled): ${_str}") + endif() + endif() + + # Step 3: Set compiler-specific flags (e.g., -m/-mno-) + if(MSVC AND MSVC_VERSION GREATER 1900) + 
_ofa_find(_enable_extension_flag_list "vfpv4" _found) + if(_found) + AddCompilerFlag("/arch:VFPv4" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() + if(NOT _found) + _ofa_find(_enable_extension_flag_list "simd" _found) + if(_found) + AddCompilerFlag("/arch:ARMv7VE" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() + endif() + foreach(_flag ${_enable_extension_flag_list}) + string(TOUPPER "${_flag}" _flag) + string(REPLACE "[-.+/:= ]" "_" _flag "__${_flag}__") + add_definitions("-D${_flag}") + endforeach(_flag) + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") + + # TODO: Add Cray flags + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Fujitsu") + + # TODO: Add Fujitsu flags + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC") + + # TODO: Add NVHPC flags + + else() + # Others: GNU, Clang and variants + foreach(_march ${_march_flag_list}) + AddCXXCompilerFlag("-march=${_march}" RESULT _ok) + if(_ok) + set(_march_plus_extensions "${_march}") + foreach(_flag ${_enable_extension_flag_list}) + AddCXXCompilerFlag("-march=${_march_plus_extensions}+${_flag}" RESULT _ok) + if(_ok) + set(_march_plus_extensions "${_march_plus_extensions}+${_flag}") + endif(_ok) + endforeach() + foreach(_flag ${_disable_extension_flag_list}) + AddCXXCompilerFlag("-march=${_march_plus_extensions}+no${_flag}" RESULT _ok) + if(_ok) + set(_march_plus_extensions "${_march_plus_extensions}+no${_flag}") + endif(_ok) + endforeach() + AddCXXCompilerFlag("-march=${_march_plus_extensions}" FLAGS OFA_ARCHITECTURE_FLAGS) + break() + endif() + endforeach() + + # Set -mtune flag + foreach(_mtune ${_mtune_flag_list}) + AddCXXCompilerFlag("-mtune=${_mtune}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + if(_ok) + break() + endif() + endforeach() + endif() + + endif() + endif() + + # Compile code with profiling instrumentation + if(TARGET_PROFILER STREQUAL "gprof") + AddCompilerFlag("-pg" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() +endmacro(OFA_HandleArmOptions) diff --git 
a/cmake/ofa/HandlePpcOptions.cmake b/cmake/ofa/HandlePpcOptions.cmake new file mode 100644 index 0000000000..19b46f3b38 --- /dev/null +++ b/cmake/ofa/HandlePpcOptions.cmake @@ -0,0 +1,140 @@ +#============================================================================= +# Handling of PPC / PPC64 options +# +# This is a four-step process: +# +# 1. Generate a list of compiler flags for the specific CPU +# +# 2. Special compiler-specific treatment of "native" flag +# +# 3. Disabling of "broken" features based on OFA_xxx_INTRINSICS_BROKEN options +# +# 4. Set compiler-specific flags +#============================================================================= + +include(ofa/AddCompilerFlag) +include(ofa/CommonMacros) +include(CheckIncludeFileCXX) + +macro(OFA_HandlePpcOptions) + set(_march_flag_list) + set(_available_vector_units_list) + + # Define macros for PowerPC64: each macro appends its own CPU name(s) to _march_flag_list and then invokes its predecessor, so the list runs from the requested CPU down to power3 + macro(_power3) + list(APPEND _march_flag_list "power3") + endmacro() + macro(_power4) + list(APPEND _march_flag_list "power4") + _power3() + endmacro() + macro(_power5) + list(APPEND _march_flag_list "power5") + _power4() + endmacro() + macro(_power5plus) + list(APPEND _march_flag_list "power5+") + _power5() + endmacro() + macro(_power6) + list(APPEND _march_flag_list "power6") + _power5() + endmacro() + macro(_power6x) + list(APPEND _march_flag_list "power6x") + _power6() + endmacro() + macro(_power7) + list(APPEND _march_flag_list "power7") + _power6() + endmacro() + macro(_power8) + list(APPEND _march_flag_list "power8") + list(APPEND _march_flag_list "pwr8") + _power7() + endmacro() + macro(_power9) + list(APPEND _march_flag_list "power9") + list(APPEND _march_flag_list "pwr9") + _power8() + endmacro() + macro(_power10) + list(APPEND _march_flag_list "power10") + list(APPEND _march_flag_list "pwr10") + _power9() + endmacro() + + # PowerPC64 + if(TARGET_ARCHITECTURE STREQUAL "power3") + _power3() + elseif(TARGET_ARCHITECTURE STREQUAL "power4") + _power4() + elseif(TARGET_ARCHITECTURE STREQUAL 
"power5") + _power5() + elseif(TARGET_ARCHITECTURE STREQUAL "power5+") + _power5plus() + elseif(TARGET_ARCHITECTURE STREQUAL "power6") + _power6() + elseif(TARGET_ARCHITECTURE STREQUAL "power6x") + _power6x() + elseif(TARGET_ARCHITECTURE STREQUAL "power7") + _power7() + elseif(TARGET_ARCHITECTURE STREQUAL "power8") + _power8() + elseif(TARGET_ARCHITECTURE STREQUAL "power9") + _power9() + elseif(TARGET_ARCHITECTURE STREQUAL "power10") + _power10() + + # Others + elseif(TARGET_ARCHITECTURE STREQUAL "generic") + list(APPEND _march_flag_list "generic") + elseif(TARGET_ARCHITECTURE STREQUAL "native") + list(APPEND _march_flag_list "native") + elseif(TARGET_ARCHITECTURE STREQUAL "none") + # add this clause to remove it from the else clause + + else() + message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.") + endif() + + # Special treatment for "native" + if(TARGET_ARCHITECTURE STREQUAL "native") + + # Apply architecture flags + elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") + + # Disable "broken" features based on OFA_xxx_INTRINSICS_BROKEN options + set(_disable_vector_unit_list) + set(_enable_vector_unit_list) + + # Enable/disable macro: creates the cache option USE_${_name} (help text taken from _documentation) and appends _flag to _enable_vector_unit_list or _disable_vector_unit_list depending on its value; _found is false when the if(_broken) test succeeds, otherwise it is the result of _ofa_find on _available_vector_units_list + macro(_enable_or_disable _name _flag _documentation _broken) + if(_broken) # NOTE(review): inside a macro this tests a variable literally named _broken, not the macro argument -- confirm intent + set(_found false) + else() + _ofa_find(_available_vector_units_list "${_flag}" _found) + endif() + set(USE_${_name} ${_found} CACHE BOOL "${_documentation}" ${_force}) + mark_as_advanced(USE_${_name}) + if(USE_${_name}) + list(APPEND _enable_vector_unit_list "${_flag}") + else() + list(APPEND _disable_vector_unit_list "${_flag}") + endif() + endmacro() + + # Enable/disable features + + # Add compiler flags + if(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC") + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "XL") + + else() + # Others: GNU, Clang and variants + + + endif() + endif() +endmacro(OFA_HandlePpcOptions) diff --git a/cmake/ofa/HandleX86Options.cmake b/cmake/ofa/HandleX86Options.cmake new 
file mode 100644 index 0000000000..d0c875e249 --- /dev/null +++ b/cmake/ofa/HandleX86Options.cmake @@ -0,0 +1,790 @@ +#============================================================================= +# Handling of X86 / X86_64 options +# +# This is a three-step process: +# +# 1. Generate a list of available compiler flags for the specific CPU +# +# 2. Enable/disable feature flags based on available CPU features, +# user-defined USE_ variables and the capabilities of the +# host system's compiler and linker +# +# 3. Set compiler-specific flags (e.g., -m/-mno-) +#============================================================================= + +include(ofa/AddCXXCompilerFlag) +include(ofa/CommonMacros) +include(CheckIncludeFileCXX) + +macro(OFA_HandleX86Options) + + # Special treatment for "native" flag + if(TARGET_ARCHITECTURE STREQUAL "native") + if(MSVC) + # MSVC (on Windows) + message(FATAL_ERROR "[OFA] MSVC does not support \"native\" flag.") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel" + OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") + if(WIN32) + # Intel (on Windows) + AddCXXCompilerFlag("/QxHOST" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + else() + # Intel (on Linux) + AddCXXCompilerFlag("-xHOST" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + endif() + elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC" + OR CMAKE_CXX_COMPILER_ID MATCHES "PGI") + # NVidia HPC / PGI (on Linux/Windows) + AddCXXCompilerFlag("-tp=native" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "SunPro") + # Sun/Oracle Studio (on Linux/Sun OS) + AddCXXCompilerFlag("-native" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") + # Cray (on Linux) + message(FATAL_ERROR "[OFA] Cray compiler does not support \"native\" flag.") + else() + # Others: GNU, Clang and variants + AddCXXCompilerFlag("-march=native" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + endif() + + if(NOT _ok) + message(FATAL_ERROR "[OFA] An error occured while setting the 
\"native\" flag.") + endif() + + elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") + + # Step 1: Generate a list of compiler flags for the specific CPU + set(_march_flag_list) + set(_available_extension_list) + + # Define macros for Intel + macro(_nehalem) + list(APPEND _march_flag_list "nehalem") + list(APPEND _march_flag_list "corei7") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "mmx" "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "popcnt") + endmacro() + macro(_westmere) + list(APPEND _march_flag_list "westmere") + _nehalem() + list(APPEND _available_extension_list "aes" "pclmul") + endmacro() + macro(_sandybridge) + list(APPEND _march_flag_list "sandybridge") + list(APPEND _march_flag_list "corei7-avx") + _westmere() + list(APPEND _available_extension_list "avx") + endmacro() + macro(_ivybridge) + list(APPEND _march_flag_list "ivybridge") + list(APPEND _march_flag_list "core-avx-i") + _sandybridge() + list(APPEND _available_extension_list "rdrnd" "f16c" "fsgsbase") + endmacro() + macro(_haswell) + list(APPEND _march_flag_list "haswell") + list(APPEND _march_flag_list "core-avx2") + _ivybridge() + list(APPEND _available_extension_list "abm" "avx2" "fma" "bmi" "bmi2") + endmacro() + macro(_broadwell) + list(APPEND _march_flag_list "broadwell") + _haswell() + list(APPEND _available_extension_list "rdseed" "adx" "prfchw") + endmacro() + macro(_skylake) + list(APPEND _march_flag_list "skylake") + _broadwell() + list(APPEND _available_extension_list "clflushopt" "xsavec" "xsaves") + endmacro() + macro(_skylake_avx512) + list(APPEND _march_flag_list "skylake-avx512") + _skylake() + list(APPEND _available_extension_list "avx512bw" "avx512cd" "avx512dq" "avx512f" "avx512vl" "clwb" "pku") + endmacro() + macro(_cascadelake) + list(APPEND _march_flag_list "cascadelake") + _skylake_avx512() + list(APPEND _available_extension_list "avx512vnni") + endmacro() + macro(_cooperlake) + list(APPEND _march_flag_list "cooperlake") + _cascadelake() + 
list(APPEND _available_extension_list "avx512bf16") + endmacro() + macro(_cannonlake) + list(APPEND _march_flag_list "cannonlake") + _skylake() + list(APPEND _available_extension_list "avx512bw" "avx512cd" "avx512dq" "avx512f" "avx512vl" "clwb" "pku" "avx512ifma" "avx512vbmi" "sha" "umip") + endmacro() + macro(_icelake) + list(APPEND _march_flag_list "icelake-client") + _cannonlake() + list(APPEND _available_extension_list "avx512bitalg" "avx512vbmi2" "avx512vnni" "avx512vpopcntdq" "clwb" "gfni" "rdpid" "vaes" "vpclmulqdq") + endmacro() + macro(_icelake_avx512) + list(APPEND _march_flag_list "icelake-server") + _icelake() + list(APPEND _available_extension_list "pconfig" "wbnoinvd") + endmacro() + macro(_tigerlake) + list(APPEND _march_flag_list "tigerlake") + _icelake() + list(APPEND _available_extension_list "avx512vp2intersect" "keylocker" "movdir64b" "movdiri" "pconfig" "wbnoinvd") + endmacro() + macro(_alderlake) + list(APPEND _march_flag_list "alderlake") + _broadwell() + list(APPEND _available_extension_list "avxvnni" "cldemote" "clwb" "gfni" "hreset" "kl" "lzcnt" "movdir64b" "movdiri" "pconfig" "pku" "ptwrite" "rdpid" "serialize" "sgx" "umip" "vaes" "vpclmulqdq" "waitpkg" "widekl" "xsave" "xsavec" "xsaveopt" "xsaves") + endmacro() + macro(_sapphirerapids) + list(APPEND _march_flag_list "sapphirerapids") + _skylake_avx512() + list(APPEND _available_extension_list "amx-bf16" "amx-int8" "amx-tile" "avxvnni" "avx512bf16" "avx512vnni" "avx512vp2intersect" "cldemote" "enqcmd" "movdir64b" "movdiri" "ptwrite" "serialize" "tsxldtrk" "uintr" "waitpkg") + endmacro() + macro(_rocketlake) + list(APPEND _march_flag_list "rocketlake") + _skylake_avx512() + list(APPEND _available_extension_list "avx512bitalg" "avx512ifma" "avx512vbmi" "avx512vbmi2" "avx512vnni" "avx512vpopcntdq" "gfni" "rdpid" "sha" "umip" "vaes" "vpclmulqdq") + endmacro() + macro(_knightslanding) + list(APPEND _march_flag_list "knl") + _broadwell() + list(APPEND _available_extension_list "avx512f" 
"avx512pf" "avx512er" "avx512cd") + endmacro() + macro(_knightsmill) + list(APPEND _march_flag_list "knm") + _broadwell() + list(APPEND _available_extension_list "avx512f" "avx512pf" "avx512er" "avx512cd" "avx5124fmaps" "avx5124vnni" "avx512vpopcntdq") + endmacro() + macro(_silvermont) + list(APPEND _march_flag_list "silvermont") + _westmere() + list(APPEND _available_extension_list "rdrnd") + endmacro() + macro(_goldmont) + list(APPEND _march_flag_list "goldmont") + _silvermont() + list(APPEND _available_extension_list "rdseed") + endmacro() + macro(_goldmont_plus) + list(APPEND _march_flag_list "goldmont-plus") + _goldmont() + list(APPEND _available_extension_list "rdpid") + endmacro() + macro(_tremont) + list(APPEND _march_flag_list "tremont") + _goldmont_plus() + endmacro() + + # Define macros for AMD + macro(_k8) + list(APPEND _march_flag_list "k8") + list(APPEND _available_extension_list "mmx" "3dnow" "sse" "sse2") + endmacro() + macro(_k8_sse3) + list(APPEND _march_flag_list "k8-sse3") + _k8() + list(APPEND _available_extension_list "sse3") + endmacro() + macro(_barcelona) # amd10h + list(APPEND _march_flag_list "barcelona") + _k8_sse3() + list(APPEND _available_extension_list "sse4a" "abm") + endmacro() + macro(_amd14h) + list(APPEND _march_flag_list "btver1") + _barcelona() + list(APPEND _available_extension_list "cx16" "ssse3") + endmacro() + macro(_bulldozer) # amd15h + list(APPEND _march_flag_list "bdver1") + _amd14h() + list(APPEND _available_extension_list "sse4.1" "sse4.2" "avx" "xop" "fma4" "lwp" "aes" "pclmul") + endmacro() + macro(_piledriver) + list(APPEND _march_flag_list "bdver2") + _bulldozer() + list(APPEND _available_extension_list "fma" "f16c" "bmi" "tbm") + endmacro() + macro(_steamroller) + list(APPEND _march_flag_list "bdver3") + _piledriver() + list(APPEND _available_extension_list "fsgsbase") + endmacro() + macro(_excavator) + list(APPEND _march_flag_list "bdver4") + _steamroller() + list(APPEND _available_extension_list "bmi2" "avx2" 
"movbe") + endmacro() + macro(_amd16h) + list(APPEND _march_flag_list "btver2") + _amd14h() + list(APPEND _available_extension_list "movbe" "sse4.1" "sse4.2" "avx" "f16c" "bmi" "pclmul" "aes") + endmacro() + macro(_zen) + list(APPEND _march_flag_list "znver1") + _amd16h() + list(APPEND _available_extension_list "bmi2" "fma" "fsgsbase" "avx2" "adcx" "rdseed" "mwaitx" "sha" "clzero" "xsavec" "xsaves" "clflushopt" "popcnt") + endmacro() + macro(_zen2) + list(APPEND _march_flag_list "znver2") + _zen() + list(APPEND _available_extension_list "clwb" "rdpid" "wbnoinvd") + endmacro() + macro(_zen3) + list(APPEND _march_flag_list "znver3") + _zen2() # chain to the Zen 2 lists; calling _zen3() here would recurse without end + list(APPEND _available_extension_list "pku" "vpclmulqdq" "vaes") + endmacro() + + # Intel + if(TARGET_ARCHITECTURE STREQUAL "core" OR TARGET_ARCHITECTURE STREQUAL "core2") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "mmx" "sse" "sse2" "sse3") + elseif(TARGET_ARCHITECTURE STREQUAL "merom") + list(APPEND _march_flag_list "merom") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "mmx" "sse" "sse2" "sse3" "ssse3") + elseif(TARGET_ARCHITECTURE STREQUAL "penryn") + list(APPEND _march_flag_list "penryn") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "mmx" "sse" "sse2" "sse3" "ssse3") + message(STATUS "[OFA] Sadly the Penryn architecture exists in variants with SSE4.1 and without SSE4.1.") + if(_cpu_flags MATCHES "sse4_1") + message(STATUS "[OFA] SSE4.1: enabled (auto-detected from this computer's CPU flags)") + list(APPEND _available_extension_list "sse4.1") + else() + message(STATUS "[OFA] SSE4.1: disabled (auto-detected from this computer's CPU flags)") + endif() + elseif(TARGET_ARCHITECTURE STREQUAL "knm") + _knightsmill() + elseif(TARGET_ARCHITECTURE STREQUAL "knl") + _knightslanding() + elseif(TARGET_ARCHITECTURE STREQUAL "rocketlake") + _rocketlake() + elseif(TARGET_ARCHITECTURE STREQUAL "sapphirerapids") + _sapphirerapids() + 
elseif(TARGET_ARCHITECTURE STREQUAL "alderlake")
+      _alderlake()
+    elseif(TARGET_ARCHITECTURE STREQUAL "tigerlake")
+      _tigerlake()
+    elseif(TARGET_ARCHITECTURE STREQUAL "icelake")
+      _icelake()
+    elseif(TARGET_ARCHITECTURE STREQUAL "icelake-xeon" OR TARGET_ARCHITECTURE STREQUAL "icelake-avx512")
+      _icelake_avx512()
+    elseif(TARGET_ARCHITECTURE STREQUAL "cannonlake")
+      _cannonlake()
+    elseif(TARGET_ARCHITECTURE STREQUAL "cooperlake")
+      _cooperlake()
+    elseif(TARGET_ARCHITECTURE STREQUAL "cascadelake")
+      _cascadelake()
+    elseif(TARGET_ARCHITECTURE STREQUAL "kabylake")
+      _skylake() # "kabylake" is handled with the same settings as "skylake"
+    elseif(TARGET_ARCHITECTURE STREQUAL "skylake-xeon" OR TARGET_ARCHITECTURE STREQUAL "skylake-avx512")
+      _skylake_avx512()
+    elseif(TARGET_ARCHITECTURE STREQUAL "skylake")
+      _skylake()
+    elseif(TARGET_ARCHITECTURE STREQUAL "broadwell")
+      _broadwell()
+    elseif(TARGET_ARCHITECTURE STREQUAL "haswell")
+      _haswell()
+    elseif(TARGET_ARCHITECTURE STREQUAL "ivybridge")
+      _ivybridge()
+    elseif(TARGET_ARCHITECTURE STREQUAL "sandybridge")
+      _sandybridge()
+    elseif(TARGET_ARCHITECTURE STREQUAL "westmere")
+      _westmere()
+    elseif(TARGET_ARCHITECTURE STREQUAL "nehalem")
+      _nehalem()
+    elseif(TARGET_ARCHITECTURE STREQUAL "tremont")
+      _tremont()
+    elseif(TARGET_ARCHITECTURE STREQUAL "goldmont-plus")
+      _goldmont_plus()
+    elseif(TARGET_ARCHITECTURE STREQUAL "goldmont")
+      _goldmont()
+    elseif(TARGET_ARCHITECTURE STREQUAL "silvermont")
+      _silvermont()
+    elseif(TARGET_ARCHITECTURE STREQUAL "bonnell")
+      list(APPEND _march_flag_list "bonnell")
+      list(APPEND _march_flag_list "atom")
+      list(APPEND _march_flag_list "core2")
+      list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3")
+    elseif(TARGET_ARCHITECTURE STREQUAL "atom")
+      list(APPEND _march_flag_list "atom")
+      list(APPEND _march_flag_list "core2")
+      list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3")
+
+    # AMD
+    elseif(TARGET_ARCHITECTURE STREQUAL "k8")
+      _k8()
+    elseif(TARGET_ARCHITECTURE STREQUAL "k8-sse3")
+      _k8_sse3() # the macro is defined with a leading underscore; k8_sse3() is an unknown command
+
elseif(TARGET_ARCHITECTURE STREQUAL "barcelona" OR + TARGET_ARCHITECTURE STREQUAL "istanbul" OR + TARGET_ARCHITECTURE STREQUAL "magny-cours") + _barcelona() + elseif(TARGET_ARCHITECTURE STREQUAL "amd14h") + _amd14h() + elseif(TARGET_ARCHITECTURE STREQUAL "bulldozer" OR + TARGET_ARCHITECTURE STREQUAL "interlagos") + _bulldozer() + elseif(TARGET_ARCHITECTURE STREQUAL "piledriver") + _piledriver() + elseif(TARGET_ARCHITECTURE STREQUAL "steamroller") + _steamroller() + elseif(TARGET_ARCHITECTURE STREQUAL "excavator") + _excavator() + elseif(TARGET_ARCHITECTURE STREQUAL "amd16h") + _amd16h() + elseif(TARGET_ARCHITECTURE STREQUAL "zen") + _zen() + elseif(TARGET_ARCHITECTURE STREQUAL "zen2") + _zen2() + elseif(TARGET_ARCHITECTURE STREQUAL "zen3") + _zen3() + + # Others + elseif(TARGET_ARCHITECTURE STREQUAL "generic") + list(APPEND _march_flag_list "generic") + list(APPEND _available_extension_list "sse") + elseif(TARGET_ARCHITECTURE STREQUAL "none") + # add this clause to remove it from the else clause + + else() + message(FATAL_ERROR "[OFA] Unknown target architecture: \"${TARGET_ARCHITECTURE}\". 
Please set TARGET_ARCHITECTURE to a supported value.")
+    endif()
+
+    # Clean list of available extensions
+    list(SORT _available_extension_list)
+    list(REMOVE_DUPLICATES _available_extension_list)
+
+    if(OFA_VERBOSE)
+      if(_march_flag_list)
+        string(REPLACE ";" ", " _str "${_march_flag_list}")
+        string(TOUPPER ${_str} _str)
+        message(STATUS "[OFA] CPU architectures: " ${_str})
+      endif()
+      if(_available_extension_list)
+        list(LENGTH _available_extension_list _len)
+        string(REPLACE ";" ", " _str "${_available_extension_list}")
+        string(TOUPPER ${_str} _str)
+        message(STATUS "[OFA] Extensions (${_len} available): ${_str}")
+      endif()
+    endif()
+
+    set(_check_extension_list)
+    set(_check_extension_flag_list)
+    set(_disable_extension_flag_list)
+    set(_enable_extension_flag_list)
+    set(_ignore_extension_flag_list)
+
+    # Set compiler-specific option names
+    if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+      set(_enable_flag "/arch:")
+      unset(_disable_flag) # no disable prefix for MSVC; later code tests DEFINED _disable_flag
+    elseif(CMAKE_CXX_COMPILER_ID MATCHES "SunPro")
+      set(_enable_flag "-xarch=")
+      unset(_disable_flag)
+    else()
+      set(_enable_flag "-m")
+      set(_disable_flag "-mno-")
+    endif()
+
+    # Step 2: Enable/disable feature flags based on available CPU
+    #         features, user-defined USE_<extension> variables and
+    #         the capabilities of the host system's compiler and linker
+    file(READ ${CMAKE_SOURCE_DIR}/cmake/ofa/ChecksX86.txt _checks)
+    string(REGEX REPLACE "[:;]" "|" _checks "${_checks}")
+    string(REPLACE "\n" ";" _checks "${_checks}")
+
+    set(_skip_check FALSE)
+
+    # Iterate over the list of checks line by line
+    foreach (_check ${_checks})
+      string(REPLACE "|" ";" _check "${_check}")
+
+      # Parse for special lines
+      if ("${_check}" MATCHES "^#" ) # Skip comment
+        continue()
+
+      elseif ("${_check}" MATCHES "^push_enable" ) # Start enable block
+        list(GET _check 1 _push_enable_list)
+        string(REPLACE "," ";" _push_enable_list "${_push_enable_list}")
+        _ofa_find(_push_enable_list "${CMAKE_CXX_COMPILER_ID}" _found)
+        if(_found)
+          list(INSERT _skip_check 0 FALSE)
+ else() + list(INSERT _skip_check 0 TRUE) + endif() + continue() + + elseif ("${_check}" MATCHES "^pop_enable" ) # End enable block + list(REMOVE_AT _skip_check 0) + continue() + + elseif ("${_check}" MATCHES "^push_disable" ) # Start disable block + list(GET _check 1 _push_disable_list) + string(REPLACE "," ";" _push_disable_list "${_push_disable_list}") + _ofa_find(_push_disable_list "${CMAKE_CXX_COMPILER_ID}" _found) + if(_found) + list(INSERT _skip_check 0 TRUE) + else() + # Compiler was not found in the list, so we keep its previous status + list(GET _skip_check 0 _skip) + list(INSERT _skip_check 0 ${_skip}) + endif() + continue() + + elseif ("${_check}" MATCHES "^pop_disable" ) # End disable block + list(REMOVE_AT _skip_check 0) + continue() + endif() + + # Skip test? + list(GET _skip_check 0 _skip) + if(_skip) + continue() + endif() + + # Extract extra CPU extensions, header files, function name, and parameters + list(GET _check 0 _check_extension_flags) + list(GET _check 1 _check_headers) + list(GET _check 2 _check_function) + list(GET _check 3 _check_params) + + # Convert list of extensions into compiler flags + string(REPLACE "," ";" _check_extension_flags "${_check_extension_flags}") + list(GET _check_extension_flags 0 _extension_flag) + list(APPEND _check_extension_flag_list "${_extension_flag}") + string(REPLACE ";" " ${_enable_flag}" _check_extra_flags " ${_enable_flag}${_check_extension_flags}") + + # Extract optional extension alias + list(LENGTH _check _len) + if(${_len} EQUAL 5) + list(GET _check 4 _extension) + else() + set(_extension "${_extension_flag}") + endif() + + list(APPEND _check_extension_list "${_extension}") + + # Define USE_<_extension_flag> variable + set(_useVar "USE_${_extension_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "[-.+/:= ]" "_" _useVar "${_useVar}") + + # If not specified externally, set the value of the + # USE_<_extension_flag> variable to TRUE if it is found in the list + # of available extensions 
and FALSE otherwise + if(NOT DEFINED ${_useVar}) + _ofa_find(_available_extension_list "${_extension}" _found) + set(${_useVar} ${_found}) + endif() + + if(${_useVar}) + # Check if the compiler supports the -m<_extension_flag> + # flag and can compile the provided test code with it + set(_code "\nint main() { ${_check_function}(${_check_params})\; return 0\; }") + AddCXXCompilerFlag("${_enable_flag}${_extension_flag}" + EXTRA_FLAGS ${_check_extra_flags} + HEADERS ${_check_headers} + CODE "${_code}" + RESULT _ok) + if(NOT ${_ok}) + # Test failed + set(${_useVar} FALSE CACHE BOOL "Use ${_extension} extension.") + else() + # Test succeeded + set(${_useVar} TRUE CACHE BOOL "Use ${_extension} extension.") + endif() + else() + # Disable extension without running tests + set(${_useVar} FALSE CACHE BOOL "Use ${_extension} extension.") + endif() + mark_as_advanced(${_useVar}) + endforeach() + + # Generate lists of enabled/disabled flags + list(REMOVE_DUPLICATES _check_extension_flag_list) + foreach(_extension_flag ${_check_extension_flag_list}) + _ofa_find(_available_extension_list "${_extension_flag}" _found) + set(_useVar "USE_${_extension_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "[-.+/:= ]" "_" _useVar "${_useVar}") + + if(${_useVar}) + # Add <_extension_flag> to list of enabled extensions (if supported) + set(_haveVar "HAVE_${_enable_flag}${_extension_flag}") + string(REGEX REPLACE "[-.+/:= ]" "_" _haveVar "${_haveVar}") + if(NOT ${_haveVar}) + if(OFA_VERBOSE) + message(STATUS "[OFA] Ignoring flag ${_enable_flag}${_extension_flag} because checks failed") + endif() + list(APPEND _ignore_extension_flag_list "${_extension_flag}") + continue() + endif() + list(APPEND _enable_extension_flag_list "${_extension_flag}") + elseif(DEFINED _disable_flag) + # Add <_extension_flag> to list of disabled extensions (if supported) + AddCXXCompilerFlag("${_disable_flag}${_extension_flag}") + set(_haveVar "HAVE_${_disable_flag}${_extension_flag}") + string(REGEX 
REPLACE "[-.+/:= ]" "_" _haveVar "${_haveVar}") + if(NOT ${_haveVar}) + if(OFA_VERBOSE) + message(STATUS "[OFA] Ignoring flag ${_disable_flag}${_extension_flag} because checks failed") + endif() + list(APPEND _ignore_extension_flag_list "${_extension_flag}") + continue() + endif() + list(APPEND _disable_extension_flag_list "${_extension_flag}") + else() + list(APPEND _ignore_extension_flag_list "${_extension_flag}") + endif() + endforeach() + + if(OFA_VERBOSE) + # Print checked extension flags + if(_check_extension_flag_list) + list(LENGTH _check_extension_flag_list _len) + list(SORT _check_extension_flag_list) + string(REPLACE ";" ", " _str "${_check_extension_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (${_len} checked): ${_str}") + endif() + # Print enabled extension flags + if(_enable_extension_flag_list) + list(LENGTH _enable_extension_flag_list _len) + list(SORT _enable_extension_flag_list) + string(REPLACE ";" ", " _str "${_enable_extension_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (${_len} enabled): ${_str}") + endif() + # Print disabled extension flags + if(_disable_extension_flag_list) + list(LENGTH _disable_extension_flag_list _len) + list(SORT _disable_extension_flag_list) + string(REPLACE ";" ", " _str "${_disable_extension_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (${_len} disabled): ${_str}") + endif() + # Print ignored extension flags + if(_ignore_extension_flag_list) + list(LENGTH _ignore_extension_flag_list _len) + list(SORT _ignore_extension_flag_list) + string(REPLACE ";" ", " _str "${_ignore_extension_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (${_len} ignored): ${_str}") + endif() + # Print unhandled extension flags + set(_unhandled_extension_list) + foreach(_extension ${_available_extension_list}) + _ofa_find(_check_extension_list "${_extension}" _found) + if(NOT _found) + list(APPEND 
_unhandled_extension_list ${_extension})
+        endif()
+      endforeach()
+      if(_unhandled_extension_list)
+        list(LENGTH _unhandled_extension_list _len)
+        list(SORT _unhandled_extension_list)
+        string(REPLACE ";" ", " _str "${_unhandled_extension_list}")
+        string(TOUPPER ${_str} _str)
+        message(STATUS "[OFA] Extensions (${_len} unhandled): ${_str}")
+      endif()
+    endif()
+
+    # Step 3: Set compiler-specific flags (e.g., -m<extension>/-mno-<extension>)
+    if(MSVC AND MSVC_VERSION GREATER 1700)
+      _ofa_find(_enable_extension_flag_list "avx512f" _found)
+      if(_found)
+        AddCXXCompilerFlag("/arch:AVX512" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _found)
+      endif()
+      if(NOT _found)
+        _ofa_find(_enable_extension_flag_list "avx2" _found)
+        if(_found)
+          AddCXXCompilerFlag("/arch:AVX2" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _found)
+        endif()
+      endif()
+      if(NOT _found)
+        _ofa_find(_enable_extension_flag_list "avx" _found)
+        if(_found)
+          AddCXXCompilerFlag("/arch:AVX" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _found)
+        endif()
+      endif()
+      if(NOT _found)
+        _ofa_find(_enable_extension_flag_list "sse2" _found)
+        if(_found)
+          AddCXXCompilerFlag("/arch:SSE2" FLAGS OFA_ARCHITECTURE_FLAGS)
+        endif()
+      endif()
+      if(NOT _found)
+        _ofa_find(_enable_extension_flag_list "sse" _found)
+        if(_found)
+          AddCXXCompilerFlag("/arch:SSE" FLAGS OFA_ARCHITECTURE_FLAGS)
+        endif()
+      endif()
+      foreach(_extension ${_enable_extension_flag_list})
+        string(TOUPPER "${_extension}" _extension)
+        string(REGEX REPLACE "[-.+/:= ]" "_" _extension "__${_extension}__") # regex, so "sse4.1" yields __SSE4_1__
+        add_definitions("-D${_extension}")
+      endforeach(_extension)
+
+    elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel"
+        OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")
+
+      if(WIN32)
+        # Intel (on Windows)
+        set(OFA_map_knl "-QxKNL;-QxMIC-AVX512")
+        set(OFA_map_knm "-QxKNM;-QxMIC-AVX512")
+        set(OFA_map_rocketlake "-QxROCKETLAKE;-QxCORE-AVX512")
+        set(OFA_map_sapphirerapids "-QxSAPPHIRERAPIDS;-QxCORE-AVX512")
+        set(OFA_map_alderlake "-QxALDERLAKE;-QxCORE-AVX512")
+        set(OFA_map_tigerlake "-QxTIGERLAKE;-QxCORE-AVX512")
+ set(OFA_map_icelake-server "-QxICELAKE-SERVER;-QxCORE-AVX512") + set(OFA_map_icelake-avx512 "-QxICELAKE-SERVER;-QxCORE-AVX512") + set(OFA_map_icelake-client "-QxICELAKE-CLIENT;-QxCORE-AVX512") + set(OFA_map_icelake "-QxICELAKE-CLIENT;-QxCORE-AVX512") + set(OFA_map_cannonlake "-QxCANNONLAKE;-QxCORE-AVX512") + set(OFA_map_cooperlake "-QxCOOPERLAKE;-QxCORE-AVX512") + set(OFA_map_cascadelake "-QxCASCADELAKE;-QxCORE-AVX512") + set(OFA_map_skylake-avx512 "-QxSKYLAKE-AVX512;-QxCORE-AVX512") + set(OFA_map_skylake "-QxSKYLAKE;-QxCORE-AVX2") + set(OFA_map_broadwell "-QxBROADWELL;-QxCORE-AVX2") + set(OFA_map_haswell "-QxHASWELL;-QxCORE-AVX2") + set(OFA_map_ivybridge "-QxIVYBRIDGE;-QxCORE-AVX-I") + set(OFA_map_sandybridge "-QxSANDYBRIDGE;-QxAVX") + set(OFA_map_westmere "-QxSSE4.2") + set(OFA_map_nehalem "-QxSSE4.2") + set(OFA_map_penryn "-QxSSSE3") + set(OFA_map_merom "-QxSSSE3") + set(OFA_map_core2 "-QxSSE3") + set(_ok FALSE) + else() + # Intel (in Linux) + set(OFA_map_knl "-xKNL;-xMIC-AVX512") + set(OFA_map_knm "-xKNM;-xMIC-AVX512") + set(OFA_map_rocketlake "-xROCKETLAKE;-xCORE-AVX512") + set(OFA_map_sapphirerapids "-xSAPPHIRERAPIDS;-xCORE-AVX512") + set(OFA_map_alderlake "-xALDERLAKE;-xCORE-AVX512") + set(OFA_map_tigerlake "-xTIGERLAKE;-xCORE-AVX512") + set(OFA_map_icelake-server "-xICELAKE-SERVER;-xCORE-AVX512") + set(OFA_map_icelake-avx512 "-xICELAKE-SERVER;-xCORE-AVX512") + set(OFA_map_icelake-client "-xICELAKE-CLIENT;-xCORE-AVX512") + set(OFA_map_icelake "-xICELAKE-CLIENT;-xCORE-AVX512") + set(OFA_map_cannonlake "-xCANNONLAKE;-xCORE-AVX512") + set(OFA_map_cooperlake "-xCOOPERLAKE;-xCORE-AVX512") + set(OFA_map_cascadelake "-xCASCADELAKE;-xCORE-AVX512") + set(OFA_map_skylake-avx512 "-xSKYLAKE-AVX512;-xCORE-AVX512") + set(OFA_map_skylake "-xSKYLAKE;-xCORE-AVX2") + set(OFA_map_broadwell "-xBROADWELL;-xCORE-AVX2") + set(OFA_map_haswell "-xHASWELL;-xCORE-AVX2") + set(OFA_map_ivybridge "-xIVYBRIDGE;-xCORE-AVX-I") + set(OFA_map_sandybridge "-xSANDYBRIDGE;-xAVX") + 
set(OFA_map_westmere "-xSSE4.2") + set(OFA_map_nehalem "-xSSE4.2") + set(OFA_map_penryn "-xSSSE3") + set(OFA_map_merom "-xSSSE3") + set(OFA_map_core2 "-xSSE3") + set(_ok FALSE) + endif() + + foreach(_arch ${_march_flag_list}) + if(DEFINED OFA_map_${_arch}) + foreach(_flag ${OFA_map_${_arch}}) + AddCXXCompilerFlag(${_flag} FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + if(_ok) + break() + endif() + endforeach() + if(_ok) + break() + endif() + endif() + endforeach() + if(NOT _ok) + # This is the Intel compiler, so SSE2 is a very reasonable baseline. + message(STATUS "[OFA] Did not recognize the requested architecture flag ${_arch}, falling back to SSE2") + if(WIN32) + AddCXXCompilerFlag("-QxSSE2" FLAGS OFA_ARCHITECTURE_FLAGS) + else() + AddCXXCompilerFlag("-xSSE2" FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + endif() + + # Set -m<_extension> flag for enabled features + foreach(_extension ${_enable_extension_flag_list}) + AddCXXCompilerFlag("${_enable_flag}${_extension}" FLAGS OFA_ARCHITECTURE_FLAGS) + endforeach(_extension) + + # Set -mno-<_extension> flag for disabled features + if(DEFINED _disable_flag) + foreach(_extension ${_disable_extension_flag_list}) + AddCXXCompilerFlag("${_disable_flag}${_extension}" FLAGS OFA_ARCHITECTURE_FLAGS) + endforeach(_extension) + endif() + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "SunPro") + + # Set -xtarget flag + foreach(_flag ${_march_flag_list}) + AddCXXCompilerFlag("-xtarget=${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _good) + if(_good) + break() + endif(_good) + endforeach(_flag) + + # Set -xarch= flag for enabled features + foreach(_flag ${_enable_extension_flag_list}) + AddCXXCompilerFlag("-xarch=${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) + endforeach(_flag) + + # TODO PGI/Cray ... 
+
+    else()
+      # Others: GNU, Clang and variants
+
+      # Set -march flag
+      foreach(_flag ${_march_flag_list})
+        AddCXXCompilerFlag("-march=${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _good)
+        if(_good)
+          break()
+        endif(_good)
+      endforeach(_flag)
+
+      # Set -m<feature> flag for enabled features
+      foreach(_flag ${_enable_extension_flag_list})
+        AddCXXCompilerFlag("-m${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS)
+      endforeach(_flag)
+
+      # Set -mno-feature flag for disabled features
+      foreach(_flag ${_disable_extension_flag_list})
+        AddCXXCompilerFlag("-mno-${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS)
+      endforeach(_flag)
+    endif()
+  endif()
+
+  # Compile code with profiling instrumentation
+  if(TARGET_PROFILER STREQUAL "gprof")
+    AddCXXCompilerFlag("-pg" FLAGS OFA_ARCHITECTURE_FLAGS)
+  elseif(TARGET_PROFILER STREQUAL "vtune")
+    if (CMAKE_CXX_COMPILER_ID MATCHES "Intel")
+      # Need to check if this also works on Windows
+      AddCXXCompilerFlag("-g" FLAGS OFA_ARCHITECTURE_FLAGS)
+      AddCXXCompilerFlag("-debug inline-debug-info" FLAGS OFA_ARCHITECTURE_FLAGS)
+      AddCXXCompilerFlag("-D TBB_USE_THREADING_TOOLS" FLAGS OFA_ARCHITECTURE_FLAGS)
+      AddCXXCompilerFlag("-parallel-source-info=2" FLAGS OFA_ARCHITECTURE_FLAGS)
+      AddCXXCompilerFlag("-gline-tables-only" FLAGS OFA_ARCHITECTURE_FLAGS)
+      AddCXXCompilerFlag("-fdebug-info-for-profiling" FLAGS OFA_ARCHITECTURE_FLAGS)
+      AddCXXCompilerFlag("-Xsprofile" FLAGS OFA_ARCHITECTURE_FLAGS)
+    endif()
+  endif()
+
+  # Remove duplicate flags
+  list(REMOVE_DUPLICATES OFA_ARCHITECTURE_FLAGS)
+
+  if(OFA_VERBOSE)
+    string(REPLACE ";" ", " _str "${OFA_ARCHITECTURE_FLAGS}")
+    message(STATUS "OFA_ARCHITECTURE_FLAGS: " ${_str})
+  endif()
+
+endmacro(OFA_HandleX86Options)
diff --git a/cmake/ofa/cpuinfo_x86.cxx b/cmake/ofa/cpuinfo_x86.cxx
new file mode 100644
index 0000000000..b4bd46f2c3
--- /dev/null
+++ b/cmake/ofa/cpuinfo_x86.cxx
@@ -0,0 +1,711 @@
+#include <cstdio>
+#include <cstring>
+#include <string>
+
+#define print_features(reg,features,n) \
+  for (int i=0; i<n; ++i) \
+    printf("%s", (reg>>i & 0x1) && !features[i].empty() \
+           ? (features[i]+" ").c_str() : "");
+
+// Get the vendor ID
+void getVendorID() {
+  unsigned int r[4] = { 0, 0, 0, 0 }; // EAX, EBX, ECX, EDX
+
+  // EAX=0x00000000: Vendor ID
+  // CPUID is issued as a single asm statement with explicit input and
+  // output operands so that the compiler knows which registers are
+  // read and overwritten (EAX, EBX, ECX and EDX).
+  __asm__ __volatile__("cpuid"
+                       : "=a" (r[0]), "=b" (r[1]), "=c" (r[2]), "=d" (r[3])
+                       : "a" (0x00000000), "c" (0));
+
+  // Assemble the vendor string in the order EBX, EDX, ECX
+  char vendorID[13]; vendorID[12] = 0;
+  memcpy(&vendorID[0],&r[1],4);
+  memcpy(&vendorID[4],&r[3],4);
+  memcpy(&vendorID[8],&r[2],4);
+
+  printf ("vendor_id : %s\n", vendorID);
+}
+
+// Get processor information
+void getProcInfo() {
+  unsigned int r[4] = { 0, 0, 0, 0 }; // EAX, EBX, ECX, EDX
+
+  // EAX=0x00000001: Processor Info (EAX gives stepping, model and family)
+  __asm__ __volatile__("cpuid"
+                       : "=a" (r[0]), "=b" (r[1]), "=c" (r[2]), "=d" (r[3])
+                       : "a" (0x00000001), "c" (0));
+
+  int stepping = r[0]>>0 & 0xF;
+  int model    = r[0]>>4 & 0xF;
+  int family   = r[0]>>8 & 0xF;
+  if(family == 6 || family == 15)
+    model += (r[0]>>16 & 0xF)<<4; // extended model, bits 19:16
+  if(family == 15) family += r[0]>>20 & 0xFF; // extended family, bits 27:20
+  printf ("cpu family : %d\n", family);
+  printf ("model : %d\n", model);
+  printf ("stepping : %d\n", stepping);
+}
+
+// Get processor features
+void getFeatures() {
+  int eax_max,ecx_max,eax,ebx,ecx,edx;
+
+  // Note: If the comment begins with a quoted string, that string is
+  // used in /proc/cpuinfo instead of the macro name. If the string is
+  // "", this feature bit is not displayed in /proc/cpuinfo at all.
+ + // CPU flags + printf ("flags : "); + + // EAX=0x00000000: largest value that EAX can be set to before calling CPUID + __asm__("mov $0x00000000, %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (eax_max)); + + if (eax_max >= 0x00000001) { + + // EAX=0x00000001: Processor Info and Feature Bits + __asm__("mov $0x00000001 , %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //feature flags + + // Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 + { + std::string features[] = { "fpu", /* Onboard FPU */ + "vme", /* Virtual Mode Extensions */ + "de", /* Debugging Extensions */ + "pse", /* Page Size Extensions */ + "tsc", /* Time Stamp Counter */ + "msr", /* Model-Specific Registers */ + "pae", /* Physical Address Extensions */ + "mce", /* Machine Check Exception */ + "cx8", /* CMPXCHG8 instruction */ + "apic", /* Onboard APIC */ + "", /* Reserved */ + "sep", /* SYSENTER/SYSEXIT */ + "mtrr", /* Memory Type Range Registers */ + "pge", /* Page Global Enable */ + "mca", /* Machine Check Architecture */ + "cmov", /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ + "pat", /* Page Attribute Table */ + "pse36", /* 36-bit PSEs */ + "pn", /* Processor serial number */ + "clflush", /* CLFLUSH instruction */ + "", /* Reserved */ + "dts", /* "dts" Debug Store */ + "acpi", /* ACPI via MSR */ + "mmx", /* Multimedia Extensions */ + "fxsr", /* FXSAVE/FXRSTOR, CR4.OSFXSR */ + "sse", /* "sse" */ + "sse2", /* "sse2" */ + "ss", /* "ss" CPU self snoop */ + "ht", /* Hyper-Threading */ + "tm", /* "tm" Automatic clock control */ + "ia64", /* IA-64 processor */ + "pbe" /* Pending Break Enable */ + }; + print_features(edx, features, 32); + } + + // Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 + { + std::string features[] = { "sse3", /* "pni" SSE-3 */ + "pclmulqdq", /* PCLMULQDQ instruction */ + "dtes64", /* 64-bit Debug Store */ + "monitor", /* 
"monitor" MONITOR/MWAIT support */ + "ds_cpl", /* "ds_cpl" CPL-qualified (filtered) Debug Store */ + "vmx", /* Hardware virtualization */ + "smx", /* Safer Mode eXtensions */ + "est", /* Enhanced SpeedStep */ + "tm2", /* Thermal Monitor 2 */ + "ssse3", /* Supplemental SSE-3 */ + "cid", /* Context ID */ + "sdbg", /* Silicon Debug */ + "fma", /* Fused multiply-add */ + "cx16", /* CMPXCHG16B instruction */ + "xtpr", /* Send Task Priority Messages */ + "pdcm", /* Perf/Debug Capabilities MSR */ + "", /* Reserved */ + "pcid", /* Process Context Identifiers */ + "dca", /* Direct Cache Access */ + "sse4_1", /* "sse4_1" SSE-4.1 */ + "sse4_2", /* "sse4_2" SSE-4.2 */ + "x2apic", /* X2APIC */ + "movbe", /* MOVBE instruction */ + "popcnt", /* POPCNT instruction */ + "tsc_deadline_timer", /* TSC deadline timer */ + "aes", /* AES instructions */ + "xsave", /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */ + "", /* "" XSAVE instruction enabled in the OS */ + "avx", /* Advanced Vector Extensions */ + "f16c", /* 16-bit FP conversions */ + "rdrand", /* RDRAND instruction */ + "hypervisor" /* Running on a hypervisor */ + }; + print_features(ecx, features, 32); + } + } // EAX=0x00000001 + + // if (eax_max >=0x00000006) { + // // EAX=0x00000006: Extended Features + // __asm__("mov $0x00000006 , %eax\n\t"); + // __asm__("cpuid\n\t"); + // __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags + // __asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags + // __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //extended feature flags + // __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags + + // // Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 + + // { + // std::string features[] = { "cxmmx", /* Cyrix MMX extensions */ + // "k6_mtrr", /* AMD K6 nonstandard MTRRs */ + // "cyrix_arr", /* Cyrix ARRs (= MTRRs) */ + // "centaur_mcr", /* Centaur MCRs (= MTRRs) */ + // "k8", /* "" Opteron, Athlon64 */ + // "", /* "" Athlon */ + // "", /* "" P3 */ + // "", 
/* "" P4 */ + // "constant_tsc", /* TSC ticks at a constant rate */ + // "up", /* SMP kernel running on UP */ + // "art", /* Always running timer (ART) */ + // "arch_perfmon", /* Intel Architectural PerfMon */ + // "pebs", /* Precise-Event Based Sampling */ + // "bts", /* Branch Trace Store */ + // "", /* "" syscall in IA32 userspace */ + // "", /* "" sysenter in IA32 userspace */ + // "rep_good", /* REP microcode works well */ + // "", /* Reserved */ + // "", /* "" LFENCE synchronizes RDTSC */ + // "acc_power", /* AMD Accumulated Power Mechanism */ + // "nopl", /* The NOPL (0F 1F) instructions */ + // "", /* "" Always-present feature */ + // "xtopology", /* CPU topology enum extensions */ + // "tsc_reliable", /* TSC is known to be reliable */ + // "nonstop_tsc", /* TSC does not stop in C states */ + // "cpuid", /* CPU has CPUID instruction itself */ + // "extd_apicid", /* Extended APICID (8 bits) */ + // "amd_dcm", /* AMD multi-node processor */ + // "aperfmperf", /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ + // "rapl", /* AMD/Hygon RAPL interface */ + // "nonstop_tsc_s3", /* TSC doesn't stop in S3 state */ + // "tsc_known_freq" /* TSC has known frequency */ + // }; + // print_features(ecx, features, 32); + // } + // } // EAX=0x00000006 + + if (eax_max >= 0x00000007) { + // EAX=0x00000007, ECX=0x00000000: Extended Features + __asm__("mov $0x00000007 , %eax\n\t"); + __asm__("mov $0x00000000 , %ecx\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (ecx_max)); //gives maximum ECX value + __asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags + __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //extended feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags + + // Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 + { + std::string features[] = { "fsgsbase", /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ + "tsc_adjust", /* TSC adjustment MSR 0x3B */ + "sgx", /* 
Software Guard Extensions */ + "bmi1", /* 1st group bit manipulation extensions */ + "hle", /* Hardware Lock Elision */ + "avx2", /* AVX2 instructions */ + "", /* "" FPU data pointer updated only on x87 exceptions */ + "smep", /* Supervisor Mode Execution Protection */ + "bmi2", /* 2nd group bit manipulation extensions */ + "erms", /* Enhanced REP MOVSB/STOSB instructions */ + "invpcid", /* Invalidate Processor Context ID */ + "rtm", /* Restricted Transactional Memory */ + "cqm", /* Cache QoS Monitoring */ + "", /* "" Zero out FPU CS and FPU DS */ + "mpx", /* Memory Protection Extension */ + "rdt_a", /* Resource Director Technology Allocation */ + "avx512f", /* AVX-512 Foundation */ + "avx512dq", /* AVX-512 DQ (Double/Quad granular) Instructions */ + "rdseed", /* RDSEED instruction */ + "adx", /* ADCX and ADOX instructions */ + "smap", /* Supervisor Mode Access Prevention */ + "avx512ifma", /* AVX-512 Integer Fused Multiply-Add instructions */ + "pcommit", + "clflushopt", /* CLFLUSHOPT instruction */ + "clwb", /* CLWB instruction */ + "intel_pt", /* Intel Processor Trace */ + "avx512pf", /* AVX-512 Prefetch */ + "avx512er", /* AVX-512 Exponential and Reciprocal */ + "avx512cd", /* AVX-512 Conflict Detection */ + "sha_ni", /* SHA1/SHA256 Instruction Extensions */ + "avx512bw", /* AVX-512 BW (Byte/Word granular) Instructions */ + "avx512vl" /* AVX-512 VL (128/256 Vector Length) Extensions */ + }; + print_features(ebx, features, 32); + } + + // Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 + { + std::string features[] = { "prefetchwt1", + "avx512vbmi", /* AVX512 Vector Bit Manipulation instructions*/ + "umip", /* User Mode Instruction Protection */ + "pku", /* Protection Keys for Userspace */ + "ospke", /* OS Protection Keys Enable */ + "waitpkg", /* UMONITOR/UMWAIT/TPAUSE Instructions */ + "avx512vbmi2", /* Additional AVX512 Vector Bit Manipulation Instructions */ + "cetss", + "gfni", /* Galois Field New Instructions */ + "vaes", /* Vector AES 
*/ + "vpclmulqdq", /* Carry-Less Multiplication Double Quadword */ + "avx512vnni", /* Vector Neural Network Instructions */ + "avx512bitalg", /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ + "tme", /* Intel Total Memory Encryption */ + "avx512vpopcntdq", /* POPCNT for vectors of DW/QW */ + "", /* Reserved */ + "la57", /* 5-level page tables */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "rdpid", /* RDPID instruction */ + "keylocker", + "bus_lock_detect", /* Bus Lock detect */ + "cldemote", /* CLDEMOTE instruction */ + "", /* Reserved */ + "movdiri", /* MOVDIRI instruction */ + "movdir64b", /* MOVDIR64B instruction */ + "enqcmd", /* ENQCMD and ENQCMDS instructions */ + "sgx_lc", /* Software Guard Extensions Launch Control */ + "pks" + }; + print_features(ecx, features, 32); + } + + // Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 + { + std::string features[] = { "", /* Reserved */ + "", /* Reserved */ + "avx5124vnniw", /* AVX-512 Neural Network Instructions */ + "avx5124fmaps", /* AVX-512 Multiply Accumulation Single precision */ + "fsrm", /* Fast Short Rep Mov */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "avx512vp2intersect", /* AVX-512 Intersect for D/Q */ + "srbds", /* "" SRBDS mitigation MSR available */ + "md_clear", /* VERW clears CPU buffers */ + "", /* "" RTM transaction always aborts */ + "", /* Reserved */ + "", /* "" TSX_FORCE_ABORT */ + "serialize", /* SERIALIZE instruction */ + "", /* "" This part has CPUs of more than one type */ + "tsxldtrk", /* TSX Suspend Load Address Tracking */ + "", /* Reserved */ + "pconfig", /* Intel PCONFIG */ + "arch_lbr", /* Intel ARCH LBR */ + "cet_ibt", + "", /* Reserved */ + "amx-bf16", /* AMX BFLOAT16 Support */ + "avx512fp16", /* AVX512 FP16 */ + "amx-tile", /* AMX tile Support */ + "amx-int8", /* AMX int8 Support */ + "ibrs ibpb", /* "" Speculation Control (IBRS + IBPB) */ + "stibp", /* "" Single 
Thread Indirect Branch Predictors */ + "flush_l1d", /* Flush L1D cache */ + "arch_capabilities", /* IA32_ARCH_CAPABILITIES MSR (Intel) */ + "", /* "" IA32_CORE_CAPABILITIES MSR */ + "ssbd" /* "" Speculative Store Bypass Disable */ + }; + print_features(edx, features, 32); + } + + if (ecx_max >= 0x00000001) { + // EAX=0x00000007, ECX=0x00000001: Extended Features + __asm__("mov $0x00000007 , %eax\n\t"); + __asm__("mov $0x00000001 , %ecx\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags + + // Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 + { + std::string features[] = { "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "avx_vnni", /* AVX VNNI instructions */ + "avx512bf16", /* AVX512 BFLOAT16 instructions */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "" /* Reserved */ + }; + print_features(eax, features, 32); + } + } // ECX=0x00000001 + } // EAX=0x00000007 + + if (eax_max >= 0x0000000d) { + // EAX=0x0000000d, ECX=0x00000001: Extended Features + __asm__("mov $0x0000000d , %eax\n\t"); + __asm__("mov $0x00000001 , %ecx\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags + + // Intel-defined CPU features, CPUID level 0x0000000d:1 (EAX), word 10 + { + std::string features[] = { "xsaveopt", /* XSAVEOPT instruction */ + "xsavec", /* XSAVEC instruction */ + "xgetbv1", /* XGETBV with ECX = 1 instruction */ + "xsaves", /* XSAVES/XRSTORS instructions */ 
+ "xfd", /* "" eXtended Feature Disabling */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "" /* Reserved */ + }; + print_features(eax, features, 32); + } + } // EAX=0x0000000d + + // EAX=0x80000000: largest value that EAX can be set to before calling CPUID + __asm__("mov $0x80000000, %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (eax_max)); + + if (eax_max >= 0x80000001) { + + // EAX=80000001: Processor Info and Feature Bits + __asm__("mov $0x80000001 , %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //feature flags + + // AMD-defined CPU features, CPUID level 0x80000001 (EDX), word 1 + // Don't duplicate feature flags which are redundant with Intel! 
+ { + std::string features[] = { "", /* Onboard FPU */ + "", /* Virtual Mode Extensions */ + "", /* Debugging Extensions */ + "", /* Page Size Extensions */ + "", /* Time Stamp Counter */ + "", /* Model-Specific Registers */ + "", /* Physical Address Extensions */ + "", /* Machine Check Exception */ + "", /* CMPXCHG8 instruction */ + "", /* Onboard APIC */ + "", /* Reserved */ + "syscall", /* SYSCALL/SYSRET */ + "", /* Memory Type Range Registers */ + "", /* Page Global Enable */ + "", /* Machine Check Architecture */ + "", /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ + "", /* Page Attribute Table */ + "", /* 36-bit PSEs */ + "", /* Reserved */ + "mp", /* MP Capable */ + "nx", /* Execute Disable */ + "", /* Reserved */ + "mmxext", /* AMD MMX extensions */ + "", /* Multimedia Extensions */ + "", /* FXSAVE/FXRSTOR, CR4.OSFXSR */ + "fxsr_opt", /* FXSAVE/FXRSTOR optimizations */ + "pdpe1gb", /* "pdpe1gb" GB pages */ + "rdtscp", /* RDTSCP */ + "", /* Reserved */ + "lm", /* Long Mode (x86-64, 64-bit support) */ + "3dnowext", /* AMD 3DNow extensions */ + "3dnow" /* 3DNow */ + }; + print_features(edx, features, 32); + } + + // AMD-defined CPU features, CPUID level 0x80000001 (ECX), word 6 + { + std::string features[] = { "lahf_lm", /* LAHF/SAHF in long mode */ + "cmp_legacy", /* If yes HyperThreading not valid */ + "svm", /* Secure Virtual Machine */ + "extapic", /* Extended APIC space */ + "cr8_legacy", /* CR8 in 32-bit mode */ + "abm", /* Advanced bit manipulation */ + "sse4a", /* SSE-4A */ + "misalignsse", /* Misaligned SSE mode */ + "3dnowprefetch", /* 3DNow prefetch instructions */ + "osvw", /* OS Visible Workaround */ + "ibs", /* Instruction Based Sampling */ + "xop", /* extended AVX instructions */ + "skinit", /* SKINIT/STGI instructions */ + "wdt", /* Watchdog timer */ + "", /* Reserved */ + "lwp", /* Light Weight Profiling */ + "fma4", /* 4 operands MAC instructions */ + "tce", /* Translation Cache Extension */ + "", /* Reserved */ + "nodeid_msr", /* 
NodeId MSR */ + "", /* Reserved */ + "tbm", /* Trailing Bit Manipulations */ + "topoext", /* Topology extensions CPUID leafs */ + "perfctr_core", /* Core performance counter extensions */ + "perfctr_nb", /* NB performance counter extensions */ + "", /* Reserved */ + "bpext", /* Data breakpoint extension */ + "ptsc", /* Performance time-stamp counter */ + "perfctr_l2", /* Last Level Cache performance counter extensions */ + "mwaitx", /* MWAIT extension (MONITORX/MWAITX instructions) */ + "", /* Reserved */ + "" /* Reserved */ + + }; + print_features(ecx, features, 32); + } + } // EAX=0x80000001 + + if (eax_max >=0x80000007) { + // EAX=0x80000007: Extended Features + __asm__("mov $0x80000007 , %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags + __asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags + __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //extended feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags + + // AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 + { + std::string features[] = { "overflow_recov", /* MCA overflow recovery support */ + "succor", /* Uncorrectable error containment and recovery */ + "", /* Reserved */ + "smca", /* Scalable MCA */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "" /* Reserved */ + }; + print_features(ebx, features, 32); + } + } // EAX=0x80000007 + + if (eax_max >=0x80000008) { + // EAX=0x80000008: Extended 
Features + __asm__("mov $0x80000008 , %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags + __asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags + __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //extended feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags + + // AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 18 + { + std::string features[] = { "clzero", /* CLZERO instruction */ + "irperf", /* Instructions Retired Count */ + "xsaveerptr", /* Always save/restore FP error pointers */ + "", /* Reserved */ + "rdpru", /* Read processor register at user level */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "wbnoinvd", /* WBNOINVD instruction */ + "", /* Reserved */ + "", /* Reserved */ + "", /* "" Indirect Branch Prediction Barrier */ + "", /* Reserved */ + "", /* "" Indirect Branch Restricted Speculation */ + "", /* "" Single Thread Indirect Branch Predictors */ + "", /* Reserved */ + "", /* "" Single Thread Indirect Branch Predictors always-on preferred */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "amd_ppin", /* Protected Processor Inventory Number */ + "", /* "" Speculative Store Bypass Disable */ + "virt_ssbd", /* Virtualized Speculative Store Bypass Disable */ + "", /* "" Speculative Store Bypass is fixed in hardware. 
*/ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "" /* Reserved */ + }; + print_features(ebx, features, 32); + } + } // EAX=0x80000008 + + if (eax_max >=0x8000000a) { + // EAX=0x8000000a: Extended Features + __asm__("mov $0x8000000a , %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags + __asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags + __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //extended feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags + + // AMD-defined CPU features, CPUID level 0x8000000a (EDX), word 15 + { + std::string features[] = { "npt", /* Nested Page Table support */ + "lbrv", /* LBR Virtualization support */ + "svm_lock", /* "svm_lock" SVM locking MSR */ + "nrip_save", /* "nrip_save" SVM next_rip save */ + "tsc_scale", /* "tsc_scale" TSC scaling support */ + "vmcb_clean", /* "vmcb_clean" VMCB clean bits support */ + "flushbyasid", /* flush-by-ASID support */ + "decodeassists", /* Decode Assists support */ + "", /* Reserved */ + "", /* Reserved */ + "pausefilter", /* filtered pause intercept */ + "", /* Reserved */ + "pfthreshold", /* pause filter threshold */ + "avic", /* Virtual Interrupt Controller */ + "", /* Reserved */ + "v_vmsave_vmload", /* Virtual VMSAVE VMLOAD */ + "vgif", /* Virtual GIF */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "v_spec_ctrl", /* Virtual SPEC_CTRL */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* "" SVME addr check */ + "", /* Reserved */ + "", /* Reserved */ + "" /* Reserved */ + }; + print_features(edx, features, 32); + } + } // EAX=0x8000000a + + if (eax_max >=0x8000001f) { + // EAX=0x8000001f: Extended Features + __asm__("mov $0x8000001f , %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags + 
__asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags + __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //extended feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags + + // AMD-defined CPU features, CPUID level 0x8000001f (EAX), word 19 + { + std::string features[] = { "sme", /* AMD Secure Memory Encryption */ + "sev", /* AMD Secure Encrypted Virtualization */ + "", /* "" VM Page Flush MSR is supported */ + "sev_es", /* AMD Secure Encrypted Virtualization - Encrypted State */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* "" AMD hardware-enforced cache coherency */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "" /* Reserved */ + }; + print_features(eax, features, 32); + } + } // EAX=0x8000001f + + printf("\n"); +} + +int main(){ + getVendorID(); + getProcInfo(); + getFeatures(); + return 0; +} diff --git a/doc/Examples.dox b/doc/Examples.dox index 3bef2d1b46..3bb0bd1137 100644 --- a/doc/Examples.dox +++ b/doc/Examples.dox @@ -34,6 +34,8 @@ In the gismo/examples sub-directory we find: - \subpage heatEquation_example +- \subpage heatEquation2_example + - \subpage inputOutput_example - \subpage gsInterpolateMap @@ -54,6 +56,8 @@ In the gismo/examples sub-directory we find: - \subpage multiGrid_example +- \subpage performance_benchmark + - \subpage pMultiGrid_example @cond DOXYGEN_EXCLUDE diff --git a/doc/figs/performance_benchmark_memcopy1.pdf b/doc/figs/performance_benchmark_memcopy1.pdf new file mode 100644 index 0000000000..6084c06fa1 Binary files /dev/null and 
b/doc/figs/performance_benchmark_memcopy1.pdf differ diff --git a/doc/figs/performance_benchmark_memcopy2.pdf b/doc/figs/performance_benchmark_memcopy2.pdf new file mode 100644 index 0000000000..ecc4473ebb Binary files /dev/null and b/doc/figs/performance_benchmark_memcopy2.pdf differ diff --git a/doc/figs/performance_benchmark_memcopy3.pdf b/doc/figs/performance_benchmark_memcopy3.pdf new file mode 100644 index 0000000000..438e611355 Binary files /dev/null and b/doc/figs/performance_benchmark_memcopy3.pdf differ diff --git a/doc/heatEquation2_example.dox b/doc/heatEquation2_example.dox new file mode 100644 index 0000000000..55a2c90c70 --- /dev/null +++ b/doc/heatEquation2_example.dox @@ -0,0 +1,14 @@ +namespace gismo { + +/** + +\page heatEquation2_example heatEquation2_example.cpp + +Here is the full file \c examples/heatEquation2_example.cpp. Clicking on a function +or class name will lead you to its reference documentation. + +\include heatEquation2_example.cpp + +*/ + +} \ No newline at end of file diff --git a/doc/performance_benchmark.dox b/doc/performance_benchmark.dox new file mode 100644 index 0000000000..9867edecf7 --- /dev/null +++ b/doc/performance_benchmark.dox @@ -0,0 +1,239 @@ +namespace gismo { + +/** + +\page performance_benchmark performance_benchmark.cpp + +The aim of the performance benchmark is to provide a ready-to-run +application to measure the computational performance of G+Smo and its +underlying libraries on your computer with the specific compiler +configuration. It implements a suite of benchmarks that measure the +performance of certain low-level operations such as the computation of +the dot-product between two vectors or the addition of two vectors +(AXPY) as well as high-order operations such as the assembly of system +matrices. 
The performance benchmark is particularly useful when you +run G+Smo on a new computer architecture (e.g., Apple Silicon M1, IBM +Power10, or Fujitsu's A64FX) or have updated some of the underlying +libraries (e.g., Eigen) and want to see if the changes have improved +the performance. + +Though the performance benchmark can be run in sequential mode it is +recommended to configure it with `GISMO_WITH_OPENMP=ON` enabled to +take full advantage of G+Smo's OpenMP parallelization. + +A list of all available benchmarks can be printed by running +`./bin/performance_benchmark --list` which yields + +~~~~~text + + G+Smo + Geometry plus Simulation modules + version 21.12.0 +Compiled by AppleClang 13.0.0.13000029 (C++ 201103, libc++ 12000, eigen 3.4.0) +Running on Apple M1 (memory 8 GB) with real_t:double, index_t:int, short_t:int +web: http://github.com/gismo + +The following benchmarks are available: +#01: Memory copy (native C array) +#02: Memory copy (gsVector) +#03: Dot product (native C array) +[...] +~~~~~ + +\section PerformanceBenchmarkRunningTheBenchmark Running the performance benchmark + +To run the full performance benchmark with the default configuration simply type + +~~~~bash +$> ./bin/performance_benchmark --all -o benchmark.tex +[...] +[memcopyCarray] Memory copy (native C array) +100(100)...400(66)...1600(44)...6400(29)...25600(19)...102400(12)... ...1677721600(1)[failed!]... +[memcopyEigen] Memory copy (gsVector) +100(100)...400(66)...1600(44)...6400(29)...25600(19)...102400(12)... ...1677721600(1)[failed!]... +[...] +~~~~ + +By using the `-o` flag the detailed output is written to the file +`benchmark.tex`. + + +In default mode, the performance benchmark runs each benchmark for a +sequence of increasing problem sizes starting at 100 and increasing +the problem size by a factor of 4 until the total system memory is +exceeded. The latter is indicated in the output above by the trailing +`[failed!]`.
We will explain below how this case is handled by a +`memory_safeguard` mechanism that detects insufficient memory without +trying to allocate the memory in the first place. The value in parentheses, +e.g., `400(66)`, indicates the number of runs the particular test is +executed, here 66 times. For very small problem sizes it is advisable +to run the same test multiple times and average the result over the +number of runs to reduce the influence of inaccurate time +measurements. + +The output file `benchmark.tex` is transformed into a PDF file using +the command \c pdflatex (see https://www.latex-project.org): + +\image html figs/performance_benchmark_memcopy1.pdf + +Each group represents a different problem size. By default, each +problem size is run with 1, 2, 4, ..., `omp_get_max_threads()` OpenMP +threads, which is represented by the different bars. The above plot +shows the speedup achieved with multiple OpenMP threads for moderate +problem sizes (e.g., 1 and 6 MB) and the saturation of the memory +subsystem around 60 GB/s for problem sizes larger than 100 MB. The +figure below shows the same benchmark but implemented with the \ref +gsVector class instead of native C arrays. + +\image html figs/performance_benchmark_memcopy2.pdf + +When running all benchmarks (`--all` flag) the output file will +contain additional plots that compare benchmarks of the same type, +e.g., memory copy of C-style arrays and \ref gsVector. + +\image html figs/performance_benchmark_memcopy3.pdf + +Since the values can differ by orders of magnitude it might be useful +to replace `\begin{axis}...\end{axis}` by +`\begin{semilogyaxis}...\end{semilogyaxis}` in the output file +`benchmark.tex` before executing the `pdflatex` command to produce +plots with logarithmic y-axis. + +A list of benchmark results for different computer architectures, +compilers, and operating systems is maintained at the G+Smo Wiki.
+ +\section PerformanceBenchmarkCustomizingTheBenchmark Customizing the performance benchmark + +The performance benchmark can be customized using various command-line +arguments. Individual benchmarks can be selected using the `-b` flag, +e.g., + +~~~~bash +$> ./bin/performance_benchmark -b 1 -b 4 -o benchmark.tex +~~~~ + +will run *benchmark #1* (memory copy (native C array)) and *benchmark +#4* (dot-product (\ref gsVector)). + +The problem sizes can be defined by either providing a list of values, e.g., + +~~~~bash +$> ./bin/performance_benchmark -b 1 -v 100 -v 500 -v 1000 -o benchmark.tex +[...] +[memcopyCarray] Memory copy (native C array) +100(100)...500(66)...1000(44)... +~~~~ + +or by providing the smallest (`--vsizemin`) and largest +(`--vsizemax`) problem size and, optionally, the factor (`-V`/`--vsizefactor`) by +which the problem size should be increased, e.g., + +~~~~bash +$> ./bin/performance_benchmark -b 1 --vsizemin 100 --vsizemax 1000 -V 1.2 -o benchmark.tex +[...] +[memcopyCarray] Memory copy (native C array) +100(100)...120(66)...144(44)...172(29)...206(19)...247(12)...296(8)...355(5)...426(3)...511(2)...613(1)...735(1)...882(1)... +~~~~ + +Here, the `vsize`-family of flags refers to all vector-type +benchmarks. Similarly, the `msize`-family of flags (`--msizemin`, +`--msizemax`, `-M`/`--msizefactor`) refers to all matrix-type +benchmarks. + +The sequence of runs can be specified in the same way, e.g., + +~~~~bash +$> ./bin/performance_benchmark -b 1 --vsizemin 100 --vsizemax 1000 -V 1.2 --runsmin 4 --runsmax 80 -R 1.3 -o benchmark.tex +[...] +[memcopyCarray] Memory copy (native C array) +100(80)...120(61)...144(46)...172(35)...206(26)...247(20)...296(15)...355(11)...426(8)...511(6)...613(4)...735(4)...882(4)... +~~~~ + +Here, the smallest problem size is executed 80 times (`--runsmax`) and +for each larger problem instance, the number of runs is successively +reduced by the factor 1.3 (`-R`/`--runsfactor`) but not below 4 (`--runsmin`). 
+ +Finally, the number of OpenMP threads that should be used can be +specified globally by providing an explicit list, e.g., + +~~~~bash +$> ./bin/performance_benchmark -t 1 -t 4 -t 8 -o benchmark.tex +~~~~ + +runs all benchmarks with 1, 4, and 8 OpenMP threads. + + +Instead of writing the detailed output to a LaTeX file it is also possible to create an XML file using the `-o` flag with a filename ending with `.xml`, e.g., + +~~~~bash +$> ./bin/performance_benchmark -b 1 -o benchmark.xml +~~~~ + +The XML file can be opened as shown in the code snippet below + +~~~~cpp +std::string fn="benchmark.xml"; +gsBenchmark benchmark; +gsFileData<> fd(fn); +fd.getId(0, benchmark); +gsInfo << benchmark; +~~~~ + +This will write the benchmark output to \ref gsInfo. + +If the `-o` flag is omitted, the output is written to +\ref gsInfo: + +~~~~text +[memcopyCarray] Memory copy (native C array) + memsize | 4x (#Threads : Bandwidth in GB/s) + 1 KB | 1 : 6.10e+00 2 : 1.17e+00 4 : 8.77e-01 8 : 3.17e-01 + 6 KB | 1 : 1.17e+01 2 : 9.47e+00 4 : 4.70e+00 8 : 2.62e+00 + 25 KB | 1 : 3.15e+01 2 : 8.28e+00 4 : 1.35e+01 8 : 1.08e+01 + 100 KB | 1 : 4.54e+01 2 : 4.48e+01 4 : 5.06e+01 8 : 6.72e+00 + 400 KB | 1 : 3.48e+01 2 : 5.94e+01 4 : 7.62e+01 8 : 8.33e+01 + 1 MB | 1 : 2.96e+01 2 : 6.92e+01 4 : 1.17e+02 8 : 6.41e+01 + 6 MB | 1 : 2.34e+01 2 : 8.51e+01 4 : 1.17e+02 8 : 1.12e+02 + 25 MB | 1 : 2.78e+01 2 : 5.67e+01 4 : 7.02e+01 8 : 6.77e+01 + 100 MB | 1 : 1.51e+01 2 : 5.85e+01 4 : 5.72e+01 8 : 4.59e+01 + 400 MB | 1 : 1.51e+01 2 : 5.95e+01 4 : 5.69e+01 8 : 4.59e+01 + 1 GB | 1 : 5.48e+00 2 : 7.31e+00 4 : 2.05e+01 8 : 1.76e+01 + 6 GB | 1 : 4.43e+00 2 : 5.71e+00 4 : 6.26e+00 8 : 4.03e+00 +~~~~ + +The above output shows the bandwidth in GB/s of the memory copy +benchmark for different array sizes (rows) and 1, 2, 4, and 8 OpenMP +threads (columns).
+ +\section PerformanceBenchmarkImplementingAdditionalBenchmarks Implementing additional benchmarks + +To implement additional benchmarks, copy one of the existing ones and +adjust the constructors and member functions accordingly: + +\snippet performance_benchmark.cpp Implement benchmark eigen dense matrix-vector multiplication + +Make sure that all tasks that should not be included in the time +measurement are performed in the constructor. Furthermore, make sure +to instantiate the `memory_safeguard` object `_msg(n)` first as it +will let the benchmark fail gracefully if the estimated amount of +memory exceeds the system's total memory. The implementation of the +`memory_safeguard` class is given below: + +\snippet performance_benchmark.cpp Implement memory safeguard + +If you are unsure about the exact memory consumption you can return an +upper bound, e.g., expected memory consumption + 10%, in the +benchmark's `size(index_t n)` function. + +\section PerformanceBenchmarkAnnotatedSourceFile Annotated source file + +Here is the full file \c examples/performance_benchmark.cpp. Clicking +on a function or class name will lead you to its reference +documentation. + +\include performance_benchmark.cpp + +*/ + +} diff --git a/examples/heatEquation2_example.cpp b/examples/heatEquation2_example.cpp new file mode 100644 index 0000000000..05133d17fa --- /dev/null +++ b/examples/heatEquation2_example.cpp @@ -0,0 +1,138 @@ +/** @file heatEquation2_example.cpp + + @brief Solves the heat equation using time-stepping + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): S. Moore, A.
Mantzaflaris +*/ + +#include + + +using namespace gismo; + +int main(int argc, char *argv[]) +{ + gsCmdLine cmd("Testing the heat equation."); + try { cmd.getValues(argc,argv); } catch (int rv) { return rv; } + + // Source function + gsConstantFunction<> f(1,2); + gsInfo << "Source function is: " << f << "\n"; + + // Define Geometry, must be a gsMultiPatch object + gsMultiPatch<> patches(*gsNurbsCreator<>::BSplineSquareDeg(2)); + patches.computeTopology(); + + // Boundary conditions + gsBoundaryConditions<> bcInfo; + gsConstantFunction<> g_N(1,2); // Neumann + gsConstantFunction<> g_D(0,2); // Dirichlet + bcInfo.setGeoMap(patches); + bcInfo.addCondition(0, boundary::west, condition_type::neumann , &g_N); + bcInfo.addCondition(0, boundary::east, condition_type::dirichlet, &g_D); + bcInfo.addCondition(0, boundary::north, condition_type::dirichlet, &g_D); + bcInfo.addCondition(0, boundary::south, condition_type::dirichlet, &g_D); + gsInfo<<"Boundary conditions:\n"<< bcInfo <<"\n"; + + gsMultiBasis<> bases( patches ); + + // Number for h-refinement of the computational (trial/test) basis. + int numRefine = 2; + + // Number for p-refinement of the computational (trial/test) basis. 
+ int numElevate = 0; + + // Elevate and p-refine the basis to order k + numElevate + // where k is the highest degree in the bases + if ( numElevate > -1 ) + { + // Find maximum degree with respect to all the variables + int tmp = bases.maxDegree(0); + for (short_t j = 1; j < patches.parDim(); ++j ) + if ( tmp < bases.maxDegree(j) ) + tmp = bases.maxDegree(j); + + // Elevate all degrees uniformly + tmp += numElevate; + bases.setDegree(tmp); + } + + // h-refine the basis + for (int i = 0; i < numRefine; ++i) + bases.uniformRefine(); + + gsInfo << "Patches: "<< patches.nPatches() <<", degree: "<< bases.minCwiseDegree() <<"\n"; + + real_t theta = 0.5; + real_t endTime = 0.1; + int numSteps = 40; + + real_t Dt = endTime / numSteps ; + + const std::string baseName("heat_eq_solution"); + gsParaviewCollection collection(baseName); + + // Generate system matrix and load vector + gsInfo << "Assembling mass and stiffness...\n"; + + gsExprAssembler<> K(1,1); + gsExprAssembler<> M(1,1); + + gsInfo<<"Active options:\n"<< K.options() <<"\n"; + gsInfo<<"Active options:\n"<< M.options() <<"\n"; + + K.setIntegrationElements(bases); + M.setIntegrationElements(bases); + + gsExprEvaluator<> evK(K); + gsExprEvaluator<> evM(M); + + // Set the geometry map + auto G_K = K.getMap(patches); + auto G_M = M.getMap(patches); + + // Set the discretization space + auto u_K = K.getSpace(bases); + auto u_M = M.getSpace(bases); + + u_K.setup(bcInfo, dirichlet::interpolation, 0); + u_M.setup(bcInfo, dirichlet::interpolation, 0); + + // Set the source term + auto ff_K = K.getCoeff(f, G_K); + auto ff_M = M.getCoeff(f, G_M); + + K.initSystem(); + M.initSystem(); + K.assemble( igrad(u_K, G_K) * igrad(u_K, G_K).tr() * meas(G_K), u_K * ff_K * meas(G_K) ); + M.assemble( u_M * u_M.tr() * meas(G_M), u_M * ff_M * meas(G_M) ); + + // Enforce Neumann conditions to right-hand side + auto g_Neumann = K.getBdrFunction(G_K); + K.assembleBdr(bcInfo.get("Neumann"), u_K * g_Neumann.val() * nv(G_K).norm() ); + + // A 
Conjugate Gradient linear solver with a diagonal (Jacobi) preconditionner + gsSparseSolver<>::CGDiagonal solver; + gsMatrix<> Sol(M.numDofs(), 1); + + for ( int i = 1; i<=numSteps; ++i) // for all timesteps + { + // Compute the system for the timestep i (rhs is assumed constant wrt time) + gsInfo << "Solving timestep " << i*Dt << ".\n"; + Sol = solver.compute(M.matrix() + + Dt*theta*K.matrix() + ).solve(Dt*K.rhs() + + (M.matrix()-Dt*(1.0-theta)*K.matrix())*Sol); + } + + gsInfo << "Norm of the solution" << std::endl; + gsInfo << Sol.norm() << std::endl; + + return EXIT_SUCCESS; +} diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp new file mode 100644 index 0000000000..cdf7017e84 --- /dev/null +++ b/examples/performance_benchmark.cpp @@ -0,0 +1,1313 @@ +/** @file performance_benchmark.cpp + + @brief G+Smo performance benchmark + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +//! [Include namespace] +#include + +using namespace gismo; +//! [Include namespace] + +//! [Implement make_vector] +template +std::vector make_vector(T value, std::size_t size) +{ + std::vector v; + for (std::size_t i=0; i +class memory_safeguard +{ +public: + template + memory_safeguard(Args... args) + { + if (T::size(args...) > gsSysInfo::getMemoryInBytes()) + GISMO_ERROR("Insufficient memory"); + } +}; +//! [Implement memory safeguard] + +//! 
[Implement benchmark native C array memcopy] +/** + * Benchmark: native C array memcopy + */ +template +class benchmark_c_array_memcopy +{ +private: + memory_safeguard _msg; + index_t n; + T *m_x, *m_y; + +public: + benchmark_c_array_memcopy(index_t n) + : _msg(n), n(n), m_x(new T[n]), m_y(new T[n]) + { +#pragma omp parallel for simd schedule(static) + for (index_t i=0; i +class benchmark_c_array_dotproduct +{ +private: + memory_safeguard _msg; + const index_t n; + T *m_x, *m_y; + +public: + benchmark_c_array_dotproduct(index_t n) + : _msg(n), n(n), m_x(new T[n]), m_y(new T[n]) + { +#pragma omp parallel for simd schedule(static) + for (index_t i=0; i +class benchmark_c_array_axpy +{ +private: + memory_safeguard _msg; + const index_t n; + T *m_x, *m_y, *m_z; + +public: + benchmark_c_array_axpy(index_t n) + : _msg(n), n(n), m_x(new T[n]), m_y(new T[n]), m_z(new T[n]) + { +#pragma omp parallel for simd schedule(static) + for (index_t i=0; i +class benchmark_c_array_dense_matmul +{ +private: + memory_safeguard _msg; + const index_t n; + T *m_A, *m_x, *m_y; + +public: + benchmark_c_array_dense_matmul(index_t n) + : _msg(n), n(n), m_A(new T[n*n]), m_x(new T[n]), m_y(new T[n]) + { +#pragma omp parallel for simd schedule(static) + for (index_t i=0; i +class benchmark_eigen_memcopy +{ +private: + memory_safeguard _msg; + const index_t n; + gsVector x,y; + +public: + benchmark_eigen_memcopy(index_t n) + : _msg(n), n(n), x(n), y(n) + { + x.fill((T)1.0); + } + + uint64_t operator()() + { + y.noalias() = x; + + // Needed to make sure the compiler does not eliminate this code block + T tmp = y[n-1]; + GISMO_UNUSED(tmp); + + return size(); + } + + constexpr uint64_t size() const + { + return size(n); + } + + static constexpr uint64_t size(index_t n) + { + return (2 * uint64_t(n) * sizeof(T)); + } + + static std::string descr() + { + return "Memory copy (gsVector)"; + } + + static std::string label() + { + return "memcopyEigen"; + } + + static constexpr gismo::metric metric() + { 
+ return gismo::metric::bandwidth_gb_sec; + } +}; +//! [Implement benchmark eigen vector memcopy] + +//! [Implement benchmark eigen vector dot-product] +/** + * Benchmark: Eigen vector dot-product + */ +template +class benchmark_eigen_dotproduct +{ +private: + memory_safeguard _msg; + const index_t n; + gsVector x, y; + +public: + benchmark_eigen_dotproduct(index_t n) + : _msg(n), n(n), x(n), y(n) + { + x.fill((T)1.0); + y.fill((T)1.0); + } + + uint64_t operator()() + { + volatile T sum = y.dot(x); + GISMO_UNUSED(sum); + + return size(); + } + + constexpr uint64_t size() const + { + return size(n); + } + + static constexpr uint64_t size(index_t n) + { + return (2 * uint64_t(n) * sizeof(T)); + } + + static std::string descr() + { + return "Dot product (gsVector)"; + } + + static std::string label() + { + return "dotproductEigen"; + } + + static constexpr gismo::metric metric() + { + return gismo::metric::bandwidth_gb_sec; + } +}; +//! [Implement benchmark eigen vector dot-product] + +//! [Implement benchmark eigen vector AXPY] +/** + * Benchmark: Eigen vector AXPY + */ +template +class benchmark_eigen_axpy +{ +private: + memory_safeguard _msg; + const index_t n; + gsVector x, y, z; + +public: + benchmark_eigen_axpy(index_t n) + : _msg(n), n(n), x(n), y(n), z(n) + { + x.fill((T)1.0); + y.fill((T)1.0); + } + + uint64_t operator()() + { + z.noalias() = (T)3.141*x + y; + + // Needed to make sure the compiler does not eliminate this code block + T tmp = z[n-1]; + GISMO_UNUSED(tmp); + + return size(); + } + + constexpr uint64_t size() const + { + return size(n); + } + + static constexpr uint64_t size(index_t n) + { + return (3 * uint64_t(n) * sizeof(T)); + } + + static std::string descr() + { + return "AXPY (gsVector)"; + } + + static std::string label() + { + return "axpyEigen"; + } + + static constexpr gismo::metric metric() + { + return gismo::metric::bandwidth_gb_sec; + } +}; +//! [Implement benchmark eigen vector AXPY] + +//! 
[Implement benchmark eigen dense matrix-vector multiplication] +/** + * Benchmark: Eigen dense matrix-vector multiplication + */ +template +class benchmark_eigen_dense_matmul +{ +private: + // The memory safeguard will ensure that the benchmark fails + // gracefully (i.e. without trying to actually allocate memory) if + // the estimated amount of memory exceeds the system's total memory + memory_safeguard _msg; + const index_t n; + gsMatrix A; + gsVector x, y; + +public: + // All tasks that should not be included in the time measurement + // must be performed in the constructor. Make sure to instantiate + // _msg(n) first as it will let the benchmark fail gracefully if the + // estimated amount of memory exceeds the system's total memory + benchmark_eigen_dense_matmul(index_t n) + : _msg(n), n(n), A(n,n), x(n), y(n) + { + A.fill(1.0); + x.fill(1.0); + } + + uint64_t operator()() + { + y.noalias() = A*x; + + // Needed to make sure the compiler does not eliminate this code block + T tmp = y[n-1]; + GISMO_UNUSED(tmp); + + return size(); + } + + constexpr uint64_t size() const + { + return size(n); + } + + // This function will be called by the memory_safeguard to determine + // whether the benchmark will exceed the system's total memory + static constexpr uint64_t size(index_t n) + { + return (2 * uint64_t(n) * uint64_t(n) + uint64_t(n)) * sizeof(T); + } + + static std::string descr() + { + return "Dense matrix-vector multiplication (gsMatrix/gsVector)"; + } + + static std::string label() + { + return "densematmulEigen"; + } + + static constexpr gismo::metric metric() + { + return gismo::metric::bandwidth_gb_sec; + } +}; +//! [Implement benchmark eigen dense matrix-vector multiplication] + +//!
[Implement benchmark Poisson 2d visitor] +/** + * Benchmark: Visitor-based Poisson 2d + */ +template +class benchmark_poisson2d_visitor +{ +private: + memory_safeguard _msg; + int numPatches, numRefine, degree; + gsMultiPatch geo; + gsMultiBasis bases; + gsConstantFunction f; + gsBoundaryConditions bc; + gsPoissonAssembler assembler; + +public: + template + benchmark_poisson2d_visitor(std::tuple args) + : benchmark_poisson2d_visitor(std::get<0>(args), std::get<1>(args), std::get<2>(args)) + {} + + benchmark_poisson2d_visitor(int numPatches, int numRefine=0, int degree=1) + : _msg(numPatches, numRefine, degree), + numPatches(numPatches), numRefine(numRefine), degree(degree), + geo(gsNurbsCreator<>::BSplineSquareGrid(numPatches, numPatches, 1.0)), + bases(geo), f(0.0, 0.0, 2) + { + // h-refine each basis + for (int i = 0; i < numRefine; ++i) + bases.uniformRefine(); + + // k-refinement (set degree) + for (std::size_t i = 0; i < bases.nBases(); ++ i) + bases[i].setDegreePreservingMultiplicity(degree); + + // create assembler + assembler = gsPoissonAssembler(geo, bases, bc, f, dirichlet::nitsche, iFace::glue); + } + + uint64_t operator()() + { + assembler.assemble(); + return sizeof(T) * (assembler.matrix().nonZeros() + assembler.rhs().rows()); + } + + constexpr uint64_t size() const + { + return size(numPatches, numRefine, degree); + } + + static constexpr uint64_t size(index_t numPatches, index_t numRefine, index_t degree) + { + // Estimated memory + // system matrix : 1.33 * ndofs * (2*p+1)^2 + // r.h.s. 
vector : ndofs + // + // The factor 1.33 is used because Eigen shows better performance + // if 33% more memory is allocated during the step-by-step assembly + return sizeof(T) * ( 1.33 * math::pow(2*degree+1,2) + 1 ) * + (/* numPatches^2 * DOFs per patch */ + math::pow(numPatches,2) * math::pow((1< +class benchmark_poisson3d_visitor +{ +private: + memory_safeguard _msg; + int numPatches, numRefine, degree; + gsMultiPatch geo; + gsMultiBasis bases; + gsConstantFunction f; + gsBoundaryConditions bc; + gsPoissonAssembler assembler; + +public: + template + benchmark_poisson3d_visitor(std::tuple args) + : benchmark_poisson3d_visitor(std::get<0>(args), std::get<1>(args), std::get<2>(args)) + {} + + benchmark_poisson3d_visitor(int numPatches, int numRefine=0, int degree=1) + : _msg(numPatches, numRefine, degree), + numPatches(numPatches), numRefine(numRefine), degree(degree), + geo(gsNurbsCreator<>::BSplineCubeGrid(numPatches, numPatches, numPatches, 1.0)), + bases(geo), f(0.0, 0.0, 0.0, 3) + { + // h-refine each basis + for (int i = 0; i < numRefine; ++i) + bases.uniformRefine(); + + // k-refinement (set degree) + for (std::size_t i = 0; i < bases.nBases(); ++ i) + bases[i].setDegreePreservingMultiplicity(degree); + + // create assembler + assembler = gsPoissonAssembler(geo, bases, bc, f, dirichlet::nitsche, iFace::glue); + } + + uint64_t operator()() + { + assembler.assemble(); + return sizeof(T) * (assembler.matrix().nonZeros() + assembler.rhs().rows()); + } + + constexpr uint64_t size() const + { + return size(numPatches, numRefine, degree); + } + + static constexpr uint64_t size(index_t numPatches, index_t numRefine, index_t degree) + { + // Estimated memory + // system matrix : 1.33 * ndofs * (2*p+1)^3 + // r.h.s. 
vector : ndofs + // + // The factor 1.33 is used because Eigen shows better performance + // if 33% more memory is allocated during the step-by-step assembly + return sizeof(T) * 1.33 * (numPatches * ((1< +class benchmark_poisson2d_expression_assembler +{ +private: + memory_safeguard _msg; + int numPatches, numRefine, degree; + gsMultiPatch geo; + gsMultiBasis bases; + gsBoundaryConditions bc; + + gsExprAssembler A; + typename gsExprAssembler<>::geometryMap G; + typename gsExprAssembler<>::space u; + + gsFunctionExpr f; + expr::gsComposition ff; + +public: + template + benchmark_poisson2d_expression_assembler(std::tuple args) + : benchmark_poisson2d_expression_assembler(std::get<0>(args), std::get<1>(args), std::get<2>(args)) + {} + + benchmark_poisson2d_expression_assembler(int numPatches, int numRefine=0, int degree=1) + : _msg(numPatches, numRefine, degree), + numPatches(numPatches), numRefine(numRefine), degree(degree), + geo(gsNurbsCreator<>::BSplineSquareGrid(numPatches, numPatches, 1.0)), + bases(geo, true), A(1,1), G(A.getMap(geo)), u(A.getSpace(bases)), + f("0.0", 2), ff(A.getCoeff(f, G)) + { + // h-refine each basis + for (int i = 0; i < numRefine; ++i) + bases.uniformRefine(); + + // k-refinement (set degree) + for (std::size_t i = 0; i < bases.nBases(); ++ i) + bases[i].setDegreePreservingMultiplicity(degree); + + // set the geometry map to boundary conditions + bc.setGeoMap(geo); + + // setup boundary conditions + u.setup(bc, dirichlet::l2Projection, 0); + + // set elements used for numerical integration + A.setIntegrationElements(bases); + + // initialize the system + A.initSystem(); + } + + uint64_t operator()() + { + // Compute the system matrix and right-hand side + A.assemble( + igrad(u, G) * igrad(u, G).tr() * meas(G) //matrix + , + u * ff * meas(G) //rhs vector + ); + + return sizeof(T) * (A.matrix().nonZeros() + A.rhs().rows()); + } + + constexpr uint64_t size() const + { + return size(numPatches, numRefine, degree); + } + + static constexpr 
uint64_t size(index_t numPatches, index_t numRefine, index_t degree) + { + // Estimated memory + // system matrix : 1.33 * ndofs * (2*p+1)^2 + // r.h.s. vector : ndofs + // + // The factor 1.33 is used because Eigen shows better performance + // if 33% more memory is allocated during the step-by-step assembly + return sizeof(T) * ( 1.33 * math::pow(2*degree+1,2) + 1 ) * + (/* numPatches^2 * DOFs per patch */ + math::pow(numPatches,2) * math::pow((1< +class benchmark_poisson3d_expression_assembler +{ +private: + memory_safeguard _msg; + int numPatches, numRefine, degree; + gsMultiPatch geo; + gsMultiBasis bases; + gsBoundaryConditions bc; + + gsExprAssembler A; + typename gsExprAssembler<>::geometryMap G; + typename gsExprAssembler<>::space u; + + gsFunctionExpr f; + expr::gsComposition ff; + +public: + template + benchmark_poisson3d_expression_assembler(std::tuple args) + : benchmark_poisson3d_expression_assembler(std::get<0>(args), std::get<1>(args), std::get<2>(args)) + {} + + benchmark_poisson3d_expression_assembler(int numPatches, int numRefine=0, int degree=1) + : _msg(numPatches, numRefine, degree), + numPatches(numPatches), numRefine(numRefine), degree(degree), + geo(gsNurbsCreator<>::BSplineCubeGrid(numPatches, numPatches, numPatches, 1.0)), + bases(geo, true), A(1,1), G(A.getMap(geo)), u(A.getSpace(bases)), + f("0.0", 3), ff(A.getCoeff(f, G)) + { + // h-refine each basis + for (int i = 0; i < numRefine; ++i) + bases.uniformRefine(); + + // k-refinement (set degree) + for (std::size_t i = 0; i < bases.nBases(); ++ i) + bases[i].setDegreePreservingMultiplicity(degree); + + // set the geometry map to boundary conditions + bc.setGeoMap(geo); + + // setup boundary conditions + u.setup(bc, dirichlet::l2Projection, 0); + + // set elements used for numerical integration + A.setIntegrationElements(bases); + + // initialize the system + A.initSystem(); + } + + uint64_t operator()() + { + // Compute the system matrix and right-hand side + A.assemble( + igrad(u, G) * 
igrad(u, G).tr() * meas(G) //matrix + , + u * ff * meas(G) //rhs vector + ); + + return sizeof(T) * (A.matrix().nonZeros() + A.rhs().rows()); + } + + constexpr uint64_t size() const + { + return size(numPatches, numRefine, degree); + } + + static constexpr uint64_t size(index_t numPatches, index_t numRefine, index_t degree) + { + // Estimated memory + // system matrix : 1.33 * ndofs * (2*p+1)^3 + // r.h.s. vector : ndofs + // + // The factor 1.33 is used because Eigen shows better performance + // if 33% more memory is allocated during the step-by-step assembly + return sizeof(T) * 1.33 * (numPatches * ((1< benchmarks, msizes, nruns, nthreads, patches, subdivides, vsizes; + index_t msizemin = 10; + index_t nrunsmax = 100; + index_t nrunsmin = 1; + index_t patchesmax = 128; + index_t patchesmin = 1; + index_t subdividemax = 10; + index_t subdividemin = 0; + index_t vsizemin = 100; + real_t patchesfactor = 2; + real_t msizefactor = 2; + real_t nrunsfactor = 1.5; + real_t vsizefactor = 4; + index_t msizemax = (index_t) math::min((real_t)std::numeric_limits::max(), + std::sqrt((real_t)(0.8) * sizeof(real_t)*gsSysInfo::getMemoryInBytes())); + index_t vsizemax = (index_t) math::min((real_t)std::numeric_limits::max(), + (real_t)(0.8) * sizeof(real_t)*gsSysInfo::getMemoryInBytes()); + + gsCmdLine cmd("G+Smo performance benchmark."); + cmd.printVersion(); + + cmd.addReal("M", "msizefactor", "Growth factor for the sequence of msizes (only used if '-m' is not given)", msizefactor); + cmd.addReal("P", "patchesfactor", "Growth factor for the sequence of patches (only used if '-p' is not given)", patchesfactor); + cmd.addReal("R", "runsfactor", "Growth factor for the sequence of runs (only used if '-r' is not given)", nrunsfactor); + cmd.addReal("V", "vsizefactor", "Growth factor for the sequence of vsizes (only used if '-v' is not given)", vsizefactor); + cmd.addInt("", "msizemax", "Maximum number of unknowns in matrix/vector benchmarks (only used if '-m' is not given)", 
msizemax);
+  // Bounds of the auto-generated sequences. Each bound is only used when
+  // the corresponding list option (-m, -p, -r, -s or -v) is not given.
+  cmd.addInt("", "msizemin", "Minimum number of unknowns in matrix/vector benchmarks (only used if '-m' is not given)", msizemin);
+  cmd.addInt("", "patchesmax", "Maximum number of patches in assembly benchmarks (only used if '-p' is not given)", patchesmax);
+  cmd.addInt("", "patchesmin", "Minimum number of patches in assembly benchmarks (only used if '-p' is not given)", patchesmin);
+  cmd.addInt("", "runsmax", "Maximum number of runs (only used if '-r' is not given)", nrunsmax);
+  cmd.addInt("", "runsmin", "Minimum number of runs (only used if '-r' is not given)", nrunsmin);
+  // The subdivision list is set with '-s' ('-r' is the list of runs)
+  cmd.addInt("", "subdividemax", "Maximum number of subdivisions (h-refinement) in assembly benchmarks (only used if '-s' is not given)", subdividemax);
+  cmd.addInt("", "subdividemin", "Minimum number of subdivisions (h-refinement) in assembly benchmarks (only used if '-s' is not given)", subdividemin);
+  cmd.addInt("", "vsizemax", "Maximum number of unknowns in vector benchmarks (only used if '-v' is not given)", vsizemax);
+  cmd.addInt("", "vsizemin", "Minimum number of unknowns in vector benchmarks (only used if '-v' is not given)", vsizemin);
+  // Explicit value lists for the individual benchmark parameters
+  cmd.addMultiInt("b", "benchmarks", "List of benchmarks to be run", benchmarks);
+  cmd.addMultiInt("m", "msizes", "Number of unknowns in matrix/vector benchmarks (auto-generated if not given)", msizes);
+  cmd.addMultiInt("p", "patches", "Number of patches in assembly benchmarks (auto-generated if not given)", patches);
+  cmd.addMultiInt("r", "runs", "Number of runs over which the results are averaged (auto-generated if not given)", nruns);
+  cmd.addMultiInt("s", "subdivide", "Number of subdivisions (h-refinement) in assembly benchmarks (auto-generated if not given)", subdivides);
+  cmd.addMultiInt("t", "threads", "Number of OpenMP threads to be used for the benchmark (auto-generated if not given)", nthreads);
+  cmd.addMultiInt("v", "vsizes", "Number of unknowns in vector benchmarks (auto-generated if not given)", vsizes);
+ 
cmd.addString("o", "output", "Name of the output file", fn); + cmd.addSwitch("list", "List all benchmarks and exit", list); + cmd.addSwitch("all", "Run all benchmarks", all); + + try { cmd.getValues(argc,argv); } catch (int rv) { return rv; } + //! [Parse command line] + + //! [List benchmarks and exit] + if (list) { + gsInfo << "\nThe following benchmarks are available:\n" + << "#01: " << benchmark_c_array_memcopy::descr() << "\n" + << "#02: " << benchmark_eigen_memcopy::descr() << "\n" + << "#03: " << benchmark_c_array_dotproduct::descr() << "\n" + << "#04: " << benchmark_eigen_dotproduct::descr() << "\n" + << "#05: " << benchmark_c_array_axpy::descr() << "\n" + << "#06: " << benchmark_eigen_axpy::descr() << "\n" + << "#07: " << benchmark_c_array_dense_matmul::descr() << "\n" + << "#08: " << benchmark_eigen_dense_matmul::descr() << "\n" + << "#09: " << benchmark_poisson2d_visitor::descr() + << " with increasing number of patches" << "\n" + << "#10: " << benchmark_poisson2d_visitor::descr() + << " with increasing number of subdivisions" << "\n" + << "#11: " << benchmark_poisson3d_visitor::descr() + << " with increasing number of patches" << "\n" + << "#12: " << benchmark_poisson3d_visitor::descr() + << " with increasing number of subdivisions" << "\n" + << "#13: " << benchmark_poisson2d_expression_assembler::descr() + << " with increasing number of patches" << "\n" + << "#14: " << benchmark_poisson2d_expression_assembler::descr() + << " with increasing number of subdivisions" << "\n" + << "#15: " << benchmark_poisson3d_expression_assembler::descr() + << " with increasing number of patches" << "\n" + << "#16: " << benchmark_poisson3d_expression_assembler::descr() + << " with increasing number of subdivisions" << "\n"; + + return EXIT_SUCCESS; + } + //! [List benchmarks and exit] + + //! [Default configuration] + // If empty fill with all benchmarks 1, 2, ... 
+ if (all) { + benchmarks.clear(); + for(index_t i=1; i<=16; ++i) + benchmarks.push_back(i); + } + + // If empty fill with 1, 2, 4, ..., maximum number of OpenMP threads + if (nthreads.empty()) { + for(index_t i=1; i<=omp_get_max_threads(); i*=2) + nthreads.push_back(i); + } + + // If empty fill with msizemin*msizefactor^k, k=0, 1, 2, ..., msizemax + if (msizes.empty()) { + for(index_t i=msizemin;;) { + msizes.push_back(i); + if (i<=math::min(msizemax, std::numeric_limits::max()) / (msizefactor*msizefactor)) + i*=msizefactor; + else + break; + } + } + + // If empty fill with patchesmin, ..., patchesmax + if (patches.empty()) { + for(index_t i=patchesmin; i<=patchesmax; i*=patchesfactor) + patches.push_back(i); + } + + // If empty fill with subdividemin, ..., subdividemax + if (subdivides.empty()) { + for(index_t i=subdividemin; i::max()) / vsizefactor) + i*=vsizefactor; + else + break; + } + } + + // If empty fill with nrunsmax/nrunsfactor^k, k=0, 1, 2, ..., nrunsmin + if (nruns.empty()) { + index_t k = nrunsmax; + for(index_t i=0; i<(index_t)math::max(msizes.size(), patches.size(), + subdivides.size(), vsizes.size()); ++i) { + nruns.push_back(k); + k = math::max(nrunsmin, (index_t)(k/nrunsfactor)); + } + } + + if (nruns.size() > + (vsizes, nruns, nthreads); + break; + } + + case (2): { + // Benchmark: memcopy gsVector + benchmark.create > + (vsizes, nruns, nthreads); + break; + } + + case (3): { + // Benchmark: dot-product native C array + benchmark.create > + (vsizes, nruns, nthreads); + break; + } + + case (4): { + // Benchmark: dot-product gsVector + benchmark.create > + (vsizes, nruns, nthreads); + break; + } + + case (5): { + // Benchmark: axpy native C array + benchmark.create > + (vsizes, nruns, nthreads); + break; + } + + case (6): { + // Benchmark: axpy gsVector + benchmark.create > + (vsizes, nruns, nthreads); + break; + } + + case (7): { + // Benchmark: dense matrix-vector multiplication native C array + benchmark.create > + (msizes, nruns, nthreads); + 
break; + } + + case (8): { + // Benchmark: dense matrix-vector multiplication gsMatrix/gsVector + benchmark.create > + (msizes, nruns, nthreads); + break; + } + + case (9): { + // Benchmark: visitor-based Poisson 2d assembler with increasing number of patches + benchmark.create > + (util::zip(patches, + make_vector((index_t)0, patches.size()), // subdivisions : 0 + make_vector((index_t)3, patches.size())), // degree : 3 + nruns, nthreads, " with increasing number of patches (#subdivisions=0, degree=3)"); + break; + } + + case (10): { + // Benchmark: visitor-based Poisson 2d assembler with increasing number of subdivisions + benchmark.create > + (util::zip(make_vector((index_t)1, subdivides.size()), // patches : 1 + subdivides, + make_vector((index_t)3, subdivides.size())), // degree : 3 + nruns, nthreads, " with increasing number of subdivisions (#patches=1, degree=3)"); + break; + } + + case (11): { + // Benchmark: visitor-based Poisson 3d assembler with increasing number of patches + benchmark.create > + (util::zip(patches, + make_vector((index_t)0, patches.size()), // subdivisions : 0 + make_vector((index_t)2, patches.size())), // degree : 2 + nruns, nthreads, " with increasing number of patches (#subdivisions=0, degree=2)"); + break; + } + + case (12): { + // Benchmark: visitor-based Poisson 3d assembler with increasing number of subdivisions + benchmark.create > + (util::zip(make_vector((index_t)1, subdivides.size()), // patches : 1 + subdivides, + make_vector((index_t)2, subdivides.size())), // degree : 2 + nruns, nthreads, " with increasing number of subdivisions (#patches=1, degree=2)"); + break; + } + + case (13): { + // Benchmark: expression assembler-based Poisson 2d assembler with increasing number of patches + benchmark.create > + (util::zip(patches, + make_vector((index_t)0, patches.size()), // subdivisions : 0 + make_vector((index_t)3, patches.size())), // degree : 3 + nruns, nthreads, " with increasing number of patches (#subdivisions=0, 
degree=3)"); + break; + } + + case (14): { + // Benchmark: expression assembler-based Poisson 2d assembler with increasing number of subdivision + benchmark.create > + (util::zip(make_vector((index_t)1, subdivides.size()), // patches : 1 + subdivides, + make_vector((index_t)3, subdivides.size())), // degree : 3 + nruns, nthreads, " with increasing number of subdivisions (#patches=1, degree=3)"); + break; + } + + case (15): { + // Benchmark: expression assembler-based Poisson 3d assembler with increasing number of patches + benchmark.create > + (util::zip(patches, + make_vector((index_t)0, patches.size()), // subdivisions : 0 + make_vector((index_t)2, patches.size())), // degree : 2 + nruns, nthreads, " with increasing number of patches (#subdivisions=0, degree=2)"); + break; + } + + case (16): { + // Benchmark: expression assembler-based Poisson 3d assembler with increasing number of subdivision + benchmark.create > + (util::zip(make_vector((index_t)1, subdivides.size()), // patches : 1 + subdivides, + make_vector((index_t)2, subdivides.size())), // degree : 2 + nruns, nthreads, " with increasing number of subdivisions (#patches=1, degree=2)"); + break; + } + + default: + GISMO_ERROR("Invalid benchmark"); + } + + } // benchmark loop + + { // Memory copy ratio + auto bmA = benchmark.find(benchmark_c_array_memcopy::label()); + auto bmB = benchmark.find(benchmark_eigen_memcopy::label()); + + if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { + auto bm = util::ratio("memcopyRatio", + "Memory copy (gsVector : native C array)", *bmB, *bmA); + benchmark.get().push_back( give(bm) ); + } + } + + { // Dot product ratio + auto bmA = benchmark.find(benchmark_c_array_dotproduct::label()); + auto bmB = benchmark.find(benchmark_eigen_dotproduct::label()); + + if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { + auto bm = util::ratio("dotproductRatio", + "Dot product (gsVector : native C array)", *bmB, *bmA); + 
benchmark.get().push_back( give(bm) ); + } + } + + { // AXPY ratio + auto bmA = benchmark.find(benchmark_c_array_axpy::label()); + auto bmB = benchmark.find(benchmark_eigen_axpy::label()); + + if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { + auto bm = util::ratio("axpyRatio", + "AXPY (gsVector : native C array)", *bmB, *bmA); + benchmark.get().push_back( give(bm) ); + } + } + + { // Dense matrix-vector multiplication ratio + auto bmA = benchmark.find(benchmark_c_array_dense_matmul::label()); + auto bmB = benchmark.find(benchmark_eigen_dense_matmul::label()); + + if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { + auto bm = util::ratio("densematmulRatio", + "Dense matrix-vector multiplication (gsMatrix/gsVector : native C array)", + *bmB, *bmA); + benchmark.get().push_back( give(bm) ); + } + } + + if (fn.empty()) + gsInfo << benchmark << "\n"; + else if (gsFileManager::getExtension(fn) == "tex") { + std::ofstream file; + file.open(fn); + benchmark.to_tikz(file); + file.close(); + } + else if (gsFileManager::getExtension(fn) == "xml") { + gsFileData<> file; + file << benchmark; + file.save(fn); + } + else { + GISMO_ERROR("Unsupported file extension"); + } + //! [Execute benchmarks] + + return EXIT_SUCCESS; +} diff --git a/extensions/gsXBraid/CMakeLists.txt b/extensions/gsXBraid/CMakeLists.txt new file mode 100644 index 0000000000..2a490a97ff --- /dev/null +++ b/extensions/gsXBraid/CMakeLists.txt @@ -0,0 +1,150 @@ +### CMakeLists.txt --- +## +## Author: Angelos Mantzaflaris +## Copyright (C) 2016 - RICAM-Linz. 
+###################################################################### + +## XBraid extension +project(gsXBraidExtension) + +# Collect file names +aux_header_directory(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_HEADERS) +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_SOURCES) +aux_tmpl_header_directory(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_HPPFILES) + +# Apply same configuration as G+Smo +include(gsConfig) + +if(CMAKE_C_COMPILER_ID MATCHES "MSVC") + add_definitions(-D_CRT_NONSTDC_NO_WARNINGS) + add_definitions(-D_CRT_SECURE_NO_WARNINGS) +endif() + +# Look for pre-installed XBraid libraries +find_package(XBRAID QUIET) + +if (NOT XBRAID_FOUND) + # Set XBraid version + set(XBRAID_VER "master") + + # Download XBraid sources at configure time + include(gsFetch) + gismo_fetch_directory(XBraid + URL https://github.com/XBraid/xbraid/archive/${XBRAID_VER}.zip + DESTINATION external + ) + + if( (NOT GISMO_BUILD_LIB) ) + aux_instance_directory (${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_INS) + if(${PROJECT_NAME}_INS) + LIST( REMOVE_ITEM ${PROJECT_NAME}_CPP ${${PROJECT_NAME}_INS}) + endif() + endif() + + # Set XBraid library header files + set(${PROJECT_NAME}_HEADERS "${${PROJECT_NAME}_HEADERS}" + ${gismo_externals}/XBraid/braid/_braid.h + ${gismo_externals}/XBraid/braid/base.h + ${gismo_externals}/XBraid/braid/status.h + ${gismo_externals}/XBraid/braid/tape.h + ${gismo_externals}/XBraid/braid/util.h + ${gismo_externals}/XBraid/braid/braid.h + ${gismo_externals}/XBraid/braid/braid_status.h + ${gismo_externals}/XBraid/braid/braid_test.h) + + if(NOT GISMO_WITH_MPI ) + set(${PROJECT_NAME}_HEADERS "${${PROJECT_NAME}_HEADERS}" + ${gismo_externals}/XBraid/braid/mpistubs.h) + set(${PROJECT_NAME}_SOURCES "${${PROJECT_NAME}_SOURCES}" + ${gismo_externals}/XBraid/braid/mpistubs.c) + endif() + + # Set XBraid library sources files + set(${PROJECT_NAME}_SOURCES "${${PROJECT_NAME}_SOURCES}" + ${gismo_externals}/XBraid/braid/access.c + 
${gismo_externals}/XBraid/braid/adjoint.c + ${gismo_externals}/XBraid/braid/base.c + ${gismo_externals}/XBraid/braid/braid.c + ${gismo_externals}/XBraid/braid/braid_status.c + ${gismo_externals}/XBraid/braid/braid_test.c + ${gismo_externals}/XBraid/braid/communication.c + ${gismo_externals}/XBraid/braid/distribution.c + ${gismo_externals}/XBraid/braid/drive.c + ${gismo_externals}/XBraid/braid/grid.c + ${gismo_externals}/XBraid/braid/hierarchy.c + ${gismo_externals}/XBraid/braid/interp.c + ${gismo_externals}/XBraid/braid/norm.c + ${gismo_externals}/XBraid/braid/refine.c + ${gismo_externals}/XBraid/braid/relax.c + ${gismo_externals}/XBraid/braid/residual.c + ${gismo_externals}/XBraid/braid/restrict.c + ${gismo_externals}/XBraid/braid/space.c + ${gismo_externals}/XBraid/braid/step.c + ${gismo_externals}/XBraid/braid/tape.c + ${gismo_externals}/XBraid/braid/util.c + ${gismo_externals}/XBraid/braid/uvector.c) + +# Set XBraid library include files + set(XBRAID_INCLUDE_DIR ${gismo_externals}/XBraid/braid CACHE INTERNAL "") + include_directories(${XBRAID_INCLUDE_DIR}) + +endif (NOT XBRAID_FOUND) + +# Compile gsXBraid extension as part of the G+Smo library +add_library(${PROJECT_NAME} OBJECT + ${${PROJECT_NAME}_HEADERS} + ${${PROJECT_NAME}_HPPFILES} + ${${PROJECT_NAME}_SOURCES} + ) + +# Set standard properties for all G+Smo extensions +set_target_properties(${PROJECT_NAME} PROPERTIES + COMPILE_DEFINITIONS gismo_EXPORTS + POSITION_INDEPENDENT_CODE ON + LINKER_LANGUAGE CXX + #START Export all symbols from this extension + CXX_VISIBILITY_PRESET default + C_VISIBILITY_PRESET default + VISIBILITY_INLINES_HIDDEN 0 + #END Export all symbols from this extension + FOLDER "G+Smo extensions" + ) + +if( GISMO_WITH_MPI ) + target_include_directories(${PROJECT_NAME} PRIVATE ${MPI_INCLUDE_PATH}) +else() + add_definitions("-Dbraid_SEQUENTIAL") +endif() + +# Add gsXBraid extension to the list of G+Smo extensions +set(gismo_EXTENSIONS ${gismo_EXTENSIONS} $ + CACHE INTERNAL "gismo extensions 
to be included") + +# Add XBraid include directories to G+Smo standard include directories +set (GISMO_INCLUDE_DIRS ${GISMO_INCLUDE_DIRS} ${XBRAID_INCLUDE_DIR} + CACHE INTERNAL "gismo include directories") + +# Install gsXBraid header files +install(DIRECTORY ${PROJECT_SOURCE_DIR} + DESTINATION include/gismo/gsXBraid + FILES_MATCHING PATTERN "*.h") + +# Add filedata folder +add_definitions(-DXBRAID_DATA_DIR="${CMAKE_CURRENT_SOURCE_DIR}/filedata/") + +# Add example files +include_directories(${CODIPACK_INCLUDE_DIR}) +aux_cpp_directory(${CMAKE_CURRENT_SOURCE_DIR}/examples FILES) +foreach(file ${FILES}) + add_gismo_executable(${file}) + get_filename_component(tarname ${file} NAME_WE) # name without extension + set_property(TEST ${tarname} PROPERTY LABELS "${PROJECT_NAME}") + set_target_properties(${tarname} PROPERTIES FOLDER "${PROJECT_NAME}") + if( GISMO_WITH_MPI ) + target_include_directories(${tarname} PRIVATE ${MPI_INCLUDE_PATH}) + endif() + # Install the example executables (optionally) + install(TARGETS ${tarname} DESTINATION "${BIN_INSTALL_DIR}" COMPONENT exe OPTIONAL) +endforeach(file ${FILES}) + +set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}/bin/) diff --git a/extensions/gsXBraid/README.md b/extensions/gsXBraid/README.md new file mode 100644 index 0000000000..b64311843c --- /dev/null +++ b/extensions/gsXBraid/README.md @@ -0,0 +1,113 @@ +# XBraid extension + +G+Smo extension for the [XBraid - Parallel-in-time Solver Package](https://github.com/XBraid/xbraid). + +|CMake flags|```-DGISMO_WITH_XBRAID=ON``` (default ```OFF```)| +|--:|---| +|Required additional CMake flags|```-DGISMO_WITH_MPI=ON``` (recommended)
```-DGISMO_WITH_OPENMP=ON``` (optional)| +|License|[MPL 2.0](https://www.mozilla.org/en-US/MPL/2.0/)| +|OS support|Linux, Windows, macOS| +|Status|completed| +|Developer|Matthias Möller| +|Maintainer|M.Moller@tudelft.nl| +|Last checked|05-05-2021| + +*** +__Table of contents__ +1. [Introduction](#introduction) +2. [Usage example](#usage_example) +*** + +__Introduction__ + +The XBraid extension builds on the open-source +[XBraid](https://github.com/XBraid/xbraid) package developed at +[Lawrence Livermore National +Laboratory](https://computation.llnl.gov/projects/parallel-time-integration-multigrid/), +and at collaborating [academic +institutions](https://github.com/XBraid/xbraid/wiki/Team). XBraid is a +non-intrusive, optimal-scaling parallel-in-time solver that builds on +multigrid reduction techniques (multigrid-reduction-in-time or MGRIT). + +The XBraid extension provides a generic wrapper to XBraid's C++ +interface that can be easily customized by deriving an application +from the class `gsXBraid` and overriding some or all virtual methods: + +```cpp +virtual braid_Int Access(braid_Vector, BraidAccessStatus&); +virtual braid_Int BufPack(braid_Vector, void*, BraidBufferStatus&); +virtual braid_Int BufSize(braid_Int*, BraidBufferStatus&); +virtual braid_Int BufUnpack(void*, braid_Vector*, BraidBufferStatus&); +virtual braid_Int Clone(braid_Vector, braid_Vector*); +virtual braid_Int Coarsen(braid_Vector, braid_Vector*, BraidCoarsenRefStatus&); +virtual braid_Int Free(braid_Vector); +virtual braid_Int Init(braid_Real, braid_Vector*); +virtual braid_Int Refine(braid_Vector, braid_Vector*, BraidCoarsenRefStatus&); +virtual braid_Int Residual(braid_Vector, braid_Vector, BraidStepStatus&); +virtual braid_Int SpatialNorm(braid_Vector, braid_Real*); +virtual braid_Int Step(braid_Vector, braid_Vector, braid_Vector, BraidStepStatus&); +virtual braid_Int Sum(braid_Real, braid_Vector, braid_Real, braid_Vector); +``` + +__Usage example__ + +The file 
```xbraid_heatEquation_example.cpp``` illustrates the basic usage of the gsXBraid extension. + +1. Configuration and compilation (MPI-only mode) + + ```bash + mkdir build + cd build + cmake .. -DGISMO_WITH_XBRAID=ON -DGISMO_WITH_MPI=ON + make xbraid_heatEquation_example -j4 + ``` + +2. Execution (MPI-only mode) + + ```bash + mpirun -np NPROC --hostfile HOSTFILE ./bin/xbraid_heatEquation_example -n 250 -r 6 -i 3 + ``` + + This will solve the two-dimensional heat equation on a unit square + with 250 time steps in the time interval [0, 0.1] using + `NPROC` MPI processes. The `hostfile` should have the following structure + + ```text + node0 slots=#slots max_slots=#maximum slots + node1 slots=#slots max_slots=#maximum slots + ... + ``` + + The spatial domain is 6 times regularly refined in space (h-refinement) + and the approximation order is increased 3 times (p-refinement). + Order elevation instead of order increase can be achieved by replacing + the switch `-i` by `-e`. + + For a complete list of command-line arguments, run + ```bash + ./bin/xbraid_heatEquation_example -h + ``` + +3. Configuration and compilation (MPI-OpenMP mode) + + ```bash + mkdir build + cd build + cmake .. -DGISMO_WITH_XBRAID=ON -DGISMO_WITH_MPI=ON -DGISMO_WITH_OPENMP=ON + make xbraid_heatEquation_example -j4 + ``` + +4. Execution (MPI-OpenMP mode) + + ```bash + mpirun -np NPROC --hostfile HOSTFILE -x OMP_NUM_THREADS=NTHREAD ./bin/xbraid_heatEquation_example -n 250 -r 6 -i 3 + ``` + + The additional parameter `-x OMP_NUM_THREADS=NTHREAD` ensures that + each MPI process executes `NTHREAD` OpenMP threads in parallel. The `-x` + flag is not supported by all MPI implementations. 
If it does not work + try + + ```bash + mpirun -np --hostfile -env OMP_NUM_THREADS ./bin/xbraid_heatEquation_example -n 250 -r 6 -i 3 + ``` diff --git a/extensions/gsXBraid/examples/gsXBraidMultigrid.h b/extensions/gsXBraid/examples/gsXBraidMultigrid.h new file mode 100644 index 0000000000..a87d169592 --- /dev/null +++ b/extensions/gsXBraid/examples/gsXBraidMultigrid.h @@ -0,0 +1,1193 @@ +#include +#include + +namespace gismo { + + /** @brief The p-multigrid base class provides the basic + * methods (smoothing, prolongation, restriction) for + * implementing p-multigrid methods + */ + + template + struct gsXBraidMultigridBase + { + protected: + int maxIter; + int numLevels; + int numSmoothing; + int typeBCHandling; + int typeCycle_h; + int typeCycle_p; + int typeLumping; + int typeProjection; + int typeSmoother; + gsMatrix<> hp; + T tol; + + public: + /// @brief Constructor + gsXBraidMultigridBase() + : maxIter(100000), + numLevels(1), + numSmoothing(1), + typeBCHandling(1), + typeCycle_h(2), + typeCycle_p(1), + typeLumping(1), + typeProjection(1), + typeSmoother(1), + tol(1e-8) + {} + + void setMaxIter(int maxIter) + { this->maxIter = maxIter; } + + void setTolerance(T tol) + { this->tol = tol; } + + void setNumLevels(int numLevels, int typeProjection, int numDegree) + { + if(typeProjection == 1) + { + this->numLevels = numLevels - numDegree + 2; + } + else + { + this->numLevels = numLevels; + } + } + + void setNumSmoothing(int numSmoothing) + { this->numSmoothing = numSmoothing; } + + void setTypeBCHandling(int typeBCHandling) + { this->typeBCHandling = typeBCHandling; } + + void setTypeCycle_h(int typeCycle_h) + { this->typeCycle_h = typeCycle_h; } + + void setTypeCycle_p(int typeCycle_p) + { this->typeCycle_p = typeCycle_p; } + + void setTypeLumping(int typeLumping) + { this->typeLumping = typeLumping; } + + void setTypeProjection(int typeProjection) + { this->typeProjection = typeProjection; } + + void setTypeSmoother(int typeSmoother) + { this->typeSmoother = 
typeSmoother; } + + void setCoarsening(gsMatrix<> hp) + { this->hp = hp; } + + virtual gsXBraidMultigridBase& compute(const gsSparseMatrix& mat, const T tstep, const int& numDegree, index_t typeMethod) + { + // Get arguments explicitly + gsMatrix x = gsMatrix<>::Zero(mat.rows(),1); + gsMatrix b = gsMatrix<>::Zero(mat.rows(),1); + gsFunctionExpr<> rhs("1",2); + int iterTot = 1; + int typeMultigrid = 2; + int typeCoarseOperator = 1; + + /// @brief Set-up p-multigrid solver + setup(rhs, + x, + b, + iterTot, + numLevels, + numDegree, + typeMultigrid, + hp, + typeCoarseOperator, + tstep, + typeMethod); + + + return *this; } + + virtual gsMatrix solveWithGuess(const gsMatrix& b, + const gsMatrix& x0) + { + // Get arguments explicitly + gsMatrix x(x0); + x = x0; + + gsFunctionExpr<> rhs("1",2); + int iterTot = 1; + int typeMultigrid = 2; + int typeCoarseOperator = 1; + + /// @brief Apply p-multigrid solver to given right-hand side on level l + solve(rhs, + x, + b, + iterTot, + numLevels, + typeMultigrid, + hp, + typeCoarseOperator); + return x; + } + + /// @brief Apply p-multigrid solver to given right-hand side on level l + virtual void solveMG(const gsMatrix & rhs, + std::vector > > m_bases, + gsMatrix& x, + const int& numLevels, + gsBoundaryConditions bcInfo, + gsMultiPatch mp, + std::vector >& m_prolongation_P, + std::vector >& m_restriction_P, + std::vector >& m_prolongation_M, + std::vector >& m_restriction_M, + std::vector >& m_prolongation_H, + std::vector >& m_restriction_H, + const gsMatrix& hp) + { + if ( numLevels == 1) + { + solvecoarse(rhs, x, numLevels); + return; + } + + if (hp(std::max(numLevels-2,0),0) == 0 ) + { + gsMatrix fineRes, coarseRes, fineCorr, coarseCorr, postRes; + presmoothing(rhs, x, numLevels, fineRes, hp); + restriction(fineRes, coarseRes, numLevels, m_bases, + bcInfo, mp, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + //coarseRes.setZero(coarseRes.rows(),1); + 
coarseCorr.setZero(coarseRes.rows(),1); + for( int j = 0 ; j < (typeCycle_p == 2 ? 2 : 1) ; j++) + { + solveMG(coarseRes, m_bases, coarseCorr, numLevels-1, + bcInfo, mp, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + } + prolongation(coarseCorr, fineCorr, numLevels, m_bases, + bcInfo, mp, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + postsmoothing(rhs, x, numLevels, fineCorr, postRes, + hp); + } + + if (hp(std::max(numLevels-2,0),0) == 1 ) + { + gsMatrix fineRes, coarseRes, fineCorr, coarseCorr, postRes; + presmoothing(rhs, x, numLevels, fineRes, hp); + restriction(fineRes, coarseRes, numLevels, m_bases, + bcInfo, mp, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + //coarseRes.setZero(coarseRes.rows(),1); + coarseCorr.setZero(coarseRes.rows(),1); + for( int i = 0 ; i < (typeCycle_h == 2 ? 
2 : 1) ; i++) + { + solveMG(coarseRes, m_bases, coarseCorr, numLevels-1, + bcInfo, mp, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + } + prolongation(coarseCorr, fineCorr, numLevels, m_bases, + bcInfo, mp, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + postsmoothing(rhs,x, numLevels, fineCorr, postRes, + hp); + } + } + + virtual void setup(const gsFunctionExpr & rhs, + gsMatrix& x, + gsMatrix f, + const int& iterTot, + const int& numLevels, + const int& numDegree, + const int& typeMultigrid, + const gsMatrix& hp, + const int& typeCoarseOperator, + T tstep, + index_t typeMethod){} + + virtual void solve(const gsFunctionExpr & rhs, + gsMatrix& x, + gsMatrix f, + const int& iterTot, + const int& numLevels, + const int& typeMultigrid, + const gsMatrix& hp, + const int& typeCoarseOperator){} + + /// @brief Apply fixed number of smoothing steps (pure virtual method) + virtual void presmoothing(const gsMatrix& rhs, + gsMatrix& x, + const int& numLevels, + gsMatrix & fineRes , + const gsMatrix& hp) = 0; + + /// @brief Apply fixed number of smoothing steps (pure virtual method) + virtual void postsmoothing(const gsMatrix& rhs, + gsMatrix& x, + const int& numLevels, + gsMatrix & fineCorr, + gsMatrix & postRes, + const gsMatrix& hp) = 0; + + /// @brief Apply coarse solver (pure virtual method) + virtual void solvecoarse(const gsMatrix& rhs, + gsMatrix& x, + const int& numLevels) = 0; + + /// @brief Prolongate coarse space function to fine space + virtual gsSparseMatrix prolongation_P(const int& numLevels, + std::vector > > m_bases) = 0; + + /// @brief Prolongate coarse space function to fine space + virtual gsSparseMatrix restriction_P(const int& numLevels, + std::vector > > m_bases) = 0; + + /// @brief Prolongate coarse space function to fine space + virtual gsMatrix prolongation_M(const int& numLevels, + std::vector > > m_bases) = 0; 
+ + /// @brief Prolongate coarse space function to fine space + virtual gsMatrix restriction_M(const int& numLevels, + std::vector > > m_bases) = 0; + + /// @brief Prolongate coarse space function to fine space + virtual void prolongation(const gsMatrix& Xcoarse, + gsMatrix& Xfine, + const int& numLevels, + std::vector > > m_bases, + gsBoundaryConditions bcInfo, + gsMultiPatch mp, + std::vector >& m_prolongation_P, + std::vector >& m_restriction_P, + std::vector >& m_prolongation_M, + std::vector >& m_restriction_M, + std::vector >& m_prolongation_H, + std::vector >& m_restriction_H, + const gsMatrix& hp) + { + if (hp(numLevels-2,0) == 1) + { + Xfine = m_prolongation_H[numLevels-2]*Xcoarse; + } + else + { + if (typeLumping == 1) + { + gsMatrix temp = m_prolongation_P[numLevels-2]*Xcoarse; + gsMatrix M_L_inv = (m_prolongation_M[numLevels-2]).array().inverse(); + Xfine = (M_L_inv).cwiseProduct(temp); + } + else + { + // Define the low and high order bases + gsMultiBasis basesL = *m_bases[numLevels-2]; + gsMultiBasis basesH = *m_bases[numLevels-1]; + typedef gsExprAssembler::geometryMap geometryMap; + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; + + // Determine matrix M (high_order * high_order) + gsExprAssembler ex2(1,1); + geometryMap G2 = ex2.getMap(mp); + space w_n = ex2.getSpace(basesH ,1, 0); + w_n.setInterfaceCont(0); + if (typeBCHandling == 1) + { + w_n.setup(bcInfo, dirichlet::l2Projection, 0); + //#w_n.addBc(bcInfo.get("Dirichlet")); + } + ex2.setIntegrationElements(basesH); + ex2.initSystem(); + ex2.assemble(w_n * meas(G2) * w_n.tr()); + + // Prolongate Xcoarse to Xfine + gsMatrix temp = m_prolongation_P[numLevels-2]*Xcoarse; + gsSparseMatrix M = ex2.matrix(); + gsConjugateGradient CGSolver(M); + CGSolver.setTolerance(1e-12); + CGSolver.solve(temp,Xfine); + } + } + } + + /// @brief Restrict fine space function to coarse space + virtual void restriction(const gsMatrix& Xfine, + gsMatrix& Xcoarse, + const int& 
numLevels, + std::vector > > m_bases, + gsBoundaryConditions bcInfo, + gsMultiPatch mp, + std::vector >& m_prolongation_P, + std::vector >& m_restriction_P, + std::vector >& m_prolongation_M, + std::vector >& m_restriction_M, + std::vector >& m_prolongation_H, + std::vector >& m_restriction_H, + const gsMatrix& hp) + { + if (hp(numLevels-2,0) == 1) + { + Xcoarse = m_restriction_H[numLevels-2]*Xfine; + } + else + { + if (typeLumping == 1) + { + // Standard way + gsMatrix temp = m_restriction_P[numLevels-2]*Xfine; + gsMatrix M_L_inv = (m_restriction_M[numLevels-2]).array().inverse(); + Xcoarse = (M_L_inv).cwiseProduct(temp); + } + else + { + // Define the low and high order bases + gsMultiBasis basesL = *m_bases[numLevels-2]; + gsMultiBasis basesH = *m_bases[numLevels-1]; + typedef gsExprAssembler::geometryMap geometryMap; + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; + + // Determine matrix M (low_order * low_order) + gsExprAssembler ex2(1,1); + geometryMap G2 = ex2.getMap(mp); + space w_n = ex2.getSpace(basesL, 1, 0); + w_n.setInterfaceCont(0); + if (typeBCHandling == 1) + { + w_n.setup(bcInfo, dirichlet::l2Projection, 0); + //#w_n.addBc(bcInfo.get("Dirichlet")); + } + ex2.setIntegrationElements(basesL); + ex2.initSystem(); + ex2.assemble(w_n * meas(G2) * w_n.tr()); + + // Restrict Xfine to Xcoarse + gsMatrix temp = m_restriction_P[numLevels-2]*Xfine; + gsSparseMatrix M = ex2.matrix(); + gsConjugateGradient CGSolver(M); + CGSolver.setTolerance(1e-12); + CGSolver.solve(temp, Xcoarse); + } + } + } + }; + + /** @brief The p-multigrid class implements a generic p-multigrid solver + * that can be customized by passing assembler and coarse + * solver as template arguments. + * + * @note: This implementation assumes that all required prolongation/ + * restriction operators are generated internally. Therefore, a + * problem-specific assembler has to be passed as template argument. 
+ */ + template + struct gsXBraidMultigrid : public gsXBraidMultigridBase + { + private: + + /// Base class type + typedef gsXBraidMultigridBase Base; + + /// Shared pointer to multi-patch geometry + memory::shared_ptr > m_mp_ptr; + + /// Shared pointer to boundary conditions + memory::shared_ptr > m_bcInfo_ptr; + + /// std::vector of multi-basis objects + std::vector > > m_bases; + + /// std::vector of prolongation operators + std::vector< gsSparseMatrix > m_prolongation_P; + + /// std::vector of restriction operators + std::vector< gsSparseMatrix > m_restriction_P; + + /// std::vector of prolongation operators + std::vector< gsMatrix > m_prolongation_M; + + /// std::vector of restriction operators + std::vector< gsMatrix > m_restriction_M; + + /// std::vector of prolongation operators + std::vector< gsSparseMatrix > m_prolongation_H; + + /// std::vector of restriction operators + std::vector< gsSparseMatrix > m_restriction_H; + + /// std::vector of factorized operators + std::vector< std::vector< gsSparseMatrix > > m_ILUT; + + /// std::vector of factorized operators + std::vector< std::vector < Eigen::PermutationMatrix > > m_P; + + /// std::vector of factorized operators + std::vector < std::vector < Eigen::PermutationMatrix > > m_Pinv; + + /// std::vector of SCM smoother object + std::vector< typename gsPreconditionerOp::Ptr > m_SCMS; + + /// std::vector of operator objects + std::vector< gsSparseMatrix > m_operator; + + /// std::vector of std::vector of block operator objects + std::vector < std::vector< gsSparseMatrix > > m_block_operator; + + /// std::vector of std::vector of block operator objects + std::vector < std::vector < gsSparseMatrix > > m_ddB; + + /// std::vector of std::vector of block operator objects + std::vector < std::vector < gsSparseMatrix > > m_ddC; + + /// std::vector of std::vector of block operator objects + std::vector < std::vector < gsMatrix > > m_ddBtilde; + + /// std::vector of std::vector of block operator objects + std::vector < 
std::vector < gsMatrix > > m_ddCtilde; + + /// std::vector of std::vector of block operator objects + std::vector < gsMatrix > m_A_aprox; + + /// std::vector of std::vector of block operator objects + std::vector < gsSparseMatrix > m_S; + + /// std::vector of std::vector of shift objects + std::vector < std::vector< int > > m_shift; + + public: + + // Constructor + gsXBraidMultigrid(const gsMultiPatch & mp, + const gsMultiBasis & bases, + const gsBoundaryConditions & bcInfo) + { + m_mp_ptr = memory::make_shared_not_owned(&mp); + m_bcInfo_ptr = memory::make_shared_not_owned(&bcInfo); + m_bases.push_back(memory::make_shared_not_owned(&bases)); + } + + virtual ~gsXBraidMultigrid() {} + + public: + + /// @brief Set-up p-multigrid solver + void setup(const gsFunctionExpr & rhs, + gsMatrix& x, + gsMatrix f, + const int& iterTot, + const int& numLevels, + const int& numDegree, + const int& typeMultigrid, + const gsMatrix& hp, + const int& typeCoarseOperator, + T tstep, + index_t typeMethod) + { + for (int i = 1; i < numLevels; i++) + { + m_bases.push_back(give(m_bases.back()->clone())); + switch((int) hp(i-1,0) ) + { + case 0 : (Base::typeProjection == 1 ? 
+ m_bases.back()->degreeIncrease(numDegree-1) : + m_bases.back()->degreeIncrease()); break; + + case 1 : m_bases.back()->uniformRefine(); break; + + case 2: m_bases.back()->uniformRefine(); + m_bases.back()->degreeIncrease(); break; + } + } + + // Generate sequence of matrix K and M + m_operator.resize(numLevels); + gsStopwatch clock; + //gsInfo << "|| Multigrid hierarchy ||" <degree() << ", Ndof: " << m_bases[i]->totalSize() <::geometryMap geometryMap; + typedef typename gsExprAssembler::variable variable; + typedef typename gsExprAssembler::space space; + typedef typename gsExprAssembler::solution solution; + + gsExprAssembler K, M; + + // Set the bases + K.setIntegrationElements(*m_bases[i]); + M.setIntegrationElements(*m_bases[i]); + + // Set the geometry map + geometryMap G_K = K.getMap(*m_mp_ptr); + geometryMap G_M = M.getMap(*m_mp_ptr); + + // Set the discretization space + space u_K = K.getSpace(*m_bases[i]); + space u_M = M.getSpace(*m_bases[i]); + u_K.setInterfaceCont(0); + u_M.setInterfaceCont(0); + u_K.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + u_M.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + //#u_K.addBc( m_bcInfo_ptr->get("Dirichlet") ); + //#u_M.addBc( m_bcInfo_ptr->get("Dirichlet") ); + + // Set the source term + auto ff_K = K.getCoeff(rhs, G_K); + auto ff_M = M.getCoeff(rhs, G_M); + + // Initialize and assemble the system matrix + K.initSystem(); + K.assemble( igrad(u_K, G_K) * igrad(u_K, G_K).tr() * meas(G_K), u_K * ff_K * meas(G_K) ); + + // Initialize and assemble the mass matrix + M.initSystem(); + M.assemble( u_M * u_M.tr() * meas(G_M), u_M * ff_M * meas(G_M) ); + + + m_operator[i] = M.matrix() + tstep*K.matrix(); + switch(typeMethod) + { + case 0: m_operator[i] = M.matrix(); break; + case 1: m_operator[i] = M.matrix() + tstep*K.matrix(); break; + case 2: m_operator[i] = M.matrix() + 0.5*tstep*K.matrix(); + } + + } + real_t Time_Assembly = clock.stop(); + GISMO_UNUSED(Time_Assembly); + + + // Resize vector of operators + 
m_prolongation_P.resize(numLevels-1); + m_prolongation_M.resize(numLevels-1); + m_prolongation_H.resize(numLevels-1); + m_restriction_P.resize(numLevels-1); + m_restriction_M.resize(numLevels-1); + m_restriction_H.resize(numLevels-1); + + // Determine prolongation/restriction operators in p + clock.restart(); + for (int i = 1; i < numLevels; i++) + { + if (hp(i-1,0) == 0) + { + m_prolongation_P[i-1] = prolongation_P(i+1, m_bases); + m_restriction_P[i-1] = m_prolongation_P[i-1].transpose(); //restriction_P(i+1, m_bases); + m_prolongation_M[i-1] = prolongation_M(i+1, m_bases); + m_restriction_M[i-1] = restriction_M(i+1, m_bases); + } + } + + // Determine prolongation/restriction operators in h + gsSparseMatrix transferMatrix; + gsOptionList options; + Base::typeBCHandling == 1 ? options.addInt("DirichletStrategy","",dirichlet::elimination) : options.addInt("DirichletStrategy","",dirichlet::nitsche); + for(int i = 1; i < numLevels; i++) + { + if (hp(i-1,0) == 1) + { + gsMultiBasis m_bases_copy = *m_bases[i]; + m_bases_copy.uniformCoarsen_withTransfer(transferMatrix,*m_bcInfo_ptr,options); + m_prolongation_H[i-1] = transferMatrix; + m_restriction_H[i-1] = m_prolongation_H[i-1].transpose(); + } + } + real_t Time_Transfer = clock.stop(); + GISMO_UNUSED(Time_Transfer); + + // Obtain operators with Galerkin projection (TO DO) + clock.restart(); + if (typeCoarseOperator == 2) + { + for (int i = numLevels-1; i > -1; i--) + { + if (hp(hp.rows()-1,0) == 0) + { + if (hp(std::min(i,hp.rows()-1),0) == 1) + { + m_operator[i] = m_restriction_H[i]*m_operator[i+1]*m_prolongation_H[i]; + } + } + else + { + if (hp(std::min(i,hp.rows()-1),0) == 1 && i > 0) + { + m_operator[i-1] = m_restriction_H[i-1]*m_operator[i]*m_prolongation_H[i-1]; + } + } + } + } + real_t Time_Assembly_Galerkin = clock.stop(); + GISMO_UNUSED(Time_Assembly_Galerkin); + + // Setting up the subspace corrected mass smoother + clock.restart(); + if (Base::typeSmoother == 3) + { + // Generate sequence of SCM smoothers + 
m_SCMS.resize(numLevels); + gsOptionList opt; + opt.addReal("Scaling","",0.12); + for(int i = 0 ; i < numLevels ; i++) + { + m_SCMS[i] = setupSubspaceCorrectedMassSmoother(m_operator[i], *m_bases[i], *m_bcInfo_ptr, opt, Base::typeBCHandling); + } + } + real_t Time_SCMS = clock.stop(); + GISMO_UNUSED(Time_SCMS); + + // Determine ILUT factorizations at each level + clock.restart(); + int numPatch = m_mp_ptr->nPatches(); + + if (Base::typeSmoother == 1) + { + // Generate factorizations (ILUT) + m_ILUT.resize(numLevels); + m_P.resize(numLevels); + m_Pinv.resize(numLevels); + for(int i = 0; i < numLevels; i++) + { + m_ILUT[i].resize(1); + m_P[i].resize(1); + m_Pinv[i].resize(1); + if (Base::typeProjection == 2) + { + Eigen::IncompleteLUT ilu; + ilu.setFillfactor(1); + ilu.compute(m_operator[i]); + m_ILUT[i][0] = ilu.m_lu; + m_P[i][0] = ilu.m_P; + m_Pinv[i][0] = ilu.m_Pinv; + } + else + { + if (i == numLevels-1) // Only at finest level + { + Eigen::IncompleteLUT ilu; + ilu.setFillfactor(1); + ilu.compute(m_operator[i]); + m_ILUT[i][0] = ilu.m_lu; + m_P[i][0] = ilu.m_P; + m_Pinv[i][0] = ilu.m_Pinv; + } + } + } + } + real_t Time_ILUT_Factorization = clock.stop(); + GISMO_UNUSED(Time_ILUT_Factorization); + + clock.restart(); + if (Base::typeSmoother == 5) + { + int shift0 = 0; + m_ddB.resize(numLevels); + m_ddC.resize(numLevels); + m_ddBtilde.resize(numLevels); + m_ddCtilde.resize(numLevels); + + m_ILUT.resize(numLevels); + m_P.resize(numLevels); + m_Pinv.resize(numLevels); + m_shift.resize(numLevels); + m_S.resize(numLevels); + + for(int i = 0 ; i < numLevels ; i++) + { + m_shift[i].resize(numPatch+1); + m_ILUT[i].resize(numPatch+1); + m_P[i].resize(numPatch+1); + m_Pinv[i].resize(numPatch+1); + + // Use of partition functions + std::vector > interior, boundary; + std::vector > > interface; + std::vector > global_interior, global_boundary; + std::vector > > global_interface; + 
//m_bases[i]->partition(interior,boundary,interface,global_interior,global_boundary,global_interface); + for(int l=0; l< numPatch; l++) + { + m_shift[i][l] = global_interior[l].rows(); + } + m_shift[i][numPatch] = 0; + m_shift[i][numPatch] = m_operator[i].rows() - accumulate(m_shift[i].begin(),m_shift[i].end(),0); + + // Put shift on zero + shift0 = 0; + for(int j = 0 ; j < numPatch ; j++) + { + const gsSparseMatrix block = m_operator[i].block(shift0,shift0,m_shift[i][j],m_shift[i][j]); + Eigen::IncompleteLUT ilu; + ilu.setFillfactor(1); + ilu.compute(block); + m_ILUT[i][j] = ilu.m_lu; + + m_P[i][j] = ilu.m_P; + m_Pinv[i][j] = ilu.m_Pinv; + shift0 = shift0 + m_shift[i][j]; + + } + + shift0 = 0; + // Obtain the blocks of the matrix + m_ddB[i].resize(numPatch+1); + m_ddC[i].resize(numPatch+1); + + for(int j = 0 ; j < numPatch+1 ; j++) + { + m_ddB[i][j] = m_operator[i].block(m_operator[i].rows()-m_shift[i][numPatch],shift0,m_shift[i][numPatch],m_shift[i][j]); + m_ddC[i][j] = m_operator[i].block(shift0,m_operator[i].cols()-m_shift[i][numPatch],m_shift[i][j],m_shift[i][numPatch]); + shift0 = shift0 + m_shift[i][j]; + } + shift0 = 0; + } + + m_A_aprox.resize(numLevels); + for(int i = 0 ; i < numLevels ; i++) + { + // Define the A_aprox matrix + m_A_aprox[i] = gsSparseMatrix(m_operator[i].rows(),m_operator[i].cols()); + + // Retrieve a block of each patch + for(int k=0; k< numPatch; k++) + { + m_A_aprox[i].block(shift0,shift0,m_shift[i][k],m_shift[i][k]) = m_ILUT[i][k]; + shift0 = shift0 + m_shift[i][k]; + } + shift0 = 0; + m_ddBtilde[i].resize(numPatch); + m_ddCtilde[i].resize(numPatch); + + for(int j=0 ; j < numPatch ; j ++) + { + m_ddBtilde[i][j] = gsSparseMatrix(m_shift[i][j],m_shift[i][numPatch]); + m_ddCtilde[i][j] = gsSparseMatrix(m_shift[i][j],m_shift[i][numPatch]); + for(int k=0 ; k < m_shift[i][numPatch]; k++) + { + gsMatrix Brhs = m_ddC[i][j].col(k); + gsMatrix Crhs = m_ddC[i][j].col(k); + m_ddBtilde[i][j].col(k) = m_ILUT[i][j].template 
triangularView().transpose().solve(Brhs); + m_ddCtilde[i][j].col(k) = m_ILUT[i][j].template triangularView().solve(Crhs); + } + } + + // Define matrix S + m_S[i] = m_ddC[i][numPatch]; + for(int l = 0 ; l < numPatch ; l++) + { + m_S[i] = m_S[i] - m_ddBtilde[i][l].transpose()*m_ddCtilde[i][l]; + } + + // Fill matrix A_aprox + for(int m = 0 ; m < numPatch ; m++) + { + m_A_aprox[i].block(shift0,m_A_aprox[i].rows() - m_shift[i][numPatch],m_shift[i][m],m_shift[i][numPatch]) = m_ddCtilde[i][m]; + m_A_aprox[i].block(m_A_aprox[i].rows() - m_shift[i][numPatch],shift0,m_shift[i][numPatch],m_shift[i][m]) = m_ddBtilde[i][m].transpose(); + shift0 = shift0 + m_shift[i][m]; + } + shift0 = 0; + + // Perform ILUT on the S-matrix! + Eigen::IncompleteLUT ilu; + ilu.setFillfactor(1); + gsSparseMatrix II = m_S[i]; + ilu.compute(II); + m_A_aprox[i].block(m_A_aprox[i].rows() - m_shift[i][numPatch],m_A_aprox[i].rows() - m_shift[i][numPatch],m_shift[i][numPatch],m_shift[i][numPatch]) = ilu.m_lu; + } + } + + real_t Time_Block_ILUT_Factorization = clock.stop(); + GISMO_UNUSED(Time_Block_ILUT_Factorization); + + // gsInfo << "\n|| Setup Timings || " < & rhs, + gsMatrix& x, + gsMatrix f, + const int& iterTot, + const int& numLevels, + const int& typeMultigrid, + const gsMatrix& hp, + const int& typeCoarseOperator) + { + gsStopwatch clock; + gsMatrix b = f; + + // Determine residual and L2 error + real_t r0 = (m_operator[numLevels-1]*x - b).norm(); + real_t r = r0; + int iter = 1; + + // Solve with p-multigrid method + real_t r_old = r0; + clock.restart(); + // Adjusted stopping criterion!! + while( r/b.norm() > Base::tol && iter < Base::maxIter ) + { + // Call solver from base class + Base::solveMG(b, m_bases, x, numLevels, + *m_bcInfo_ptr, *m_mp_ptr, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + + r = (m_operator[numLevels-1]*x - b).norm(); + if ( r_old < r) + { + gsInfo << "Residual increased during solving!!! 
" <& rhs, + gsMatrix& x, + const int& numLevels) + { + //Direct solver (LU factorization) + CoarseSolver solver; + solver.analyzePattern(m_operator[numLevels-1]); + solver.factorize(m_operator[numLevels-1]); + x = solver.solve(rhs); + } + + /// @brief Construct prolongation operator at level numLevels + virtual gsMatrix prolongation_M(const int& numLevels, + std::vector > > m_bases) + { + // Define the low and high order bases + gsMultiBasis basesL = *m_bases[numLevels-2]; + gsMultiBasis basesH = *m_bases[numLevels-1]; + + // Determine matrix M (high_order * high_order) + typedef gsExprAssembler::geometryMap geometryMap; + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; + gsExprAssembler ex2(1,1); + geometryMap G2 = ex2.getMap(*m_mp_ptr); + space w_n = ex2.getSpace(basesH ,1, 0); + w_n.setInterfaceCont(0); + if (Base::typeBCHandling == 1) + { + w_n.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + //#w_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + } + ex2.setIntegrationElements(basesH); + ex2.initSystem(); + ex2.assemble(w_n * meas(G2) ); + return ex2.rhs(); + } + + /// @brief Construct prolongation operator at level numLevels + virtual gsSparseMatrix prolongation_P(const int& numLevels, + std::vector > > m_bases) + { + // Define the low and high order bases + gsMultiBasis basesL = *m_bases[numLevels-2]; + gsMultiBasis basesH = *m_bases[numLevels-1]; + + // Determine matrix P (high_order * low_order) + typedef gsExprAssembler::geometryMap geometryMap; + gsExprAssembler ex(1,1); + geometryMap G = ex.getMap(*m_mp_ptr); + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; + space v_n = ex.getSpace(basesH ,1, 0); + v_n.setInterfaceCont(0); + space u_n = ex.getTestSpace(v_n , basesL); + u_n.setInterfaceCont(0); + if (Base::typeBCHandling == 1) + { + v_n.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + u_n.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + 
//#v_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + //#u_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + } + ex.setIntegrationElements(basesH); + ex.initSystem(); + ex.assemble(u_n*meas(G) * v_n.tr()); + gsSparseMatrix P = ex.matrix().transpose(); + return P; + } + + /// @brief Construct restriction operator at level numLevels + virtual gsMatrix restriction_M(const int& numLevels, + std::vector > > m_bases) + { + // Define the low and high order bases + gsMultiBasis basesL = *m_bases[numLevels-2]; + gsMultiBasis basesH = *m_bases[numLevels-1]; + + // Determine matrix M (low_order * low_order) + typedef gsExprAssembler::geometryMap geometryMap; + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; + gsExprAssembler ex2(1,1); + geometryMap G2 = ex2.getMap(*m_mp_ptr); + space w_n = ex2.getSpace(basesL ,1, 0); + w_n.setInterfaceCont(0); + if (Base::typeBCHandling == 1) + { + w_n.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + //#w_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + } + ex2.setIntegrationElements(basesL); + ex2.initSystem(); + ex2.assemble(w_n * meas(G2) ); + return ex2.rhs(); + } + + /// @brief Construct restriction operator at level numLevels + virtual gsSparseMatrix restriction_P(const int& numLevels, + std::vector > > m_bases) + { + // Define the low and high order bases + gsMultiBasis basesL = *m_bases[numLevels-2]; + gsMultiBasis basesH = *m_bases[numLevels-1]; + + // Determine matrix P (high_order * low_order) + gsExprAssembler ex(1,1); + typedef gsExprAssembler::geometryMap geometryMap; + geometryMap G = ex.getMap(*m_mp_ptr); + + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; + space v_n = ex.getSpace(basesH ,1, 0); + v_n.setInterfaceCont(0); + space u_n = ex.getTestSpace(v_n , basesL); + u_n.setInterfaceCont(0); + if (Base::typeBCHandling == 1) + { + u_n.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + v_n.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + 
//#u_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + //#v_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + } + ex.setIntegrationElements(basesH); + ex.initSystem(); + ex.assemble(u_n * meas(G)* v_n.tr()); + gsSparseMatrix P = ex.matrix(); + return P; + } + + /// @brief Apply fixed number of presmoothing steps + virtual void presmoothing(const gsMatrix& rhs, + gsMatrix& x, + const int& numLevels, + gsMatrix & fineRes, + const gsMatrix& hp) + { + //gsInfo << "Residual before presmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels < e; + gsMatrix d = rhs-m_operator[numLevels-1]*x; + e = m_Pinv[numLevels-1][0]*d; + e = m_ILUT[numLevels-1][0].template triangularView().solve(e); + e = m_ILUT[numLevels-1][0].template triangularView().solve(e); + e = m_P[numLevels-1][0]*e; + x = x + e; + } + } + if (Base::typeSmoother == 2) + { + internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs); + } + if (Base::typeSmoother == 3) + { + m_SCMS[numLevels-1]->step(rhs,x); + } + if (Base::typeSmoother == 5) + { + if (hp(numLevels-2,0) == 1 && hp(hp.rows()-1,0) == 0) + { + internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs); + } + else + { + gsMatrix e; + gsMatrix d = rhs-m_operator[numLevels-1]*x; + e = m_A_aprox[numLevels-1].template triangularView().solve(d); + e = m_A_aprox[numLevels-1].template triangularView().solve(e); + x = x + e; + } + } + } + // gsInfo << "Residual after presmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels <& rhs, + gsMatrix& x, + const int& numLevels, + gsMatrix & fineCorr, + gsMatrix & postRes, + const gsMatrix& hp) + { + real_t alpha = 1; + x = x - alpha*fineCorr; + //gsInfo << "Residual before postsmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels < e; + gsMatrix d = rhs-m_operator[numLevels-1]*x; + e = m_Pinv[numLevels-1][0]*d; + e = m_ILUT[numLevels-1][0].template triangularView().solve(e); + e = m_ILUT[numLevels-1][0].template triangularView().solve(e); + e = 
m_P[numLevels-1][0]*e; + x = x + e; + } + } + if (Base::typeSmoother == 2) + { + internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs); + } + if (Base::typeSmoother == 3) + { + m_SCMS[numLevels-1]->step(rhs,x); + } + if (Base::typeSmoother == 5) + { + if (hp(numLevels-2,0) == 1 && hp(hp.rows()-1,0) == 0) + { + internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs); + } + else + { + gsMatrix e; + gsMatrix d = rhs-m_operator[numLevels-1]*x; + e = m_A_aprox[numLevels-1].template triangularView().solve(d); + e = m_A_aprox[numLevels-1].template triangularView().solve(e); + x = x + e; + } + } + postRes = rhs - m_operator[numLevels-1]*x; + // gsInfo << "Residual after postsmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels <::Ptr setupSubspaceCorrectedMassSmoother(const gsSparseMatrix<>& matrix, const gsMultiBasis<>& mb, const gsBoundaryConditions<>& bc, const gsOptionList& opt, const int &typeBCHandling) +{ + const short_t dim = mb.topology().dim(); + + // Setup dof mapper + gsDofMapper dm; + mb.getMapper( + typeBCHandling == 1 ? 
(dirichlet::strategy)opt.askInt("DirichletStrategy",11) : (dirichlet::strategy)opt.askInt("DirichletStrategy",14), + (iFace ::strategy)opt.askInt("InterfaceStrategy", 1), + bc, + dm, + 0 + ); + const index_t nTotalDofs = dm.freeSize(); + + // Decompose the whole domain into components + std::vector< std::vector > components = mb.topology().allComponents(true); + const index_t nr_components = components.size(); + + // Setup Dirichlet boundary conditions + gsBoundaryConditions<> dir_bc; + for( index_t ps=0; ps < 2*dim; ++ps ) + dir_bc.addCondition( 0, 1+ps, condition_type::dirichlet, NULL ); + + // Setup transfer matrices and local preconditioners + std::vector< gsSparseMatrix > transfers; + transfers.reserve(nr_components); + std::vector< gsLinearOperator<>::Ptr > ops; + ops.reserve(nr_components); + + for (index_t i=0; i indices; + std::vector::uPtr> bases = mb.componentBasis_withIndices(components[i],dm,indices,true); + index_t sz = indices.rows(); + gsSparseEntries<> se; + se.reserve(sz); + for (index_t i=0; i transfer(nTotalDofs,sz); + transfer.setFrom(se); + if (sz>0) + { + if (bases[0]->dim() == dim) + { + GISMO_ASSERT ( bases.size() == 1, "Only one basis is expected for each patch." 
); + ops.push_back( + gsPatchPreconditionersCreator<>::subspaceCorrectedMassSmootherOp( + *(bases[0]), + dir_bc, + gsOptionList(), + opt.getReal("Scaling") + ) + ); + } + else + { + gsSparseMatrix<> mat = transfer.transpose() * matrix * transfer; + ops.push_back( makeSparseCholeskySolver(mat) ); + } + transfers.push_back(give(transfer)); + } + } + return gsPreconditionerFromOp<>::make(makeMatrixOp(matrix), gsAdditiveOp<>::make(transfers, ops)); +} + + +} // namespace gismo diff --git a/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp new file mode 100644 index 0000000000..bb60bc8900 --- /dev/null +++ b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp @@ -0,0 +1,594 @@ +/** @file xbraid_example.cpp + + @brief XBraid integration + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): A. Mantzaflaris, M. 
Moeller +*/ + +#include +#include +#include "gsXBraidMultigrid.h" + +using namespace gismo; + +#ifdef GISMO_WITH_XBRAID + +namespace gismo { + + enum class gsXBraid_typeMethod + { + FE_FE = 0, // forward Euler (all grids) + BE_BE = 1, // backward Euler (all grids) + CN_CN = 2, // Crank-Nicolson (all grids) + FE_BE = 3, // forward Euler (fine grid), backward Euler (coarser grids) + CN_BE = 4 // Crank-Nicolson (fine grid), backward Euler (coarser grids) + }; + +/** + \brief Derived class implementing the XBraid wrapper for the heat equation +*/ +template +class gsXBraid_app : public gsXBraid< gsMatrix > +{ +private: + // Spatial discretisation parameters + index_t numRefine, numElevate, numIncrease; + + // Temporal discretisation parameters + index_t numSteps, typeMethod; + T tstart, tstop, tstep; + + // Spatial discretizations + gsMultiPatch mp; + gsMultiBasis basesH, basesL; + + // Boundary conditions + gsBoundaryConditions bc; + + // Assembler options + gsOptionList Aopt, Sopt, Topt; + + // Expression assembler + gsExprAssembler K, M; + gsFunctionExpr f, u0, ms; + + // Solution + gsMatrix sol; + + // Multigrid solver + typedef gsXBraidMultigrid::LU > solver_mg; + std::vector< solver_mg* > m_solver; + + typedef typename gsExprAssembler::geometryMap geometryMap; + typedef typename gsExprAssembler::variable variable; + typedef typename gsExprAssembler::space space; + typedef typename gsExprAssembler::solution solution; + + public: + /// Constructor: loads the problem data from file fn, assembles the system (K) and mass (M) matrices and creates one multigrid solver per MGRIT level + gsXBraid_app(const gsMpiComm& comm, + const T& tstart, + const T& tstop, + index_t typeMethod, + index_t numSteps, + index_t numRefine, + index_t numElevate, + index_t numIncrease, + std::string& fn) + : gsXBraid< gsMatrix >::gsXBraid(comm, tstart, tstop, (int)numSteps), + numRefine(numRefine), + numElevate(numElevate), + numIncrease(numIncrease), + numSteps(numSteps), + typeMethod(typeMethod), + tstart(tstart), + tstop(tstop), + tstep( (tstop-tstart)/numSteps ), + K(1,1), M(1,1) + { + 
///////////////////////////////////////////////////////////////////////////////////////////// + // Code for heat equation starts here // + ///////////////////////////////////////////////////////////////////////////////////////////// + + gsFileData fd(fn); + if (this->id() == 0) gsInfo << "Loaded file " << fd.lastPath() << "\n"; + + fd.getId(0, mp); // id=0: Multipatch domain + basesH = gsMultiBasis(mp); + basesL = gsMultiBasis(mp); + + fd.getId(1, f); // id=1: right-hand side function + if (this->id() == 0) gsInfo << "Source function " << f << "\n"; + + fd.getId(2, bc); // id=2: boundary conditions + if (this->id() == 0) gsInfo << "Boundary conditions:\n" << bc << "\n"; + + fd.getId(3, u0); // id=3: initial conditions + if (this->id() == 0) gsInfo << "Initial conditions:\n" << u0 << "\n"; + + fd.getId(4, ms); // id=4: manufactured solution + if (this->id() == 0) gsInfo << "Manufactured solution:\n" << ms << "\n"; + + fd.getId(5, Aopt); // id=5: assembler options + if (this->id() == 0) gsInfo << "Assembler options:\n" << Aopt << "\n"; + K.setOptions(Aopt); + M.setOptions(Aopt); + + fd.getId(6, Topt); // id=6: multigrid-in-time options + if (this->id() == 0) gsInfo << "Multigrid-in-time options:\n" << Topt << "\n"; + + this->SetCFactor(Topt.getInt("CFactor")); + this->SetMaxIter(Topt.getInt("maxIter")); + this->SetMaxLevels(Topt.getInt("maxLevel")); + this->SetMaxRefinements(Topt.getInt("numMaxRef")); + this->SetMinCoarse(Topt.getInt("minCLevel")); + this->SetNFMG(Topt.getInt("numFMG")); + this->SetNFMGVcyc(Topt.getInt("numFMGVcyc")); + this->SetNRelax(Topt.getInt("numRelax")); + this->SetAccessLevel(Topt.getInt("access")); + this->SetPrintLevel(Topt.getInt("print")); + this->SetStorage(Topt.getInt("numStorage")); + this->SetTemporalNorm(Topt.getInt("norm")); + + if (Topt.getSwitch("fmg")) this->SetFMG(); + if (Topt.getSwitch("incrMaxLevels")) this->SetIncrMaxLevels(); + if (Topt.getSwitch("periodic")) this->SetPeriodic(1); else this->SetPeriodic(0); + if 
(Topt.getSwitch("refine")) this->SetRefine(1); else this->SetRefine(0); + if (Topt.getSwitch("sequential")) this->SetSeqSoln(1); else this->SetSeqSoln(0); + if (Topt.getSwitch("skip")) this->SetSkip(1); else this->SetSkip(0); + if (Topt.getSwitch("spatial")) this->SetSpatialCoarsenAndRefine(); + if (Topt.getSwitch("tol")) this->SetAbsTol(Topt.getReal("absTol")); + else this->SetRelTol(Topt.getReal("relTol")); + + fd.getId(7, Sopt); // id=7: spatial solver options + if (this->id() == 0) gsInfo << "Spatial solver options:\n" << Sopt << "\n"; + + std::string typeCoarsening = Sopt.getString("coarseStrategy"); + gsMatrix<> hp = gsMatrix<>::Zero(Sopt.getInt("numLevels")-1,1); + + // Counters for the 'h', 'p' and remaining entries of the coarsening strategy string + real_t numRefH = 0; + real_t numRefP = 0; + real_t numRefZ = 0; + + // Convert input string to array + for( int i = 0; i < Sopt.getInt("numLevels")-1 ; ++i) + { + if( typeCoarsening[i] == 'h') + { + hp(i,0) = 1; + numRefH = numRefH + 1; + } + else if( typeCoarsening[i] == 'p') + { + hp(i,0) = 0; + numRefP = numRefP + 1; + } + else + { + hp(i,0) = 2; + numRefZ = numRefZ + 1; + } + } + + // Apply refinement in p for coarse level + if((numRefP + numRefZ) == numIncrease ) + { + basesL.degreeReduce(1); + } + else + { + basesL.degreeIncrease(numIncrease-numRefP-numRefZ-1); + } + + // Apply refinement in h for coarse and fine level + for (int i = 0; i < numRefine - numRefH - numRefZ; ++i) + { + basesL.uniformRefine(); + } + for (int i = 0; i < numRefine ; ++i) + { + basesH.uniformRefine(); + } + + // Apply refinement in p for fine level + basesH.degreeIncrease(numIncrease-1); + + // Set the bases + K.setIntegrationElements(basesH); + M.setIntegrationElements(basesH); + + // Set the geometry map + geometryMap G_K = K.getMap(mp); + geometryMap G_M = M.getMap(mp); + + // Set the discretization space + space u_K = K.getSpace(basesH); + space u_M = M.getSpace(basesH); + u_K.setInterfaceCont(0); + u_M.setInterfaceCont(0); + + bc.setGeoMap(mp); + u_K.setup(bc, 
dirichlet::l2Projection, 0); + u_M.setup(bc, dirichlet::l2Projection, 0); + //#u_K.addBc( bc.get("Dirichlet") ); + //#u_M.addBc( bc.get("Dirichlet") ); + + // Set the source term + auto ff_K = K.getCoeff(f, G_K); + auto ff_M = M.getCoeff(f, G_M); + + // Initialize and assemble the system matrix + K.initSystem(); + K.assemble( igrad(u_K, G_K) * igrad(u_K, G_K).tr() * meas(G_K), u_K * ff_K * meas(G_K) ); + + // Initialize and assemble the mass matrix + M.initSystem(); + M.assemble( u_M * u_M.tr() * meas(G_M), u_M * ff_M * meas(G_M) ); + + // Enforce Neumann conditions to right-hand side + variable g_Neumann = K.getBdrFunction(); + K.assembleBdr(bc.get("Neumann"), u_K * g_Neumann.val() * nv(G_K).norm() ); + //#K.assembleRhsBc(u_K * g_Neumann.val() * nv(G_K).norm(), bc.neumannSides() ); + + // Determine MGRIT levels a priori + int numMGRITLevels = 1; + int StepsLevel = numSteps; + for(int i = 1 ; i < 10000; i++){ + StepsLevel = StepsLevel/Topt.getInt("CFactor"); + if(StepsLevel < Topt.getInt("minCLevel")) + break; + numMGRITLevels = numMGRITLevels + 1; + } + + m_solver.resize(numMGRITLevels); + real_t tstep_level = tstep; + for(int i = 0 ; i < numMGRITLevels ; i++) + { + m_solver[i] = new solver_mg(mp, basesL, bc); + m_solver[i]->setMaxIter(Sopt.getInt("maxIter")); + m_solver[i]->setTolerance(Sopt.getReal("tol")); + m_solver[i]->setNumLevels(Sopt.getInt("numLevels"),Sopt.getInt("projection"),numIncrease); + m_solver[i]->setNumSmoothing(Sopt.getInt("numSmoothing")); + m_solver[i]->setTypeBCHandling(Sopt.getInt("bcHandling")); + m_solver[i]->setTypeCycle_h(Sopt.getInt("cycle_h")); + m_solver[i]->setTypeCycle_p(Sopt.getInt("cycle_p")); + m_solver[i]->setTypeLumping(Sopt.getInt("lumping")); + m_solver[i]->setTypeProjection(Sopt.getInt("projection")); + m_solver[i]->setTypeSmoother(Sopt.getInt("smoother")); + m_solver[i]->setCoarsening(hp); + if(typeMethod > 2 && i == 0) + { + m_solver[i]->compute(M.matrix(),tstep_level,numIncrease,typeMethod); + } + else + { + // Apply 
Backward Euler at coarser levels (FE_BE and CN_BE) + m_solver[i]->compute(M.matrix(),tstep_level,numIncrease,1); + } + tstep_level = tstep_level*Topt.getInt("CFactor"); + } + + + if (this->id() == 0) { + + gsStopwatch clock; + clock.restart(); + + sol.setZero(M.numDofs(),1); + + switch((gsXBraid_typeMethod)typeMethod) { + case gsXBraid_typeMethod::FE_FE: + case gsXBraid_typeMethod::FE_BE: + // Forward Euler method + + for ( int i = 1; i<=numSteps; ++i) // for all timesteps + // Compute the system for the timestep i (rhs is assumed constant wrt time) + sol = m_solver[0]->solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*K.matrix())*sol, + sol); + break; + + case gsXBraid_typeMethod::BE_BE: + // Backward Euler method + + for ( int i = 1; i<=numSteps; ++i) // for all timesteps + // Compute the system for the timestep i (rhs is assumed constant wrt time) + sol = m_solver[0]->solveWithGuess(tstep*K.rhs() + (M.matrix())*sol, sol); + break; + + case gsXBraid_typeMethod::CN_CN: + case gsXBraid_typeMethod::CN_BE: + // Crank-Nicolson method + for ( int i = 1; i<=numSteps; ++i) // for all timesteps + // Compute the system for the timestep i (rhs is assumed constant wrt time) + sol = m_solver[0]->solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*0.5*K.matrix())*sol, + sol); + break; + + default: + throw std::runtime_error("Unsupported time-stepping method"); + } + + gsInfo << "wall time = " << clock.stop() << "\n" + << "L2 norm of the solution = " << sol.norm() << "\n"; + // gsExprEvaluator ev(M); + // solution u_sol = M.getSolution(u_M, sol); + // variable u_ex = ev.getVariable(ms, G_M); + // T l2err = math::sqrt( ev.integral( (u_ex - u_sol).sqNorm() * meas(G_M) ) ); + // T h1err = l2err + + // math::sqrt(ev.integral( ( igrad(u_ex) - grad(u_sol)*jac(G_M).inv() ).sqNorm() * meas(G_M) )); + + // gsInfo << "L2 error of the solution = " << l2err << "\n" + // << "H1 error of the solution = " << h1err << std::flush; + } + + } + + /// Destructor (NOTE(review): the solver_mg objects allocated with new in the constructor are not deleted here) + virtual ~gsXBraid_app() + { + 
} + + /// Creates instance from command line argument + static inline gsXBraid_app create(const gsMpiComm& comm, + int argc, + char** argv) + { + // Problem parameters + std::string fn(XBRAID_DATA_DIR"pde/heat2d_square_ibvp1.xml"); + + // Spatial discretisation parameters + index_t numRefine = 2; + index_t numElevate = 0; + index_t numIncrease = 0; + + // Temporal discretisation parameters + index_t numSteps = 40; + index_t typeMethod = (index_t)gsXBraid_typeMethod::BE_BE; + T tfinal = 0.1; + + gsCmdLine cmd("Tutorial on solving a Heat equation problem using parallel-in-time multigrid."); + + // Problem parameters + cmd.addString( "f", "file", "Input XML file", fn ); + + // Spatial discretisation parameters + cmd.addInt( "e", "degreeElevation", + "Number of degree elevation steps to perform before solving (0: equalize degree in all directions)", numElevate ); + cmd.addInt( "i", "degreeIncrease", + "Number of degree increase steps to perform before solving (0: equalize degree in all directions)", numIncrease ); + cmd.addInt( "r", "uniformRefine", "Number of uniform h-refinement steps to perform before solving", numRefine ); + + // Temporal discretisation parameters + cmd.addInt( "n", "numSteps", "Number of parallel-in-time steps", numSteps ); + cmd.addInt( "T", "typeMethod", "Time-stepping scheme", typeMethod); + cmd.addReal( "t", "tfinal", "Final time", tfinal ); + + cmd.getValues(argc,argv); + + // Create instance + gsXBraid_app app(comm, 0.0, tfinal, typeMethod, numSteps, numRefine, numElevate, numIncrease, fn); + + return app; + } + + /// Initializes a vector + braid_Int Init(braid_Real t, + braid_Vector *u_ptr) +#if __cplusplus >= 201103L || _MSC_VER >= 1600 + override +#endif + { + gsMatrix* u = new gsMatrix(M.numDofs(), 1); + + if (t != tstart) { + // Intermediate solution + u->setZero(M.numDofs()); + } else { + // Initial solution (also set to zero) + u->setZero(M.numDofs()); + } + + *u_ptr = (braid_Vector) u; + return braid_Int(0); + } + + /// Performs a single step of the 
parallel-in-time multigrid + braid_Int Step(braid_Vector u, + braid_Vector ustop, + braid_Vector fstop, + BraidStepStatus &status) +#if __cplusplus >= 201103L || _MSC_VER >= 1600 + override +#endif + { + gsMatrix* u_ptr = (gsMatrix*) u; + gsMatrix* ustop_ptr = (gsMatrix*) ustop; + + // XBraid forcing + if (fstop != NULL) { + gsMatrix* fstop_ptr = (gsMatrix*) fstop; + *u_ptr += *fstop_ptr; + } + + // Get time step information + std::pair time = + static_cast(status).timeInterval(); + T tstep(time.second - time.first); + + switch((gsXBraid_typeMethod)typeMethod) { + case gsXBraid_typeMethod::FE_FE: + // Forward Euler method (all grids) + *u_ptr = m_solver[static_cast(status).level()]->solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*K.matrix())*(*u_ptr), + *ustop_ptr); + break; + + case gsXBraid_typeMethod::FE_BE: + if (static_cast(status).level() == 0) { + // Forward Euler method (fine grid) + *u_ptr = m_solver[static_cast(status).level()]->solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*K.matrix())*(*u_ptr), + *ustop_ptr); + } else { + // Backward Euler method (coarse grids) + *u_ptr = m_solver[static_cast(status).level()]->solveWithGuess(tstep*K.rhs() + + (M.matrix())*(*u_ptr), + *ustop_ptr); + } + break; + + case gsXBraid_typeMethod::BE_BE: { + // Backward Euler method (all grids) + *u_ptr = m_solver[static_cast(status).level()]->solveWithGuess(tstep*K.rhs() + (M.matrix())*(*u_ptr), *ustop_ptr); + } break; + + case gsXBraid_typeMethod::CN_CN: + // Crank-Nicolson method (all grids) + *u_ptr = m_solver[static_cast(status).level()]->solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*0.5*K.matrix())*(*u_ptr), + *ustop_ptr); + break; + + case gsXBraid_typeMethod::CN_BE: + if (static_cast(status).level() == 0) { + *u_ptr = m_solver[static_cast(status).level()]->solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*0.5*K.matrix())*(*u_ptr), + *ustop_ptr); + } else { + // Backward Euler method (coarse grids) + *u_ptr = 
m_solver[static_cast(status).level()]->solveWithGuess(tstep*K.rhs() + + (M.matrix())*(*u_ptr), + *ustop_ptr); + } + break; + + default: + throw std::runtime_error("Unsupported time-stepping method"); + } + + // Carry out adaptive refinement in time + if (static_cast(status).level() == 0) { + braid_Real error = static_cast(status).error(); + if (error != braid_Real(-1.0)) { + braid_Int rfactor = (braid_Int) std::ceil( std::sqrt( error / 1e-3) ); + status.SetRFactor(rfactor); + } else + status.SetRFactor(1); + } + + return braid_Int(0); + } + + /// Sets the size of the MPI communication buffer + braid_Int BufSize(braid_Int *size_ptr, + BraidBufferStatus &status) +#if __cplusplus >= 201103L || _MSC_VER >= 1600 + override +#endif + { + *size_ptr = sizeof(T)*(M.numDofs()+2); + return braid_Int(0); + } + + /// Handles access for input/output + braid_Int Access(braid_Vector u, + BraidAccessStatus &status) +#if __cplusplus >= 201103L || _MSC_VER >= 1600 + override +#endif + { + if (static_cast(status).done() && + static_cast(status).timeIndex() == + static_cast(status).times()) { + gsMatrix* u_ptr = (gsMatrix*) u; + gsInfo << "norm of the solution = " << u_ptr->norm() << std::endl; + } + return braid_Int(0); + } + + /// Performs spatial coarsening (currently copies the fine vector unchanged) + braid_Int Coarsen(braid_Vector fu, + braid_Vector *cu_ptr, + BraidCoarsenRefStatus &status) +#if __cplusplus >= 201103L || _MSC_VER >= 1600 + override +#endif + { + // gsInfo << "Coarsen on level = " + // << static_cast(status).level() + // << " of " + // << static_cast(status).levels() + // << "\n"; + gsMatrix *fu_ptr = (gsMatrix*) fu; + gsMatrix* cu = new gsMatrix(); + *cu = *fu_ptr; + *cu_ptr = (braid_Vector) cu; + return braid_Int(0); + } + + /// Performs spatial refinement (currently copies the coarse vector unchanged) + braid_Int Refine(braid_Vector cu, + braid_Vector *fu_ptr, + BraidCoarsenRefStatus &status) +#if __cplusplus >= 201103L || _MSC_VER >= 1600 + override +#endif + { + // gsInfo << "Refine on level = " + // << static_cast(status).level() + // << " of " + // 
<< static_cast(status).levels() + // << "\n"; + gsMatrix *cu_ptr = (gsMatrix*) cu; + gsMatrix* fu = new gsMatrix(); + *fu = *cu_ptr; + *fu_ptr = (braid_Vector) fu; + return braid_Int(0); + } +}; + +} // ending namespace gismo + +#endif + +int main(int argc, char**argv) +{ +#ifdef GISMO_WITH_XBRAID + + // Initialize the MPI environment and obtain the world communicator + gsMpiComm comm = gsMpi::init(argc, argv).worldComm(); + + // Print MPI/OpenMP configuration + if (comm.rank() == 0) + { + gsInfo << "Number of MPI processes : " << comm.size() << std::endl; +#ifdef _OPENMP + gsInfo << "Number of OpenMP processes : " << omp_get_num_procs() << std::endl; +#endif + } + + // Set up app structure + gsXBraid_app app = gsXBraid_app::create(comm, argc, argv); + + // Perform parallel-in-time multigrid + app.solve(); + +#else + + gsInfo << "\n"; + +#endif + + return 0; + +} diff --git a/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml new file mode 100644 index 0000000000..f1de030b22 --- /dev/null +++ b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml @@ -0,0 +1,130 @@ + + + + + + 100 100 + + 100 1 + 100 2 + 100 3 + 100 4 + + + + + 1 + + + + 0 + 1 + + + + 0 1 + 0 2 + 0 3 + + + + + 0 4 + + + + + 0 + + + 0 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0.00000 0.00000 1.00000 1.00000 + + + 0.00000 0.00000 1.00000 1.00000 + + + 0 0 1 0 0 1 1 1 + + + + + diff --git a/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp2.xml b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp2.xml new file mode 100644 index 0000000000..4fd10a4898 --- /dev/null +++ b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp2.xml @@ -0,0 +1,108 @@ + + + + + + 100 100 + + 100 1 + 100 2 + 100 3 + 100 4 + + + + + 1 + + + + 0 + 1 + + + + 0 1 0 2 0 3 + + + + + 0 4 + + + + + 0 + + + 0 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + 0.00000 0.00000 1.00000 1.00000 + + + 0.00000 0.00000 1.00000 1.00000 + + + 0 0 1 0 0 1 1 1 + + + diff --git a/extensions/gsXBraid/gsXBraid.h b/extensions/gsXBraid/gsXBraid.h new file mode 100644 index 0000000000..b3d9c3f236 --- /dev/null +++ b/extensions/gsXBraid/gsXBraid.h @@ -0,0 +1,902 @@ +/** @file gsXBraid.h + + @brief Provides declarations of the XBraid wrapper + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#pragma once + +#include + +#if !defined(GISMO_WITH_MPI) +#define braid_SEQUENTIAL 1 +#endif + +#include + +namespace gismo { + + class gsXBraidAccessStatus; + class gsXBraidSyncStatus; + class gsXBraidStepStatus; + class gsXBraidCoarsenRefStatus; + class gsXBraidBufferStatus; + class gsXBraidObjectiveStatus; + + /** + \brief Class defining the XBraid wrapper + + The gsXBraid class wraps the BraidApp class provided by the + XBraid project and adds a set of commodity functions. + + In order to implement an XBraid application the user has to + implement a derived class + + \code{.cpp} + template + class gsXBraid_app : public gsXBraid + { ... }; + \endcode + + and implement the following application-specific functions: + + \code{.cpp} + braid_Int Access(...) + braid_Int BufPack(...) + braid_Int BufSize(...) + braid_Int BufUnpack(...) + braid_Int Clone(...) + braid_Int Free(...) + braid_Int Init(...) + braid_Int Residual(...) + braid_Int SpatialNorm(...) + braid_Int Step(...) + \endcode + + which are declared as (pure) virtual functions in BraidApp. + + The generic implementation of the gsXBraid class leaves all of + these methods unimplemented. 
We also provide specializations for + gsXBraid> and gsXBraid> which assume that + the data type for storing the solution (passed as braid_Vector) + is of type gsMatrix and gsVector, respectively. + */ + + template + class gsXBraid : public BraidApp + { + public: + /// Constructor + gsXBraid(const gsMpiComm& comm, + const braid_Real tstart, + const braid_Real tstop, + braid_Int ntime); + + /// Destructor + virtual ~gsXBraid(); + + /// Frees the given vector (dummy method) + virtual braid_Int Free(braid_Vector) { return braid_Int(0); } + + /// Computes the residual (dummy method) + virtual braid_Int Residual(braid_Vector, braid_Vector, BraidStepStatus&) + { GISMO_NO_IMPLEMENTATION } + + /// Runs the parallel-in-time multigrid solver + void solve() { core.Drive(); } + + public: + /// Sets the maximum number of multigrid levels. + void SetMaxLevels(braid_Int max_levels) { core.SetMaxLevels(max_levels); } + + /// Increases the max number of multigrid levels after performing a refinement. + void SetIncrMaxLevels() { core.SetIncrMaxLevels(); } + + /// Sets whether to skip all work on the first down cycle (skip = 1). On by default. + void SetSkip(braid_Int skip) { core.SetSkip(skip); } + + /// Sets the minimum allowed coarse grid size. gsXBraid stops + /// coarsening whenever creating the next coarser grid will result + /// in a grid smaller than min_coarse. The maximum possible coarse + /// grid size will be min_coarse*coarsening_factor. + void SetMinCoarse(braid_Int min_coarse) { core.SetMinCoarse(min_coarse); } + + /// Sets the number of relaxation sweeps *nrelax* on grid + /// *level*. Level 0 is the finest grid. One sweep is a CF + /// relaxation sweep. + void SetNRelax(braid_Int level, braid_Int nrelax) { core.SetNRelax(level, nrelax); } + + /// Sets the number of relaxation sweeps *nrelax* on all grid + /// levels. One sweep is a CF relaxation sweep. + void SetNRelax(braid_Int nrelax) { core.SetNRelax(-1, nrelax); } + + /// Sets absolute stopping tolerance. 
+ void SetAbsTol(braid_Real tol) { core.SetAbsTol(tol); } + + /// Sets relative stopping tolerance. + void SetRelTol(braid_Real tol) { core.SetRelTol(tol); } + + /// Sets the temporal norm: 1-norm (1), 2-norm (2:default), inf-norm (3) + void SetTemporalNorm(braid_Int tnorm) { core.SetTemporalNorm(tnorm); } + + /// Sets the coarsening factor *cfactor* on grid *level* (default is 2) + void SetCFactor(braid_Int level, braid_Int cfactor) { core.SetCFactor(level, cfactor); } + + /// Sets the coarsening factor *cfactor* on all grid levels + void SetCFactor(braid_Int cfactor) { core.SetCFactor(-1, cfactor); } + + /// Sets periodic time grid (default is 0) + void SetPeriodic(braid_Int periodic) { core.SetPeriodic(periodic); } + + /// Sets max number of multigrid iterations. + void SetMaxIter(braid_Int max_iter) { core.SetMaxIter(max_iter); } + + /// Sets the print level for runtime print message. + /// - Level 0: no output + /// - Level 1: print runtime information like the residual history + /// - Level 2: level 1 output, plus post-Braid run statistics (default) + /// - Level 3: level 2 output, plus debug level output. + void SetPrintLevel(braid_Int print_level) { core.SetPrintLevel(print_level); } + + /// Sets the output file for runtime print message. + void SetPrintFile(const char *printfile_name) { core.SetPrintFile(printfile_name); } + + /// Sets the initial guess to gsXBraid as the sequential time stepping solution. + /// - 0: The user's Init() function initializes the state vector (default) + /// - 1: Sequential time stepping, with the user's initial condition from + /// Init(t=0) initializes the state vector + void SetSeqSoln(braid_Int use_seq_soln) { core.SetSeqSoln(use_seq_soln); } + + /// Sets the acces level for gsXBraid. This controls how often the + /// user's access routine is called. 
+ /// - Level 0: Never call the user's access routine + /// - Level 1: Only call the user's access routine after gsXBraid is finished (default) + /// - Level 2: Call the user's access routine every iteration and on every level. + /// This is during _braid_FRestrict, during the down-cycle part of a + /// gsXBraid iteration. + void SetAccessLevel(braid_Int access_level) { core.SetAccessLevel(access_level); } + + /// Sets FMG (F-cycle) + void SetFMG() { core.SetFMG(); } + + /// Sets the number of initial F-cycles to do before switching to V-cycles + void SetNFMG(braid_Int k) { core.SetNFMG(k); } + + /// Sets the number of V-cycles to do at each FMG level (default is 1) + void SetNFMGVcyc(braid_Int nfmg_Vcyc) { core.SetNFMGVcyc(nfmg_Vcyc); } + + /// Sets the storage properties of the code. + /// -1 : Default, store only C-points + /// 0 : Full storage of C- and F-Points on all levels + /// x > 0 : Full storage on all levels >= x + void SetStorage(braid_Int storage) { core.SetStorage(storage); } + + /// Turns time refinement on (refine = 1) or off (refine = 0). + void SetRefine(braid_Int refine) {core.SetRefine(refine);} + + /// Sets the max number of time grid refinement levels allowed. + void SetMaxRefinements(braid_Int max_refinements) {core.SetMaxRefinements(max_refinements);} + + /// Turns on built-in Richardson-based error estimation and/or + /// extrapolation with gsXBraid. When enabled, the Richardson + /// extrapolation (RE) option (richardson == 1) is used to improve + /// the accuracy of the solution at the C-points on the finest + /// level. When the built-in error estimate option is turned on + /// (est_error == 1), RE is used to estimate the local truncation + /// error at each point. These estimates can be accessed through + /// StepStatus and AccessStatus functions. The last parameter is + /// local_order, which represents the LOCAL order of the* time + /// integration scheme. e.g. local_order = 2 for Backward Euler. 
+ /// Also, the Richardson error estimate is only available after + /// roughly 1 Braid iteration. The estimate is given a dummy value + /// of -1.0, until an actual estimate is available. Thus after an + /// adaptive refinement, and a new hierarchy is formed, another + /// iteration must pass before the error estimates are available + /// again. + void SetRichardsonEstimation(braid_Int est_error, braid_Int richardson, braid_Int local_order) + { core.SetRichardsonEstimation(est_error, richardson, local_order); } + + public: + /// Sets user-defined residual routine. + void SetResidual() { core.SetResidual(); } + + /// Sets user-defined coarsening and refinement routine. + void SetSpatialCoarsenAndRefine() { core.SetSpatialCoarsenAndRefine(); } + + /// Sets user-defined sync routine. + void SetSync() { core.SetSync(); } + + /// Sets the default print file + void SetDefaultPrintFile() { core.SetDefaultPrintFile(); } + + /// Sets the file input/output level + void SetFileIOLevel(braid_Int io_level) { core.SetFileIOLevel(io_level); } + + /// Sets the C-relaxation weight + void SetCRelaxWt(braid_Int level, braid_Real Cwt) { core.SetCRelaxWt(level, Cwt); } + + /// Sets the time cutoff + void SetTPointsCutoff(braid_Int tpoints_cutoff) { core.SetTPointsCutoff(tpoints_cutoff); } + + /// Sets callback function for residual numer calculation + void SetFullRNormRes(braid_PtFcnResidual residual) { core.SetFullRNormRes(residual); } + + /// Sets callback function for time grid + void SetTimeGrid(braid_PtFcnTimeGrid tgrid) { core.SetTimeGrid(tgrid); } + + public: + /// Gets the number of iterations (XBraid style) + void GetNumIter(braid_Int *niter_ptr) { core.GetNumIter(niter_ptr); } + + /// Gets the residual norm (XBraid style) + void GetRNorms(braid_Int *nrequest_ptr, braid_Real *rnorms) { core.GetRNorms(nrequest_ptr, rnorms); } + + /// Gets the total number of levels (XBraid style) + void GetNLevels(braid_Int *nlevels_ptr) { core.GetNLevels(nlevels_ptr); } + + /// Gets the MPI 
process ID + void GetMyID(braid_Int *myid_ptr) { core.GetMyID(myid_ptr); } + + /// Returns the number of iterations + braid_Int iterations() { + braid_Int niter; + GetNumIter(&niter); + return niter; + } + + /// Returns the residual norm + braid_Real norm(braid_Int nrequest) { + braid_Real rnorm; + GetRNorms(&nrequest, &rnorm); + return rnorm; + } + + /// Returns the total number of levels + braid_Int levels() { + braid_Int nlevels; + GetNLevels(&nlevels); + return nlevels; + } + + /// Returns the MPI process ID + braid_Int id() { + braid_Int myid; + GetMyID(&myid); + return myid; + } + + protected: + /// Braid Core object + BraidCore core; + }; + + + /** + \brief Specializations for gsXBraid> + */ + template + class gsXBraid< gsMatrix > : public gsXBraid + { + public: + /// Constructor + gsXBraid(const gsMpiComm& comm, + const braid_Real tstart, + const braid_Real tstop, + braid_Int ntime); + + /// Destructor + virtual ~gsXBraid(); + + /// Clones the given vector + virtual braid_Int Clone(braid_Vector u, + braid_Vector *v_ptr) + { + gsMatrix* u_ptr = (gsMatrix*) u; + gsMatrix* v = new gsMatrix(); + *v = *u_ptr; + *v_ptr = (braid_Vector) v; + return braid_Int(0); + } + + /// Frees the given vector + virtual braid_Int Free(braid_Vector u) + { + gsMatrix* u_ptr = (gsMatrix*) u; + delete u_ptr; + return braid_Int(0); + } + + /// Computes the sum of two given vectors + virtual braid_Int Sum(braid_Real alpha, + braid_Vector x, + braid_Real beta, + braid_Vector y) + { + gsMatrix* x_ptr = (gsMatrix*) x; + gsMatrix* y_ptr = (gsMatrix*) y; + *y_ptr = (T)alpha * (*x_ptr) + (T)beta * (*y_ptr); + return braid_Int(0); + } + + /// Computes the spatial norm of the given vector + virtual braid_Int SpatialNorm(braid_Vector u, + braid_Real *norm_ptr) + { + gsMatrix *u_ptr = (gsMatrix*) u; + *norm_ptr = u_ptr->norm(); + return braid_Int(0); + } + + /// Packs the given vector into the MPI communication buffer + virtual braid_Int BufPack(braid_Vector u, + void *buffer, + 
BraidBufferStatus &status) + { + gsMatrix *u_ptr = (gsMatrix*) u; + T* buffer_ptr = (T*) buffer; + T* data_ptr = u_ptr->data(); + index_t size = u_ptr->rows()*u_ptr->cols(); + + buffer_ptr[0] = u_ptr->rows(); + buffer_ptr[1] = u_ptr->cols(); + for (index_t idx = 0; idx < size; ++idx) + buffer_ptr[idx+2] = data_ptr[idx]; + + status.SetSize(sizeof(T)*(size+2)); + return braid_Int(0); + } + + /// Unpacks a vector from the MPI communication buffer + virtual braid_Int BufUnpack(void *buffer, + braid_Vector *u_ptr, + BraidBufferStatus &status) + { + T* buffer_ptr = (T*) buffer; + index_t rows = buffer_ptr[0]; + index_t cols = buffer_ptr[1]; + gsMatrix* u = new gsMatrix(rows,cols); + T* data_ptr = u->data(); + + for (index_t idx = 0; idx < rows*cols; ++idx) + data_ptr[idx] = buffer_ptr[idx+2]; + + *u_ptr = (braid_Vector) u; + return braid_Int(0); + } + }; + + /** + \brief Specializations for gsXBraid> + */ + template + class gsXBraid< gsVector > : public gsXBraid + { + public: + /// Constructor + gsXBraid(const gsMpiComm& comm, + const braid_Real tstart, + const braid_Real tstop, + braid_Int ntime); + + /// Destructor + virtual ~gsXBraid(); + + /// Clones the given vector + virtual braid_Int Clone(braid_Vector u, + braid_Vector *v_ptr) + { + gsVector* u_ptr = (gsVector*) u; + gsVector* v = new gsVector(); + *v = *u_ptr; + *v_ptr = (braid_Vector) v; + return braid_Int(0); + } + + /// Frees the given vector + virtual braid_Int Free(braid_Vector u) + { + gsVector* u_ptr = (gsVector*) u; + delete u_ptr; + return braid_Int(0); + } + + /// Computes the sum of two given vectors + virtual braid_Int Sum(braid_Real alpha, + braid_Vector x, + braid_Real beta, + braid_Vector y) + { + gsVector* x_ptr = (gsVector*) x; + gsVector* y_ptr = (gsVector*) y; + *y_ptr = (T)alpha * (*x_ptr) + (T)beta * (*y_ptr); + return braid_Int(0); + } + + /// Computes the spatial norm of the given vector + virtual braid_Int SpatialNorm(braid_Vector u, + braid_Real *norm_ptr) + { + gsVector *u_ptr = 
(gsVector*) u; + *norm_ptr = u_ptr->norm(); + return braid_Int(0); + } + + /// Packs the given vector into the MPI communication buffer + virtual braid_Int BufPack(braid_Vector u, + void *buffer, + BraidBufferStatus &status) + { + gsVector *u_ptr = (gsVector*) u; + T* buffer_ptr = (T*) buffer; + T* data_ptr = u_ptr->data(); + index_t size = u_ptr->size(); + + buffer_ptr[0] = u_ptr->size(); + for (index_t idx = 0; idx < size; ++idx) + buffer_ptr[idx+1] = data_ptr[idx]; + + status.SetSize(sizeof(T)*(size+1)); + return braid_Int(0); + } + + /// Unpacks a vector from the MPI communication buffer + virtual braid_Int BufUnpack(void *buffer, + braid_Vector *u_ptr, + BraidBufferStatus &status) + { + T* buffer_ptr = (T*) buffer; + index_t size = buffer_ptr[0]; + gsVector* u = new gsVector(size); + T* data_ptr = u->data(); + + for (index_t idx = 0; idx < size; ++idx) + data_ptr[idx] = buffer_ptr[idx+1]; + + *u_ptr = (braid_Vector) u; + return braid_Int(0); + } + }; + + /** + \brief Class defining the XBraid access status wrapper + + The wrapper provides all functionality of the BraidAccessStatus + class plus some functions that return the information by value + */ + class gsXBraidAccessStatus : public BraidAccessStatus + { + public: + /// Returns the number of iterations + braid_Int iterations() { + braid_Int iter; + GetIter(&iter); + return iter; + } + + /// Returns the current multigrid level + braid_Int level() { + braid_Int level; + GetLevel(&level); + return level; + } + + /// Returns the total number of multigrid levels + braid_Int levels() { + braid_Int nlevels; + GetNLevels(&nlevels); + return nlevels; + } + + /// Returns the total number of refinements + braid_Int refines() { + braid_Int nref; + GetNRefine(&nref); + return nref; + } + + /// Returns the current time instance + braid_Real time() { + braid_Real t; + GetT(&t); + return t; + } + + /// Returns the total number of time instances + braid_Int times() { + braid_Int ntpoints; + GetNTPoints(&ntpoints); + return 
ntpoints; + } + + /// Returns true if XBraid has completed + bool done() { + braid_Int status; + GetDone(&status); + return bool(status); + } + + /// ??? + braid_Int callingFunction() { + braid_Int callingfcn; + GetCallingFunction(&callingfcn); + return callingfcn; + } + + /// Returns the index of the time instance + braid_Int timeIndex() { + braid_Int tindex; + GetTIndex(&tindex); + return tindex; + } + + /// ??? + braid_Int test() { + braid_Int wtest; + GetWrapperTest(&wtest); + return wtest; + } + + /// Returns the residual norm + braid_Real norm() { + braid_Real rnorm; + GetResidual(&rnorm); + return rnorm; + } + + /// Returns the estimated error + braid_Real error() { + braid_Real errorest; + GetSingleErrorEstAccess(&errorest); + return errorest; + } + }; + + /** + \brief Class defining the XBraid sync status wrapper + + The wrapper provides all functionality of the BraidSyncStatus + class plus some functions that return the information by value + */ + class gsXBraidSyncStatus : public BraidSyncStatus + { + public: + /// Returns the number of iterations + braid_Int iterations() { + braid_Int iter; + GetIter(&iter); + return iter; + } + + /// Returns the current multigrid level + braid_Int level() { + braid_Int level; + GetLevel(&level); + return level; + } + + /// Returns the total number of multigrid levels + braid_Int levels() { + braid_Int nlevels; + GetNLevels(&nlevels); + return nlevels; + } + + /// Returns the total number of refinements + braid_Int refines() { + braid_Int nref; + GetNRefine(&nref); + return nref; + } + + /// Returns the total number of time instances + braid_Int times() { + braid_Int ntpoints; + GetNTPoints(&ntpoints); + return ntpoints; + } + + /// Returns true if XBraid is completed + bool done() { + braid_Int status; + GetDone(&status); + return bool(status); + } + + /// ??? 
+ braid_Int callingFunction() { + braid_Int callingfcn; + GetCallingFunction(&callingfcn); + return callingfcn; + } + + /// Returns the estimated errors (NOTE(review): GetAllErrorEst receives the address of a single braid_Real; presumably it may write nerrors() values - confirm against the XBraid API) + braid_Real errors() { + braid_Real errorest; + GetAllErrorEst(&errorest); + return errorest; + } + + /// Returns the number of estimated errors + braid_Int nerrors() { + braid_Int numerrorest; + GetNumErrorEst(&numerrorest); + return numerrorest; + } + }; + + /** + \brief Class defining the XBraid step status wrapper + + The wrapper provides all functionality of the BraidStepStatus + class plus some functions that return the information by value + */ + class gsXBraidStepStatus : public BraidStepStatus + { + public: + /// Returns the number of iterations + braid_Int iterations() { + braid_Int iter; + GetIter(&iter); + return iter; + } + + /// Returns the current multigrid level + braid_Int level() { + braid_Int level; + GetLevel(&level); + return level; + } + + /// Returns the total number of multigrid levels + braid_Int levels() { + braid_Int nlevels; + GetNLevels(&nlevels); + return nlevels; + } + + /// Returns the total number of refinements + braid_Int refines() { + braid_Int nref; + GetNRefine(&nref); + return nref; + } + + /// Returns the current time instance + braid_Real time() { + braid_Real t; + GetT(&t); + return t; + } + + /// Returns the total number of time instances + braid_Int times() { + braid_Int ntpoints; + GetNTPoints(&ntpoints); + return ntpoints; + } + + /// Returns the end of the time interval + braid_Real timeStop() { + braid_Real t; + GetTstop(&t); + return t; + } + + /// Returns the time interval + std::pair timeInterval() { + std::pair t; + GetTstartTstop(&t.first, &t.second); + return t; + } + + /// Returns the index of the time instance + braid_Int timeIndex() { + braid_Int tindex; + GetTIndex(&tindex); + return tindex; + } + + /// Returns the tolerance + braid_Real tol() { + braid_Real t; + GetTol(&t); + return t; + } + + /// Returns the old tolerance for the fine-grid solver +
braid_Real tolFine() { + braid_Real t; + GetOldFineTolx(&t); + return t; + } + + /// Returns the estimated error + braid_Real error() { + braid_Real errorest; + GetSingleErrorEstStep(&errorest); + return errorest; + } + + /// Returns the spatial accuracy + braid_Real accuracy(braid_Real loose_tol, braid_Real tight_tol) { + braid_Real tol; + GetSpatialAccuracy(loose_tol, tight_tol, &tol); + return tol; + } + }; + + /** + \brief Class defining the XBraid coarsen and refinement status wrapper + + The wrapper provides all functionality of the BraidCoarsenRefStatus + class plus some functions that return the information by value + */ + class gsXBraidCoarsenRefStatus : public BraidCoarsenRefStatus + { + public: + /// Returns the number of iterations + braid_Int iterations() { + braid_Int iter; + GetIter(&iter); + return iter; + } + + /// Returns the current multigrid level + braid_Int level() { + braid_Int level; + GetLevel(&level); + return level; + } + + /// Returns the total number of multigrid levels + braid_Int levels() { + braid_Int nlevels; + GetNLevels(&nlevels); + return nlevels; + } + + /// Returns the total number of refinements + braid_Int refines() { + braid_Int nref; + GetNRefine(&nref); + return nref; + } + + /// Returns the current time instance + braid_Real time() { + braid_Real t; + GetT(&t); + return t; + } + + /// Returns the total number of time instances + braid_Int times() { + braid_Int ntpoints; + GetNTPoints(&ntpoints); + return ntpoints; + } + + /// Returns the index of the time instance + braid_Int timeIndex() { + braid_Int tindex; + GetTIndex(&tindex); + return tindex; + } + + /// Returns the end of the fine time interval + braid_Real ftimeStop() { + braid_Real t; + GetFTstop(&t); + return t; + } + + /// Returns the start of the fine time interval + braid_Real ftimeStart() { + braid_Real t; + GetFTprior(&t); + return t; + } + + /// Returns the end of the coarse time interval + braid_Real ctimeStop() { + braid_Real t; + GetCTstop(&t); + return 
t; + } + + /// Returns the start of the coarse time interval + braid_Real ctimeStart() { + braid_Real t; + GetCTprior(&t); + return t; + } + }; + + /** + \brief Class defining the XBraid buffer status wrapper + + The wrapper provides all functionality of the BraidBufferStatus + class plus some functions that return the information by value + */ + class gsXBraidBufferStatus : public BraidBufferStatus + { + public: + /// Returns the message type + braid_Int type() { + braid_Int msg; + GetMessageType(&msg); + return msg; + } + }; + + /** + \brief Class defining the XBraid step objective wrapper + + The wrapper provides all functionality of the BraidObjectiveStatus + class plus some functions that return the information by value + */ + class gsXBraidObjectiveStatus : public BraidObjectiveStatus + { + public: + /// Returns the number of iterations + braid_Int iterations() { + braid_Int iter; + GetIter(&iter); + return iter; + } + + /// Returns the current multigrid level + braid_Int level() { + braid_Int level; + GetLevel(&level); + return level; + } + + /// Returns the total number of multigrid levels + braid_Int levels() { + braid_Int nlevels; + GetNLevels(&nlevels); + return nlevels; + } + + /// Returns the total number of refinements + braid_Int refines() { + braid_Int nref; + GetNRefine(&nref); + return nref; + } + + /// Returns the current time instance + braid_Real time() { + braid_Real t; + GetT(&t); + return t; + } + + /// Returns the total number of time instances + braid_Int times() { + braid_Int ntpoints; + GetNTPoints(&ntpoints); + return ntpoints; + } + + /// Returns the index of the time instance + braid_Int timeIndex() { + braid_Int tindex; + GetTIndex(&tindex); + return tindex; + } + }; + +}// namespace gismo + +#ifndef GISMO_BUILD_LIB +#include GISMO_HPP_HEADER(gsXBraid.hpp) +#endif diff --git a/extensions/gsXBraid/gsXBraid.hpp b/extensions/gsXBraid/gsXBraid.hpp new file mode 100644 index 0000000000..8e806b5f6f --- /dev/null +++ 
b/extensions/gsXBraid/gsXBraid.hpp @@ -0,0 +1,63 @@ +/** @file gsXBraid.hpp + + @brief Provides implementations of the XBraid wrapper. + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#pragma once + +#include + +namespace gismo { + + // Constructor + template + gsXBraid::gsXBraid(const gsMpiComm& comm, + const braid_Real tstart, + const braid_Real tstop, + braid_Int ntime) + : BraidApp(static_cast(comm), tstart, tstop, ntime), + core(static_cast(comm), this) + {} + + // Destructor + template + gsXBraid::~gsXBraid() + {} + + // Constructor + template + gsXBraid< gsMatrix >::gsXBraid(const gsMpiComm& comm, + const braid_Real tstart, + const braid_Real tstop, + braid_Int ntime) + : gsXBraid(comm, tstart, tstop, ntime) + {} + + // Destructor + template + gsXBraid< gsMatrix >::~gsXBraid() + {} + + // Constructor + template + gsXBraid< gsVector >::gsXBraid(const gsMpiComm& comm, + const braid_Real tstart, + const braid_Real tstop, + braid_Int ntime) + : gsXBraid(comm, tstart, tstop, ntime) + {} + + // Destructor + template + gsXBraid< gsVector >::~gsXBraid() + {} + +}// namespace gismo diff --git a/extensions/gsXBraid/gsXBraid_.cpp b/extensions/gsXBraid/gsXBraid_.cpp new file mode 100644 index 0000000000..97c0d37495 --- /dev/null +++ b/extensions/gsXBraid/gsXBraid_.cpp @@ -0,0 +1,13 @@ + +#include +#include +#include + +namespace gismo +{ + + CLASS_TEMPLATE_INST gsXBraid; + CLASS_TEMPLATE_INST gsXBraid< gsMatrix >; + CLASS_TEMPLATE_INST gsXBraid< gsVector >; + +} diff --git a/external/Eigen/src/Core/arch/NEON/Complex.h b/external/Eigen/src/Core/arch/NEON/Complex.h index f40af7f87f..3d575590b8 100644 --- a/external/Eigen/src/Core/arch/NEON/Complex.h +++ b/external/Eigen/src/Core/arch/NEON/Complex.h @@ -18,7 +18,7 @@ namespace internal { inline 
uint32x4_t p4ui_CONJ_XOR() { // See bug 1325, clang fails to call vld1q_u64. -#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML +#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || EIGEN_COMP_CRAY || defined(__CLANG_FUJITSU) uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; return ret; #else @@ -390,7 +390,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) { #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG // See bug 1325, clang fails to call vld1q_u64. -#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML +#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || EIGEN_COMP_CRAY || defined(__CLANG_FUJITSU) static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000}; #else const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 }; diff --git a/external/Eigen/src/Core/util/DisableStupidWarnings.h b/external/Eigen/src/Core/util/DisableStupidWarnings.h index fe0cfec0bc..0bf08bd8b5 100755 --- a/external/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/external/Eigen/src/Core/util/DisableStupidWarnings.h @@ -53,7 +53,7 @@ #pragma clang diagnostic ignored "-Wc11-extensions" #endif -#elif defined __GNUC__ +#elif defined __GNUC__ && !defined __FUJITSU #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) #pragma GCC diagnostic push diff --git a/external/Eigen/src/Core/util/Macros.h b/external/Eigen/src/Core/util/Macros.h index 986c3d44db..b4c358ce94 100644 --- a/external/Eigen/src/Core/util/Macros.h +++ b/external/Eigen/src/Core/util/Macros.h @@ -193,9 +193,23 @@ #define EIGEN_COMP_EMSCRIPTEN 0 #endif +/// \internal EIGEN_COMP_FCC set to FCC version if the compiler is Fujitsu Compiler +#if defined(__FUJITSU) || defined(__CLANG_FUJITSU) + #define EIGEN_COMP_FCC (__FCC_major__*100+__FCC_minor__*10+__FCC_patchlevel__) +#else + #define EIGEN_COMP_FCC 0 +#endif + +/// \internal EIGEN_COMP_CRAY set to Cray version if the compiler is Cray Compiler +#if defined(_CRAYC) + #define EIGEN_COMP_CRAY
(_RELEASE_MAJOR*100+_RELEASE_MINOR*10+_RELEASE_PATCHLEVEL) +#else + #define EIGEN_COMP_CRAY 0 +#endif + /// \internal EIGEN_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.) -#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN) +#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN || EIGEN_COMP_FCC || EIGEN_COMP_CRAY) #define EIGEN_COMP_GNUC_STRICT 1 #else #define EIGEN_COMP_GNUC_STRICT 0 @@ -932,7 +946,7 @@ #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE #endif -#if EIGEN_COMP_GNUC +#if EIGEN_COMP_GNUC && !EIGEN_COMP_CRAY /* EIGEN_COMP_CRAY is always defined (0 or a version), so test its value rather than defined() */ #define EIGEN_DONT_INLINE __attribute__((noinline)) #elif EIGEN_COMP_MSVC #define EIGEN_DONT_INLINE __declspec(noinline) diff --git a/external/Eigen/src/Core/util/ReenableStupidWarnings.h b/external/Eigen/src/Core/util/ReenableStupidWarnings.h index 1ce6fd1b00..8e02ba4866 100644 --- a/external/Eigen/src/Core/util/ReenableStupidWarnings.h +++ b/external/Eigen/src/Core/util/ReenableStupidWarnings.h @@ -12,7 +12,7 @@ #pragma warning pop #elif defined __clang__ #pragma clang diagnostic pop - #elif defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) + #elif defined __GNUC__ && !defined __FUJITSU && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) #pragma GCC diagnostic pop #endif diff --git a/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h index a5e4383df7..155b3f5900 100644 --- a/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +++ b/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h @@ -197,14 +197,20 @@ class IncompleteLUT : public SparseSolverBase m_P; // Fill-reducing permutation PermutationMatrix m_Pinv; // Inverse permutation }; diff --git
a/src/gismo.h b/src/gismo.h index 359ceb76f0..c7ef564934 100644 --- a/src/gismo.h +++ b/src/gismo.h @@ -76,6 +76,8 @@ namespace internal #include +#include + // #include // included by gsForwardDeclarations -> gsMemory // Tensors @@ -183,6 +185,7 @@ namespace internal /* ----------- IO ----------- */ #include +#include #include #include #include @@ -192,8 +195,8 @@ namespace internal #include #include -/* ----------- MPI ----------- */ -#include +/* ----------- Parallel ----------- */ +#include /* ----------- Utilities ----------- */ //#include - in gsForwardDeclarations.h diff --git a/src/gsAssembler/gsPatchRule.hpp b/src/gsAssembler/gsPatchRule.hpp index 7da94ffdbd..6037f91f7a 100644 --- a/src/gsAssembler/gsPatchRule.hpp +++ b/src/gsAssembler/gsPatchRule.hpp @@ -37,7 +37,7 @@ gsPatchRule::gsPatchRule(const gsBasis & basis, // Initialize some stuff m_dim = m_basis->dim(); - GISMO_ASSERT( m_fixDir < short_t(m_dim) && m_fixDir>-2, "Invalid input fixDir = "<-2, "Invalid input fixDir = "<::gsPatchRule(const gsBasis & basis, for (size_t d = 0; d != m_dim; d++) { m_end = m_basis->support().col(1); - if (short_t(d)==m_fixDir && m_fixDir!=-1) + if ((short_t)(d)==m_fixDir && m_fixDir!=-1) { m_nodes[d].resize(2); m_nodes[d]<<0,1; diff --git a/src/gsCore/gsConfig.h.in b/src/gsCore/gsConfig.h.in index a8be30fbe7..2dce25cc1b 100644 --- a/src/gsCore/gsConfig.h.in +++ b/src/gsCore/gsConfig.h.in @@ -29,10 +29,16 @@ #endif /** Define default index type. */ -#define index_t @GISMO_INDEX_TYPE@ +#define GISMO_INDEX_TYPE @GISMO_INDEX_TYPE@ +#ifndef index_t +#define index_t GISMO_INDEX_TYPE +#endif /** Define default dimension type. */ -#define short_t @GISMO_INDEX_TYPE@ //short +#define GISMO_SHORT_TYPE @GISMO_SHORT_TYPE@ +#ifndef short_t +#define short_t GISMO_SHORT_TYPE +#endif /** Define the file data directory. 
*/ #define GISMO_DATA_DIR "@GISMO_DATA_DIR@" @@ -61,6 +67,7 @@ #cmakedefine GISMO_WITH_TRILINOS #cmakedefine GISMO_WITH_UMFPACK #cmakedefine GISMO_WITH_UNUM +#cmakedefine GISMO_WITH_XBRAID /* Only include new types here that can be set as real_t */ diff --git a/src/gsCore/gsDebug.h b/src/gsCore/gsDebug.h index 723fa95896..3089541a12 100644 --- a/src/gsCore/gsDebug.h +++ b/src/gsCore/gsDebug.h @@ -234,10 +234,11 @@ static const int gismo_set_abort_behavior = _set_abort_behavior( #pragma clang diagnostic ignored "-Wconstant-logical-operand" #pragma clang diagnostic ignored "-Wbind-to-temporary-copy" -#elif defined __GNUC__ // major version >=4 +#elif defined __GNUC__ && !defined __FUJITSU // major version >=4 // typedef locally defined but not used [-Wunused-local-typedefs] #if ( __GNUC__>4 || (__GNUC__==4 && __GNUC_MINOR__>7) ) #pragma GCC diagnostic ignored "-Wunused-local-typedefs" +#pragma GCC diagnostic ignored "-Wclass-memaccess" #endif #if (__cplusplus < 201703L && __GNUC__>6) diff --git a/src/gsCore/gsForwardDeclarations.h b/src/gsCore/gsForwardDeclarations.h index b0636b6ebd..09f31cac65 100644 --- a/src/gsCore/gsForwardDeclarations.h +++ b/src/gsCore/gsForwardDeclarations.h @@ -15,6 +15,7 @@ // STD includes #include +#include #include #include #include diff --git a/src/gsCore/gsJITCompiler.h b/src/gsCore/gsJITCompiler.h index af7f18ccae..ca2371dcb9 100644 --- a/src/gsCore/gsJITCompiler.h +++ b/src/gsCore/gsJITCompiler.h @@ -16,8 +16,9 @@ #pragma once -#include +#include #include +#include #if defined(_WIN32) #include @@ -27,6 +28,8 @@ #include +#include + namespace gismo { /** diff --git a/src/gsCore/gsMath.h b/src/gsCore/gsMath.h index d4dad58c0c..0a106e3957 100644 --- a/src/gsCore/gsMath.h +++ b/src/gsCore/gsMath.h @@ -429,6 +429,16 @@ bool almostEqual(const T a, const T b) // static const double _2_pi = 0.636619772367581343076; // static const double _180_pi = 57.295779513082320876798; +// Maximum over three or more arguments +template +typename 
std::common_type::type max(const T a, const T b, const T c, const Ts... args) +{ return math::max(a, math::max(b,c,args...)); } + +// Minimum over three or more arguments +template +typename std::common_type::type min(const T a, const T b, const T c, const Ts... args) +{ return math::min(a, math::min(b,c,args...)); } + } //end namespace math /** diff --git a/src/gsCore/gsSysInfo.cpp b/src/gsCore/gsSysInfo.cpp new file mode 100644 index 0000000000..8c075778a8 --- /dev/null +++ b/src/gsCore/gsSysInfo.cpp @@ -0,0 +1,715 @@ +/** @file gsSysInfo.cpp + + @brief Provides implementation of system information. + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#include +#include + +#include + +#if defined(_WIN32) || defined(_WIN64) +# include +#elif __APPLE__ +# include +# include +#elif __linux__ || __unix__ +# include +# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__SUNCC_PRO)) +# include +# else +# include +# endif +#endif + +namespace gismo +{ + + std::string gsSysInfo::getGismoVersion() + { + return util::to_string(GISMO_VERSION); + } + + std::string gsSysInfo::getEigenVersion() + { + return util::to_string(EIGEN_WORLD_VERSION)+"." + + util::to_string(EIGEN_MAJOR_VERSION)+"."
+ + util::to_string(EIGEN_MINOR_VERSION); + } + + std::string gsSysInfo::getCompilerVersion() + { + // This code is copied from the CMakeCXXCompilerId.cpp file that was + // automatically generated with CMake 3.21.4 + + // The following two macros have been modified as we do not want to + // return the compiler version in the specific CMake format +#define DEC(n) n +#define HEX(n) n + + /* Version number components: V=Version, R=Revision, P=Patch + Version date components: YYYY=Year, MM=Month, DD=Day */ + +#if defined(__COMO__) +# define COMPILER_ID "Comeau" + /* __COMO_VERSION__ = VRR */ +# define COMPILER_VERSION_MAJOR DEC(__COMO_VERSION__ / 100) +# define COMPILER_VERSION_MINOR DEC(__COMO_VERSION__ % 100) + +#elif defined(__INTEL_COMPILER) || defined(__ICC) +# define COMPILER_ID "Intel" +# if defined(_MSC_VER) +# define SIMULATE_ID "MSVC" +# endif +# if defined(__GNUC__) +# define SIMULATE_ID "GNU" +# endif + /* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later, + except that a few beta releases use the old format with V=2021. */ +# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111 +# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100) +# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10) +# if defined(__INTEL_COMPILER_UPDATE) +# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE) +# else +# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER % 10) +# endif +# else +# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER) +# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE) + /* The third version component from --version is an update index, + but no macro is provided for it. 
*/ +# define COMPILER_VERSION_PATCH DEC(0) +# endif +# if defined(__INTEL_COMPILER_BUILD_DATE) + /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */ +# define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE) +# endif +# if defined(_MSC_VER) + /* _MSC_VER = VVRR */ +# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) +# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) +# endif +# if defined(__GNUC__) +# define SIMULATE_VERSION_MAJOR DEC(__GNUC__) +# elif defined(__GNUG__) +# define SIMULATE_VERSION_MAJOR DEC(__GNUG__) +# endif +# if defined(__GNUC_MINOR__) +# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__) +# endif +# if defined(__GNUC_PATCHLEVEL__) +# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__) +# endif + +#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER) +# define COMPILER_ID "IntelLLVM" +#if defined(_MSC_VER) +# define SIMULATE_ID "MSVC" +#endif +#if defined(__GNUC__) +# define SIMULATE_ID "GNU" +#endif + /* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and + * later. Look for 6 digit vs. 8 digit version number to decide encoding. + * VVVV is no smaller than the current year when a version is released. 
+ */ +#if __INTEL_LLVM_COMPILER < 1000000L +# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100) +# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10) +# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 10) +#else +# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000) +# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100) +# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 100) +#endif +#if defined(_MSC_VER) + /* _MSC_VER = VVRR */ +# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) +# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) +#endif +#if defined(__GNUC__) +# define SIMULATE_VERSION_MAJOR DEC(__GNUC__) +#elif defined(__GNUG__) +# define SIMULATE_VERSION_MAJOR DEC(__GNUG__) +#endif +#if defined(__GNUC_MINOR__) +# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__) +#endif +#if defined(__GNUC_PATCHLEVEL__) +# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__) +#endif + +#elif defined(__PATHCC__) +# define COMPILER_ID "PathScale" +# define COMPILER_VERSION_MAJOR DEC(__PATHCC__) +# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__) +# if defined(__PATHCC_PATCHLEVEL__) +# define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__) +# endif + +#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__) +# define COMPILER_ID "Embarcadero" +# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF) +# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF) +# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__ & 0xFFFF) + +#elif defined(__BORLANDC__) +# define COMPILER_ID "Borland" + /* __BORLANDC__ = 0xVRR */ +# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8) +# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF) + +#elif defined(__WATCOMC__) && __WATCOMC__ < 1200 +# define COMPILER_ID "Watcom" + /* __WATCOMC__ = VVRR */ +# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100) +# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10) +# if (__WATCOMC__ % 
10) > 0 +# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10) +# endif + +#elif defined(__WATCOMC__) +# define COMPILER_ID "OpenWatcom" + /* __WATCOMC__ = VVRP + 1100 */ +# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100) +# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10) +# if (__WATCOMC__ % 10) > 0 +# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10) +# endif + +#elif defined(__SUNPRO_CC) +# define COMPILER_ID "SunPro" +# if __SUNPRO_CC >= 0x5100 + /* __SUNPRO_CC = 0xVRRP */ +# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>12) +# define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xFF) +# define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC & 0xF) +# else + /* __SUNPRO_CC = 0xVRP */ +# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>8) +# define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xF) +# define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC & 0xF) +# endif + +#elif defined(__HP_aCC) +# define COMPILER_ID "HP" + /* __HP_aCC = VVRRPP */ +# define COMPILER_VERSION_MAJOR DEC(__HP_aCC/10000) +# define COMPILER_VERSION_MINOR DEC(__HP_aCC/100 % 100) +# define COMPILER_VERSION_PATCH DEC(__HP_aCC % 100) + +#elif defined(__DECCXX) +# define COMPILER_ID "Compaq" + /* __DECCXX_VER = VVRRTPPPP */ +# define COMPILER_VERSION_MAJOR DEC(__DECCXX_VER/10000000) +# define COMPILER_VERSION_MINOR DEC(__DECCXX_VER/100000 % 100) +# define COMPILER_VERSION_PATCH DEC(__DECCXX_VER % 10000) + +#elif defined(__IBMCPP__) && defined(__COMPILER_VER__) +# define COMPILER_ID "zOS" + /* __IBMCPP__ = VRP */ +# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100) +# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10) +# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10) + +#elif defined(__ibmxl__) && defined(__clang__) +# define COMPILER_ID "XLClang" +# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__) +# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__) +# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__) +# define COMPILER_VERSION_TWEAK 
DEC(__ibmxl_ptf_fix_level__) + + +#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ >= 800 +# define COMPILER_ID "XL" + /* __IBMCPP__ = VRP */ +# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100) +# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10) +# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10) + +#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ < 800 +# define COMPILER_ID "VisualAge" + /* __IBMCPP__ = VRP */ +# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100) +# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10) +# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10) + +#elif defined(__NVCOMPILER) +# define COMPILER_ID "NVHPC" +# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__) +# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__) +# if defined(__NVCOMPILER_PATCHLEVEL__) +# define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__) +# endif + +#elif defined(__PGI) +# define COMPILER_ID "PGI" +# define COMPILER_VERSION_MAJOR DEC(__PGIC__) +# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__) +# if defined(__PGIC_PATCHLEVEL__) +# define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__) +# endif + +#elif defined(_CRAYC) +# define COMPILER_ID "Cray" +# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR) +# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR) + +#elif defined(__TI_COMPILER_VERSION__) +# define COMPILER_ID "TI" + /* __TI_COMPILER_VERSION__ = VVVRRRPPP */ +# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000) +# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000 % 1000) +# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__ % 1000) + +#elif defined(__CLANG_FUJITSU) +# define COMPILER_ID "FujitsuClang" +# define COMPILER_VERSION_MAJOR DEC(__FCC_major__) +# define COMPILER_VERSION_MINOR DEC(__FCC_minor__) +# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__) +# define COMPILER_VERSION_INTERNAL_STR __clang_version__ + + +#elif defined(__FUJITSU) +# define 
COMPILER_ID "Fujitsu" +# if defined(__FCC_version__) +# define COMPILER_VERSION __FCC_version__ +# elif defined(__FCC_major__) +# define COMPILER_VERSION_MAJOR DEC(__FCC_major__) +# define COMPILER_VERSION_MINOR DEC(__FCC_minor__) +# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__) +# endif +# if defined(__fcc_version) +# define COMPILER_VERSION_INTERNAL DEC(__fcc_version) +# elif defined(__FCC_VERSION) +# define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION) +# endif + + +#elif defined(__ghs__) +# define COMPILER_ID "GHS" + /* __GHS_VERSION_NUMBER = VVVVRP */ +# ifdef __GHS_VERSION_NUMBER +# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100) +# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10) +# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER % 10) +# endif + +#elif defined(__SCO_VERSION__) +# define COMPILER_ID "SCO" + +#elif defined(__ARMCC_VERSION) && !defined(__clang__) +# define COMPILER_ID "ARMCC" +#if __ARMCC_VERSION >= 1000000 + /* __ARMCC_VERSION = VRRPPPP */ +# define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000) +# define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100) +# define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000) +#else + /* __ARMCC_VERSION = VRPPPP */ +# define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000) +# define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10) +# define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000) +#endif + + +#elif defined(__clang__) && defined(__apple_build_version__) +# define COMPILER_ID "AppleClang" +# if defined(_MSC_VER) +# define SIMULATE_ID "MSVC" +# endif +# define COMPILER_VERSION_MAJOR DEC(__clang_major__) +# define COMPILER_VERSION_MINOR DEC(__clang_minor__) +# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__) +# if defined(_MSC_VER) + /* _MSC_VER = VVRR */ +# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) +# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) +# endif +# define COMPILER_VERSION_TWEAK 
DEC(__apple_build_version__) + +#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION) +# define COMPILER_ID "ARMClang" +# define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000) +# define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100) +# define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION % 10000) +# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION) + +#elif defined(__clang__) +# define COMPILER_ID "Clang" +# if defined(_MSC_VER) +# define SIMULATE_ID "MSVC" +# endif +# define COMPILER_VERSION_MAJOR DEC(__clang_major__) +# define COMPILER_VERSION_MINOR DEC(__clang_minor__) +# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__) +# if defined(_MSC_VER) + /* _MSC_VER = VVRR */ +# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) +# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) +# endif + +#elif defined(__GNUC__) || defined(__GNUG__) +# define COMPILER_ID "GNU" +# if defined(__GNUC__) +# define COMPILER_VERSION_MAJOR DEC(__GNUC__) +# else +# define COMPILER_VERSION_MAJOR DEC(__GNUG__) +# endif +# if defined(__GNUC_MINOR__) +# define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__) +# endif +# if defined(__GNUC_PATCHLEVEL__) +# define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__) +# endif + +#elif defined(_MSC_VER) +# define COMPILER_ID "MSVC" + /* _MSC_VER = VVRR */ +# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100) +# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100) +# if defined(_MSC_FULL_VER) +# if _MSC_VER >= 1400 + /* _MSC_FULL_VER = VVRRPPPPP */ +# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000) +# else + /* _MSC_FULL_VER = VVRRPPPP */ +# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000) +# endif +# endif +# if defined(_MSC_BUILD) +# define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD) +# endif + +#elif defined(__VISUALDSPVERSION__) || defined(__ADSPBLACKFIN__) || defined(__ADSPTS__) || defined(__ADSP21000__) +# define COMPILER_ID "ADSP" +#if defined(__VISUALDSPVERSION__) + /* __VISUALDSPVERSION__ = 
0xVVRRPP00 */ +# define COMPILER_VERSION_MAJOR HEX(__VISUALDSPVERSION__>>24) +# define COMPILER_VERSION_MINOR HEX(__VISUALDSPVERSION__>>16 & 0xFF) +# define COMPILER_VERSION_PATCH HEX(__VISUALDSPVERSION__>>8 & 0xFF) +#endif + +#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC) +# define COMPILER_ID "IAR" +# if defined(__VER__) && defined(__ICCARM__) +# define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000) +# define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000) +# define COMPILER_VERSION_PATCH DEC((__VER__) % 1000) +# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__) +# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__)) +# define COMPILER_VERSION_MAJOR DEC((__VER__) / 100) +# define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100)) +# define COMPILER_VERSION_PATCH DEC(__SUBVERSION__) +# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__) +# endif + + + /* These compilers are either not known or too old to define an + identification macro. Try to identify the platform and guess that + it is the native compiler. 
*/ +#elif defined(__hpux) || defined(__hpua) +# define COMPILER_ID "HP" + +#else /* unknown compiler */ +# define COMPILER_ID "Unknown-Compiler" +#endif + + return util::to_string(COMPILER_ID) +#ifdef COMPILER_VERSION + +" "+util::to_string(COMPILER_VERSION) +#elif defined(COMPILER_VERSION_MAJOR) + +" "+util::to_string(COMPILER_VERSION_MAJOR) +# ifdef COMPILER_VERSION_MINOR + +"."+util::to_string(COMPILER_VERSION_MINOR) +# ifdef COMPILER_VERSION_PATCH + +"."+util::to_string(COMPILER_VERSION_PATCH) +# ifdef COMPILER_VERSION_TWEAK + +"."+util::to_string(COMPILER_VERSION_TWEAK) +# endif +# endif +# endif +#endif + ; /* terminates the return statement in every branch, including compilers that define no version macros */ + +#undef DEC +#undef HEX +#undef COMPILER_ID +#undef COMPILER_VERSION +#undef COMPILER_VERSION_MAJOR +#undef COMPILER_VERSION_MINOR +#undef COMPILER_VERSION_PATCH +#undef COMPILER_VERSION_TWEAK +#undef SIMULATE_VERSION_MAJOR +#undef SIMULATE_VERSION_MINOR +#undef SIMULATE_VERSION_PATCH +#undef SIMULATE_VERSION_TWEAK + } + + std::string gsSysInfo::getCppVersion() + { /* MSVC is classified via _MSC_VER/_MSVC_LANG; every other compiler reports __cplusplus */ +#if defined(_MSC_VER) && _MSC_VER < 1600 + return "C++ 199711L"; +#elif defined(_MSC_VER) && _MSC_VER >= 1900 + return "C++ "+util::to_string(_MSVC_LANG); +#elif defined(_MSC_VER) && _MSC_VER >= 1600 + return "C++ 201103L"; +#else + return "C++ "+util::to_string(__cplusplus); +#endif + } + + std::string gsSysInfo::getStdLibVersion() + { +#ifdef _LIBCPP_VERSION + return "libc++ "+util::to_string(_LIBCPP_VERSION); +# elif defined(__GLIBCXX__) + return "glibc++ "+util::to_string(__GLIBCXX__); +# elif defined(__GLIBCPP__) + return "glibc++ "+util::to_string(__GLIBCPP__); +#elif defined(__LIBCOMO__) + return "Comeau STL "+util::to_string(__LIBCOMO__); +# elif defined(__STL_CONFIG_H) + return "SGI STL"; +# elif defined(__MSL_CPP__) + return "MSL standard lib"; +# elif defined(__IBMCPP__) + return "VACPP STL"; +# elif defined(MSIPL_COMPILE_H) + return "Modena C++ STL"; +# elif (defined(_YVALS) && !defined(__IBMCPP__)) || defined(_CPPLIB_VER) + return "Dinkumware STL "+util::to_string(_CPPLIB_VER); +# elif defined(__STD_RWCOMPILER_H__) ||
defined(_RWSTD_VER) + return "Rogue Wave lib "+util::to_string(_RWSTD_VER); +#else + return "Unknown-STD"; +#endif + } + + std::string gsSysInfo::getExtraLibsVersion() + { + std::string s(""); + + // CoDiPack extension +#if defined(CODI_VERSION) + if (!s.empty()) s+= ", "; + s += "CoDiPack "+util::to_string(CODI_VERSION); +#elif defined(CODI_MAJOR_VERSION) && \ + defined(CODI_MINOR_VERSION) && \ + defined(CODI_BUILD_VERSION) + if (!s.empty()) s+= ", "; + s += "CoDiPack "+util::to_string(CODI_MAJOR_VERSION) + + "."+util::to_string(CODI_MINOR_VERSION) + + "."+util::to_string(CODI_BUILD_VERSION); +#endif + + // GMP library +#if defined(__GNU_MP_VERSION) && \ + defined(__GNU_MP_VERSION_MINOR) && \ + defined(__GNU_MP_VERSION_PATCHLEVEL) + if (!s.empty()) s+= ", "; + s += "gmp "+util::to_string(__GNU_MP_VERSION) + + "."+util::to_string(__GNU_MP_VERSION_MINOR) + + "."+util::to_string(__GNU_MP_VERSION_PATCHLEVEL); +#endif + + // IpOpt library +#if defined(IPOPT_VERSION) + if (!s.empty()) s+= ", "; + s += "IpOpt "+util::to_string(IPOPT_VERSION); +#elif defined(IPOPT_VERSION_MAJOR) && \ + defined(IPOPT_VERSION_MINOR) && \ + defined(IPOPT_VERSION_RELEASE) + if (!s.empty()) s+= ", "; + s += "IpOpt "+util::to_string(IPOPT_VERSION_MAJOR) + + "."+util::to_string(IPOPT_VERSION_MINOR) + + "."+util::to_string(IPOPT_VERSION_RELEASE); +#endif + + // Intel MKL library +#if defined(INTEL_MKL_VERSION) + if (!s.empty()) s+= ", "; + s += "MKL "+util::to_string(INTEL_MKL_VERSION); +#endif + + // MPFR library +#if defined(MPFR_VERSION_STRING) + if (!s.empty()) s+= ", "; + s += "mpfr "+util::to_string(MPFR_VERSION_STRING); +#elif defined(MPFR_VERSION_MAJOR) && \ + defined(MPFR_VERSION_MINOR) && \ + defined(MPFR_VERSION_PATCHLEVEL) + if (!s.empty()) s+= ", "; + s += "mpfr "+util::to_string(MPFR_VERSION_MAJOR) + + "."+util::to_string(MPFR_VERSION_MINOR) + + "."+util::to_string(MPFR_VERSION_PATCHLEVEL); +#endif + + // OpenCascade +#if defined(OCC_VERSION_COMPLETE) + if (!s.empty()) s+= ", "; + s 
+= "occ "+util::to_string(OCC_VERSION_COMPLETE); +#elif defined(OCC_VERSION_MAJOR) && \ + defined(OCC_VERSION_MINOR) && \ + defined(OCC_VERSION_MAINTENANCE) + if (!s.empty()) s+= ", "; + s += "occ "+util::to_string(OCC_VERSION_MAJOR) + + "."+util::to_string(OCC_VERSION_MINOR) + + "."+util::to_string(OCC_VERSION_MAINTENANCE); +#endif + + // OpenNurbs +#if defined(OPENNURBS_VERSION) + if (!s.empty()) s+= ", "; + s += "onurbs "+util::to_string(OPENNURBS_VERSION); +#endif + + // Spectra library +#if defined(SPECTRA_MAJOR_VERSION) && \ + defined(SPECTRA_MINOR_VERSION) && \ + defined(SPECTRA_PATCH_VERSION) + if (!s.empty()) s+= ", "; + s += "spectra "+util::to_string(SPECTRA_MAJOR_VERSION) + + "."+util::to_string(SPECTRA_MINOR_VERSION) + + "."+util::to_string(SPECTRA_PATCH_VERSION); +#endif + + return s; + } + + std::string gsSysInfo::getCpuInfo() + { +#if defined(_WIN32) || defined(_WIN64) + + int CPUInfo[4] = {-1}; + unsigned nExIds, i = 0; + char CPUBrandString[0x40] = {0}; + + __cpuid(CPUInfo, 0x80000000); + nExIds = CPUInfo[0]; + + for (i=0x80000000; i<=nExIds; ++i) { + __cpuid(CPUInfo, i); + if (i == 0x80000002) + memcpy(CPUBrandString, CPUInfo, sizeof(CPUInfo)); + else if (i == 0x80000003) + memcpy(CPUBrandString + 16, CPUInfo, sizeof(CPUInfo)); + else if (i == 0x80000004) + memcpy(CPUBrandString + 32, CPUInfo, sizeof(CPUInfo)); + } + + return CPUBrandString; + +#elif __APPLE__ + + std::string CPUBrandString; + std::size_t size = 0x40; + + // Supply an oversized buffer, and avoid an extra call to sysctlbyname. 
+ CPUBrandString.resize(size); + if (sysctlbyname("machdep.cpu.brand_string", &CPUBrandString[0], &size, NULL, 0) == 0 && size > 0) { + if (CPUBrandString[size-1] == '\0') + size--; + CPUBrandString.resize(size); + return CPUBrandString; + } + +#elif __linux__ || __unix__ +# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__SUNCC_PRO)) + + char CPUBrandString[0x40]; + unsigned int CPUInfo[4] = {0,0,0,0}; + + __cpuid(0x80000000, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); + unsigned int nExIds = CPUInfo[0]; + + memset(CPUBrandString, 0, sizeof(CPUBrandString)); + + for (unsigned int i = 0x80000000; i <= nExIds; ++i) + { + __cpuid(i, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); + + if (i == 0x80000002) + memcpy(CPUBrandString, CPUInfo, sizeof(CPUInfo)); + else if (i == 0x80000003) + memcpy(CPUBrandString + 16, CPUInfo, sizeof(CPUInfo)); + else if (i == 0x80000004) + memcpy(CPUBrandString + 32, CPUInfo, sizeof(CPUInfo)); + } + + return CPUBrandString; + +# else + + char hostname[HOST_NAME_MAX + 1]; + gethostname(hostname, HOST_NAME_MAX + 1); + + std::string str = "Unknown-CPU ["; + str += hostname; + str += "]"; + + return str; + +# endif + +#endif + + return "Unknown-CPU"; + } + + std::string gsSysInfo::getMemoryInfo() + { + uint64_t memsize = gsSysInfo::getMemoryInBytes(); + if (memsize>0) { + if (memsize<1024) + return util::to_string(memsize)+" B"; + else if (memsize<1024*1024) + return util::to_string(memsize/1024)+" KB"; + else if (memsize<1024*1024*1024) + return util::to_string(memsize/(1024*1024))+" MB"; + else + return util::to_string(memsize/(1024*1024*1024))+" GB"; + } + else + return "Unknown-Memory"; + } + + uint64_t gsSysInfo::getMemoryInBytes() + { +#if defined(_WIN32) || defined(_WIN64) + + MEMORYSTATUSEX status; + status.dwLength = sizeof(status); + GlobalMemoryStatusEx(&status); + return (uint64_t)status.ullTotalPhys; + +#elif __APPLE__ + + int64_t memsize; + std::size_t size = 
sizeof(memsize); + + if (sysctlbyname("hw.memsize", &memsize, &size, NULL, 0) == 0) { + return (uint64_t)memsize; + } + +#elif __linux__ || __unix__ + + long pages = sysconf(_SC_PHYS_PAGES); + long page_size = sysconf(_SC_PAGE_SIZE); + return (pages > 0 && page_size > 0) ? (uint64_t)pages * (uint64_t)page_size : 0; + +#endif + + return 0; + } + +} // namespace gismo diff --git a/src/gsCore/gsSysInfo.h b/src/gsCore/gsSysInfo.h new file mode 100644 index 0000000000..6d2fa9e9ab --- /dev/null +++ b/src/gsCore/gsSysInfo.h @@ -0,0 +1,53 @@ +/** @file gsSysInfo.h + + @brief Provides system information. + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#pragma once + +#include + +namespace gismo +{ + + class GISMO_EXPORT gsSysInfo + { + public: + + /// Returns the version of G+Smo + static std::string getGismoVersion(); + + /// Returns the version of Eigen + static std::string getEigenVersion(); + + /// Returns the version of the compiler + static std::string getCompilerVersion(); + + /// Returns the version of the C++ standard + static std::string getCppVersion(); + + /// Returns the version of the standard library + static std::string getStdLibVersion(); + + /// Returns the version of extra libraries + static std::string getExtraLibsVersion(); + + /// Returns CPU information + static std::string getCpuInfo(); + + /// Returns memory information + static std::string getMemoryInfo(); + + /// Returns total system memory in bytes + static uint64_t getMemoryInBytes(); + }; // class gsSysInfo + +} // namespace gismo diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp new file mode 100644 index 0000000000..28b9b015b1 --- /dev/null +++ b/src/gsIO/gsBenchmark.cpp @@ -0,0 +1,358 @@ +/** @file gsBenchmark.cpp + + @brief Provides implementation of generic benchmarking framework. 
+ + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#include +#include +#include + +#include + +namespace gismo +{ + std::ostream &gsBenchmarkResultSet::to_tikz(std::ostream &os) const + { + os << "\\pgfplotstableread[col sep=space]{\n" + << label << "\n"; + + for (const auto& it : results) + os << it.value << "\n"; + + os << "}\\data" << label << "\n"; + + return os; + } + + std::ostream &gsBenchmarkSet::to_tikz(std::ostream &os) const + { + for (const auto& it : results) + it.to_tikz(os); + + os << "\\begin{tikzpicture}\n" + << "\\begin{axis}[\n" + << "name=MyAxis,\n" + << "width=2\\textwidth,\n" + << "height=.8\\textwidth,\n" + << "legend pos=outer north east,\n" + << "ybar=0.05cm,\n" + << "bar width=3pt,\n" + << "ymajorgrids=true,\n" + << "xticklabel style={rotate=45,anchor=east},\n" + << "xticklabels={"; + + for (const auto& it : results) + os << it.get_descr() << (&it != &results.back() ? "," : ""); + + os << "},\n" + << "xtick=data,\n"; + + auto metric = results.front().get().cbegin()->metric; + if (metric & gismo::metric::speedup || metric & gismo::metric::ratio) { + switch(metric & ~gismo::metric::speedup & ~gismo::metric::ratio) { + case gismo::metric::bandwidth_kb_sec: + case gismo::metric::bandwidth_mb_sec: + case gismo::metric::bandwidth_gb_sec: + case gismo::metric::bandwidth_tb_sec: + os << "ylabel={Bandwidth [" + << (metric & gismo::metric::speedup ? "speedup" : "") + << (metric & gismo::metric::ratio ? "ratio" : "") + << "]},\n"; + break; + case gismo::metric::perf_kflop_sec: + case gismo::metric::perf_mflop_sec: + case gismo::metric::perf_gflop_sec: + case gismo::metric::perf_tflop_sec: + os << "ylabel={Performance [" + << (metric & gismo::metric::speedup ? "speedup" : "") + << (metric & gismo::metric::ratio ? 
"ratio" : "") + << "]},\n"; + break; + case gismo::metric::runtime_sec: + os << "ylabel={Runtime [" + << (metric & gismo::metric::speedup ? "speedup" : "") + << (metric & gismo::metric::ratio ? "ratio" : "") + << "]},\n"; + break; + default: + GISMO_ERROR("Unsupported metric"); + } + } else { + switch(metric & ~gismo::metric::speedup & ~gismo::metric::ratio) { + case gismo::metric::bandwidth_kb_sec: + os << "ylabel={Bandwidth in KB/s},\n"; + break; + case gismo::metric::bandwidth_mb_sec: + os << "ylabel={Bandwidth in MB/s},\n"; + break; + case gismo::metric::bandwidth_gb_sec: + os << "ylabel={Bandwidth in GB/s},\n"; + break; + case gismo::metric::bandwidth_tb_sec: + os << "ylabel={Bandwidth in TB/s},\n"; + break; + case gismo::metric::perf_kflop_sec: + os << "ylabel={Performance in kFLOP/s},\n"; + break; + case gismo::metric::perf_mflop_sec: + os << "ylabel={Performance in mFLOP/s},\n"; + break; + case gismo::metric::perf_gflop_sec: + os << "ylabel={Performance in gFLOP/s},\n"; + break; + case gismo::metric::perf_tflop_sec: + os << "ylabel={Performance in tFLOP/s},\n"; + break; + case gismo::metric::runtime_sec: + os << "ylabel={Runtime in seconds},\n"; + break; + default: + GISMO_ERROR("Unsupported metric"); + } + } + + os << "title={" << descr + << " [real\\_t:" << util::type::name() + << ", index\\_t:" << util::type::name() + << ", short\\_t:" << util::type::name()<< "]},\n" + << "]\n"; + + for (auto rit=results.cbegin()+1; rit!=results.cend(); ++rit) + os << "\\pgfplotstablecreatecol[copy column from " + << "table={\\data" + << rit->get_label() + << "}{[index] 0}] {" + << rit->get_label() + << "} {\\data" + << results.cbegin()->get_label() + << "}\n"; + + os << "\\pgfplotstabletranspose[rows/threads/.style={string type}]\\mytable{" + << "\\data" + << results.cbegin()->get_label() + << "}\n"; + + for (std::size_t i=1; i<=results.front().get().size(); ++i) + os << "\\addplot table[x expr=\\coordindex, y index=" + << util::to_string(i) << "]{\\mytable};\n"; + + os 
<< "\\legend{"; + for (const auto& it : results.front().get()) + os << "Threads=" << it.threads << (&it!=&results.front().get().back() ? "," : ""); + os << "}\n" + + << "\\end{axis}\n" + + << "\\gettikzxy{(MyAxis.south west)}{\\ax}{\\ay}\n" + << "\\gettikzxy{(MyAxis.outer south east)}{\\bx}{\\by}\n" + + << "\\path let \\p1=(MyAxis.west), \\p2=(MyAxis.east) in " + << "node[draw,below right, align=left, text=black, text width=\\x2-\\x1-10pt, minimum width=\\x2-\\x1]\n" + << "at ($(\\ax, \\by-10pt)$) {%\n" + << "G+Smo " << gsSysInfo::getGismoVersion() + << ", Eigen " << gsSysInfo::getEigenVersion() + << " (" << gsSysInfo::getCompilerVersion() + << ", " << gsSysInfo::getCppVersion() + << ", " << gsSysInfo::getStdLibVersion() + << (gsSysInfo::getExtraLibsVersion().empty() + ? "), \n" + : ", "+gsSysInfo::getExtraLibsVersion()+"), \n") + + << "CPU " << gsSysInfo::getCpuInfo() << ", " + << "Memory " << gsSysInfo::getMemoryInfo() << "\\\\\n"; + + gsJITCompilerConfig jit; jit.load(GISMO_CONFIG_DIR "jit.xml"); + std::string flags = jit.getFlags(); + os << "Compiler flags "; + + for (auto token = strtok(&flags[0], " "); token != NULL; token = strtok(NULL, " ")) { + if (token[0] == '-') { + if (token[1] == 'I' || token[1] == 'L' || token[1] == 'l' || token[1] == 'W') + continue; + os << "\\verb!" << token << "! 
"; + } + } + + os << "};\n" + << "\\end{tikzpicture}\n"; + + return os; + } + + std::ostream &gsBenchmark::to_tikz(std::ostream &os) const + { + os << "\\documentclass[tikz]{standalone}\n" + << "\\usepackage{pgfplots}\n" + << "\\usepackage{pgfplotstable}\n" + << "\\usepackage{verbatim}\n" + << "\\pgfplotsset{compat=1.18}\n" + << "\\makeatletter\n" + << "\\newcommand{\\gettikzxy}[3]{%\n" + << "\\tikz@scan@one@point\\pgfutil@firstofone#1\\relax\n" + << "\\edef#2{\\the\\pgf@x}%\n" + << "\\edef#3{\\the\\pgf@y}%\n" + << "}\n" + << "\\makeatother\n" + << "\\begin{document}\n" + << "\\usetikzlibrary{calc}\n"; + + for (const auto& it : benchmarks) + it.to_tikz(os); + + os << "\\end{document}\n"; + return os; + } + + std::ostream &gsBenchmarkResultSet::print(std::ostream &os) const + { + os << std::setw(8) << descr << " | "; + for (const auto& it : results) + os << std::setw(4) << it.threads << " : " + << std::setw(6) << std::scientific << std::setprecision(2) << it.value; + os << "\n"; + return os; + } + + std::ostream &gsBenchmarkSet::print(std::ostream &os) const + { + os << "[" << label << "] " << descr << "\n"; + + if (results.size() == 0) + return os; + + os << std::setw(8) << "memsize" + << " | " + << util::to_string(results.front().get().size()) + << "x (#Threads : "; + + auto metric = results.front().get().cbegin()->metric; + if (metric & gismo::metric::speedup || metric & gismo::metric::ratio) { + switch(metric & ~gismo::metric::speedup & ~gismo::metric::ratio) { + case gismo::metric::bandwidth_kb_sec: + case gismo::metric::bandwidth_mb_sec: + case gismo::metric::bandwidth_gb_sec: + case gismo::metric::bandwidth_tb_sec: + os << "Bandwidth [" + << (metric & gismo::metric::speedup ? "speedup" : "") + << (metric & gismo::metric::ratio ? 
"ratio" : "") + << "])\n"; + break; + case gismo::metric::perf_kflop_sec: + case gismo::metric::perf_mflop_sec: + case gismo::metric::perf_gflop_sec: + case gismo::metric::perf_tflop_sec: + os << "Performance [" + << (metric & gismo::metric::speedup ? "speedup" : "") + << (metric & gismo::metric::ratio ? "ratio" : "") + << "])\n"; + break; + case gismo::metric::runtime_sec: + os << "Runtime [" + << (metric & gismo::metric::speedup ? "speedup" : "") + << (metric & gismo::metric::ratio ? "ratio" : "") + << "])\n"; + break; + default: + GISMO_ERROR("Unsupported metric"); + } + } else { + switch(metric & ~gismo::metric::speedup & ~gismo::metric::ratio) { + case gismo::metric::bandwidth_kb_sec: + os << "Bandwidth in KB/s)\n"; + break; + case gismo::metric::bandwidth_mb_sec: + os << "Bandwidth in MB/s)\n"; + break; + case gismo::metric::bandwidth_gb_sec: + os << "Bandwidth in GB/s)\n"; + break; + case gismo::metric::bandwidth_tb_sec: + os << "Bandwidth in TB/s)\n"; + break; + case gismo::metric::perf_kflop_sec: + os << "Performance in kFLOP/s)\n"; + break; + case gismo::metric::perf_mflop_sec: + os << "Performance in mFLOP/s)\n"; + break; + case gismo::metric::perf_gflop_sec: + os << "Performance in gFLOP/s)\n"; + break; + case gismo::metric::perf_tflop_sec: + os << "Performance in tFLOP/s)\n"; + break; + case gismo::metric::runtime_sec: + os << "Runtime in seconds)\n"; + break; + default: + GISMO_ERROR("Unsupported metric"); + } + } + + for (const auto& it : results) + it.print(os); + return os; + } + + std::ostream &gsBenchmark::print(std::ostream &os) const + { + for (const auto& it : benchmarks) + it.print(os); + return os; + } + +namespace util { + + gsBenchmarkResultSet ratio(const std::string& label, + const std::string& descr, + const gsBenchmarkResultSet objA, + const gsBenchmarkResultSet objB) + { + GISMO_ASSERT(objA.get().size() == objB.get().size(), + "Benchmark result sets must have the same size"); + + std::vector results; + for (const auto& it : 
util::zip(objA.get(), objB.get())) { + gsBenchmarkResult result; + result.threads = std::get<0>(it).threads; + result.runtime = std::get<0>(it).runtime / std::get<1>(it).runtime; + result.value = std::get<0>(it).value / std::get<1>(it).value; + result.metric = (gismo::metric)(std::get<0>(it).metric | gismo::metric::ratio); + results.push_back( give(result) ); + } + + return gsBenchmarkResultSet(label, descr, give(results) ); + } + + gsBenchmarkSet ratio(const std::string& label, + const std::string& descr, + const gsBenchmarkSet objA, + const gsBenchmarkSet objB) + { + GISMO_ASSERT(objA.get().size() == objB.get().size(), + "Benchmark sets must have the same size"); + + std::vector results; + char id('A'); + + for (const auto& it : util::zip(objA.get(), objB.get())) { + results.push_back( give(util::ratio(std::get<0>(it).get_label()+std::string(1,id++), + std::get<0>(it).get_descr(), + std::get<0>(it), + std::get<1>(it))) ); + } + + gsBenchmarkSet benchmark(label, descr, give(results) ); + return benchmark; + } +} // namespace util + +} // namespace gismo diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h new file mode 100644 index 0000000000..0a771855e4 --- /dev/null +++ b/src/gsIO/gsBenchmark.h @@ -0,0 +1,548 @@ +/** @file gsBenchmark.h + + @brief Provides a generic benchmarking framework. + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#pragma once + +#include +#include +#include + +namespace gismo +{ +/** + @brief Enumerator that defines the benchmark metrics. 
+ + These definitions are used to control the output of the benchmark framework +*/ +enum metric : uint64_t { + speedup = 1 << 0, + ratio = 1 << 1, + bandwidth_kb_sec = 1 << 2, + bandwidth_mb_sec = 1 << 3, + bandwidth_gb_sec = 1 << 4, + bandwidth_tb_sec = 1 << 5, + perf_kflop_sec = 1 << 6, + perf_mflop_sec = 1 << 7, + perf_gflop_sec = 1 << 8, + perf_tflop_sec = 1 << 9, + runtime_sec = 1 << 10 +}; + +/** + @brief Class that represents a single benchmark result + + A \a gsBenchmarkResult object is the most atomic unit of the + benchmark framework. It represents the result of a single run for a + fixed problem size and configuration and a fixed number of + threads. A series of runs for different numbers of threads is + collected in a \a gsBenchmarkResultSet object. +*/ +class gsBenchmarkResult +{ +public: + int threads; + double runtime; + double value; + gismo::metric metric; +}; + +namespace internal +{ +/// @brief Get a gsBenchmarkResult from XML data +template<> +class gsXml< gsBenchmarkResult > +{ +private: + gsXml() { } + typedef gsBenchmarkResult Object; +public: + GSXML_COMMON_FUNCTIONS(Object); + static std::string tag () { return "BenchmarkResult"; } + static std::string type () { return "BenchmarkResult"; } + + GSXML_GET_POINTER(Object); + + static void get_into (gsXmlNode * node, Object & obj) + { + gsXmlNode * child; + + child = node->first_node("threads"); + if (child != NULL) obj.threads = atoi(child->value()); + + child = node->first_node("runtime"); + if (child != NULL) obj.runtime = atof(child->value()); + + child = node->first_node("value"); + if (child != NULL) obj.value = atof(child->value()); + + child = node->first_node("metric"); + if (child != NULL) obj.metric = (gismo::metric)atol(child->value()); + } + + static gsXmlNode * put (const Object & obj, gsXmlTree & data ) + { + gsXmlNode * result = makeNode("BenchmarkResult", data); + + result->append_node( makeNode("threads", util::to_string(obj.threads), data) ); + result->append_node( 
makeNode("runtime", util::to_string(obj.runtime), data) ); + result->append_node( makeNode("value", util::to_string(obj.value), data) ); + result->append_node( makeNode("metric", util::to_string(obj.metric), data) ); + + return result; + } +}; +} // namespace internal + +/** + @brief Class that represents a set of benchmark results + + A \a gsBenchmarkResultSet object holds a set of benchmark results + (\a gsBenchmarkResult) for a fixed problem size and configuration + but for different numbers of threads. +*/ +class gsBenchmarkResultSet +{ +public: + /// \brief Default constructor + gsBenchmarkResultSet() = default; + + /// \brief Constructor + gsBenchmarkResultSet(const std::string& label, + const std::string& descr, + const std::vector& results) + : label(label), + descr(descr), + results( give(std::vector(results)) ) {} + + /// \brief Constructor + gsBenchmarkResultSet(const std::string& label, + const std::string& descr, + std::vector&& results) + : label(label), + descr(descr), + results( give(results) ) {} + + /// \brief Returns the label + const std::string& get_label() const + { return label; } + + /// \brief Returns the descr + const std::string& get_descr() const + { return descr; } + + /// \brief Returns constant reference to the results + const std::vector& get() const + { return results; } + + /// \brief Returns non-constant reference to the results + std::vector& get() + { return results; } + + /// \brief Serializes the content to LaTeX TIKZ + std::ostream &to_tikz(std::ostream &os) const; + + /// \brief Pretty-prints the content + std::ostream &print(std::ostream &os) const; + +private: + std::string label, descr; + std::vector results; +}; + +/// Print (as string) operator +inline std::ostream &operator<<(std::ostream &os, const gsBenchmarkResultSet& obj) +{ return obj.print(os); } + +namespace internal +{ +/// @brief Get a gsBenchmarkResultSet from XML data +template<> +class gsXml< gsBenchmarkResultSet > +{ +private: + gsXml() { } + typedef 
gsBenchmarkResultSet Object; +public: + GSXML_COMMON_FUNCTIONS(Object); + static std::string tag () { return "BenchmarkResultSet"; } + static std::string type () { return "BenchmarkResultSet"; } + + GSXML_GET_POINTER(Object); + + static void get_into (gsXmlNode * node, Object & obj) + { + gsXmlNode * child; + std::string label, descr; + + child = node->first_node("label"); + if (child != NULL) label = child->value(); + + child = node->first_node("descr"); + if (child != NULL) descr = child->value(); + + std::vector results; + + child = node->first_node(gsXml< gsBenchmarkResult >::tag().c_str()); + for (; child; child = child->next_sibling() ) { + gsBenchmarkResult result; + gsXml< gsBenchmarkResult >::get_into(child, result); + results.push_back( give(result) ); + } + + obj = gsBenchmarkResultSet(label, descr, give(results)); + } + + static gsXmlNode * put (const Object & obj, gsXmlTree & data ) + { + gsXmlNode * results = makeNode("BenchmarkResultSet", data); + + results->append_node( makeNode("label", obj.get_label(), data) ); + results->append_node( makeNode("descr", obj.get_descr(), data) ); + + for (const auto& it : obj.get()) { + results->append_node( gsXml< gsBenchmarkResult >::put(it, data) ); + } + + return results; + } +}; +} // namespace internal + +/** + @brief Class that represents a collection of benchmark sets for a + series of benchmark instances + + This struct can be used to hold a series of benchmark instances + (i.e. 
a series of problem sizes and configurations)< +*/ +class gsBenchmarkSet +{ +public: + /// \brief Default Constructor + gsBenchmarkSet() = default; + + /// \brief Constructor + gsBenchmarkSet(const std::string& label, + const std::string& descr, + const std::vector& results) + : label(label), + descr(descr), + results( give(std::vector(results)) ) {} + + /// \brief Constructor + gsBenchmarkSet(const std::string& label, + const std::string& descr, + std::vector&& results) + : label(label), + descr(descr), + results( give(results) ) {} + + /// \brief Returns the label + const std::string& get_label() const + { return label; } + + /// \brief Returns the descr + const std::string& get_descr() const + { return descr; } + + /// \brief Returns constant reference to the result sets + const std::vector& get() const + { return results; } + + /// \brief Returns non-constant reference to the result sets + std::vector& get() + { return results; } + + /// \brief Serializes the content to LaTeX TIKZ + std::ostream &to_tikz(std::ostream &os) const; + + /// \brief Pretty-prints the content + std::ostream &print(std::ostream &os) const; + +private: + std::string label, descr; + std::vector results; +}; + +/// Print (as string) operator +inline std::ostream &operator<<(std::ostream &os, const gsBenchmarkSet& obj) +{ return obj.print(os); } + +namespace internal +{ +/// @brief Get a gsBenchmarkSet from XML data +template<> +class gsXml< gsBenchmarkSet > +{ +private: + gsXml() { } + typedef gsBenchmarkSet Object; +public: + GSXML_COMMON_FUNCTIONS(Object); + static std::string tag () { return "BenchmarkSet"; } + static std::string type () { return "BenchmarkSet"; } + + GSXML_GET_POINTER(Object); + + static void get_into (gsXmlNode * node, Object & obj) + { + gsXmlNode * child; + std::string label, descr; + + child = node->first_node("label"); + if (child != NULL) label = child->value(); + + child = node->first_node("descr"); + if (child != NULL) descr = child->value(); + + std::vector 
results; + + child = node->first_node(gsXml< gsBenchmarkResultSet >::tag().c_str()); + for (; child; child = child->next_sibling() ) { + gsBenchmarkResultSet _results; + gsXml< gsBenchmarkResultSet >::get_into(child, _results); + results.push_back( give(_results) ); + } + + obj = gsBenchmarkSet(label, descr, give(results) ); + } + + static gsXmlNode * put (const Object & obj, gsXmlTree & data ) + { + gsXmlNode * results = makeNode("BenchmarkSet", data); + + results->append_node( makeNode("label", obj.get_label(), data) ); + results->append_node( makeNode("descr", obj.get_descr(), data) ); + + for (const auto& it : obj.get()) { + results->append_node( gsXml< gsBenchmarkResultSet >::put(it, data) ); + } + + return results; + } +}; +} // namespace internal + +/** + @brief Class that represents a collection of benchmarks + + This is the top-level class of the benchmark framework and the only + one that should be used by the user directly. + */ +class GISMO_EXPORT gsBenchmark +{ +public: + /// \brief Returns constant reference to the benchmarks + const std::vector& get() const + { return benchmarks; } + + /// \brief Returns non-constant reference to the benchmarks + std::vector& get() + { return benchmarks; } + + /// \brief Serializes the content to LaTeX TIKZ + std::ostream &to_tikz(std::ostream &os) const; + + /// \brief Pretty-prints the content + std::ostream &print(std::ostream &os) const; + + /// \brief Returns iterator to benchmark set + const std::vector::const_iterator find(const std::string& label) const + { + for (auto it = benchmarks.cbegin(); it != benchmarks.cend(); ++it) + if (it->get_label() == label) + return it; + return benchmarks.cend(); + } + + /// \brief Creates a new benchmark set, adds it to the benchmark and + /// returns a pointer to the benchmark set to the calling routine + template + const gsBenchmarkSet& create(const Iterator & sizes, + const std::vector & runs, + const std::vector & threads, + const std::string & extra_descr="") + { + 
//GISMO_ASSERT(sizes.size()==runs.size(), "Problem sizes and number of runs must have the same length"); + + gsInfo << "[" << Test::label() << "] " + << Test::descr()+extra_descr << "\n"; + + std::vector results; + char id('A'); + + auto riter = runs.begin(); + for (const auto& it : sizes) { + gsInfo << util::to_string(it) << "(" << *riter << ")"<< std::flush; + try { + Test test(it); + uint64_t memsize = test.size(); + std::string meminfo; + if (memsize<1024) + meminfo = util::to_string(memsize)+" B"; + else if (memsize<1024*1024) + meminfo = util::to_string(memsize/1024)+" KB"; + else if (memsize<1024*1024*1024) + meminfo = util::to_string(memsize/(1024*1024))+" MB"; + else + meminfo = util::to_string(memsize/(1024*1024*1024))+" GB"; + + results.push_back( give(gsBenchmarkResultSet(Test::label()+std::string(1,id++), meminfo, + give(gsBenchmark::run(test, Test::metric(), threads, *riter++)))) ); + } catch(...) { gsInfo << "[failed!]"; } + gsInfo << "..."; + } + gsInfo << "\n"; + + gsBenchmarkSet benchmark(Test::label(), Test::descr()+extra_descr, give(results) ); + benchmarks.push_back( give(benchmark) ); + return benchmarks.back(); + } + +private: + /// \brief Runs the benchmark instance \a benchmark for the + /// specified number of \a threads and \a runs and returns an \a + /// std::vector of \a gsBenchmarkResult that represent the + /// respective benchmark results measured in the specified \a metric + template + static std::vector + run(Test& test, gismo::metric metric, const std::vector& threads, index_t runs) + { + std::vector results; + gsStopwatch stopwatch; + uint64_t result(0); + real_t value, runtime; + + try { + for (const auto& it : threads) { + + omp_set_num_threads(it); + runtime = 0.0; + + stopwatch.restart(); + + for (index_t run=0; run(it); // number of OpenMP threads + result.runtime = runtime; // averaged elapsed time in seconds + result.value = value; // averaged benchmark value + result.metric = metric; // benchmark metric + 
results.push_back( give(result) ); + } + } catch(...) {} + + // Convert to relative values (speedup relative to first entry) + if (!results.empty() && (metric & gismo::metric::speedup)) { + runtime = results.front().runtime; + value = results.front().value; + + for (auto& it : results) { + it.runtime = runtime / it.runtime; + it.value = value / it.value; + } + } + + return results; + } + +private: + std::vector benchmarks; +}; + +/// Print (as string) operator +inline std::ostream &operator<<(std::ostream &os, const gsBenchmark& obj) +{ return obj.print(os); } + +namespace internal +{ +/// @brief Get a gsBenchmark from XML data +template<> +class gsXml< gsBenchmark > +{ +private: + gsXml() { } + typedef gsBenchmark Object; +public: + GSXML_COMMON_FUNCTIONS(Object); + static std::string tag () { return "Benchmark"; } + static std::string type () { return "Benchmark"; } + + GSXML_GET_POINTER(Object); + + static void get_into (gsXmlNode * node, Object & obj) + { + gsXmlNode * child; + + child = node->first_node(gsXml< gsBenchmarkSet >::tag().c_str()); + for (; child; child = child->next_sibling() ) { + gsBenchmarkSet benchmark; + gsXml< gsBenchmarkSet >::get_into(child, benchmark); + obj.get().push_back( give(benchmark) ); + } + } + + static gsXmlNode * put (const Object & obj, gsXmlTree & data ) + { + gsXmlNode * results = makeNode("Benchmark", data); + + for (const auto& it : obj.get()) { + results->append_node( gsXml< gsBenchmarkSet >::put(it, data) ); + } + + return results; + } +}; +} // namespace internal + +namespace util { + + /// \brief Returns the ratio of the two given benchmark result sets + GISMO_EXPORT gsBenchmarkResultSet ratio(const std::string& label, + const std::string& descr, + const gsBenchmarkResultSet objA, + const gsBenchmarkResultSet objB); + + /// \brief Returns the ratio of the two given benchmark sets + GISMO_EXPORT gsBenchmarkSet ratio(const std::string& label, + const std::string& descr, + const gsBenchmarkSet objA, + const gsBenchmarkSet objB); +} // 
namespace util + +} // namespace gismo diff --git a/src/gsIO/gsCmdLine.cpp b/src/gsIO/gsCmdLine.cpp index a70291986e..827f6edfce 100644 --- a/src/gsIO/gsCmdLine.cpp +++ b/src/gsIO/gsCmdLine.cpp @@ -23,6 +23,7 @@ //#include // --- end External files +#include #include namespace gismo @@ -426,57 +427,18 @@ void gsCmdLine::printVersion() gsInfo << "\n"; gsInfo << " G+Smo \n"; gsInfo << " Geometry plus Simulation modules\n"; - gsInfo << " version "<< GISMO_VERSION<<"\n"; - gsInfo << "Compiled by "; -//https://sourceforge.net/p/predef/wiki/Compilers, see also boost/predef.h -#if defined(_MSC_VER) && _MSC_VER < 1600 - gsInfo << "MSVC "<<_MSC_FULL_VER <<" ("<<"199711L" <<", "; -#elsif _MSC_VER >= 1900 - gsInfo << "MSVC "<<_MSC_FULL_VER <<" ("<<_MSVC_LANG <<", "; -#elsif _MSC_VER >= 1600 - gsInfo << "MSVC "<<_MSC_FULL_VER <<" ("<<"201103L" <<", "; -#elif defined(__clang__ ) - gsInfo << "Clang "<<__clang_version__<<" ("<<__cplusplus <<", "; -#elif defined(_INTEL_COMPILER) - gsInfo << "Intel C++ "<<__INTEL_COMPILER<<" ("<<__cplusplus <<", "; -#elif defined(__MINGW64__) - gsInfo << "MinGW "<<__MINGW64_VERSION_MAJOR<<"."<<__MINGW64_VERSION_MINOR<<" ("<<__cplusplus <<", "; -#elif defined(__SUNPRO_CC) - gsInfo << "Solaris Studio "<<__SUNPRO_CC<<" ("<<__cplusplus <<", "; -#elif defined(__GNUG__) - gsInfo << "GNU GCC "<<__GNUC__<<"."<<__GNUC_MINOR__<<"."<<__GNUC_PATCHLEVEL__<<" ("<<__cplusplus <<", "; -#else - gsInfo << "C++ ("<<__cplusplus <<", "; -#endif - -#ifdef __INTEL_MKL__ - gsInfo << "MKL "<::name() + << ", index_t:" << util::type::name() + << ", short_t:" << util::type::name() << "\n"; gsInfo << "web: http://github.com/gismo\n"; } diff --git a/src/gsIO/gsOptionList.cpp b/src/gsIO/gsOptionList.cpp index 8a6cbb4df7..1282da95f8 100644 --- a/src/gsIO/gsOptionList.cpp +++ b/src/gsIO/gsOptionList.cpp @@ -629,7 +629,6 @@ gsXml::put (const gsOptionList & obj, gsXmlTree & data) return optionList; } - } // namespace internal #ifdef GISMO_BUILD_PYBIND11 diff --git 
a/src/gsIO/gsOptionList.h b/src/gsIO/gsOptionList.h index beb62a6faf..d42a30b81f 100644 --- a/src/gsIO/gsOptionList.h +++ b/src/gsIO/gsOptionList.h @@ -283,10 +283,8 @@ inline std::ostream &operator<<(std::ostream &os, const gsOptionList::OptionList inline bool operator< ( const gsOptionList::OptionListEntry& a, const gsOptionList::OptionListEntry& b ) { return a.label < b.label; } - namespace internal { - /** \brief Read OptionList from XML data \ingroup IO */ @@ -304,7 +302,6 @@ class GISMO_EXPORT gsXml static void get_into(gsXmlNode * node, gsOptionList & result); static gsXmlNode * put (const gsOptionList & obj, gsXmlTree & data); }; - } #ifdef GISMO_BUILD_PYBIND11 diff --git a/src/gsMatrix/gsAsMatrix.h b/src/gsMatrix/gsAsMatrix.h index e468a0e52c..2cab74c58d 100644 --- a/src/gsMatrix/gsAsMatrix.h +++ b/src/gsMatrix/gsAsMatrix.h @@ -53,7 +53,7 @@ class gsAsMatrix : public Eigen::Map< Eigen::Matrix > : Base( v.data(), n, m) { //GISMO_ASSERT( v.size() != 0, "Tried to map an empty vector." ); - GISMO_ASSERT( m*n <= index_t(v.size()), "Not enough coefficients in vector to map." ); + GISMO_ASSERT( m*n <= (index_t)(v.size()), "Not enough coefficients in vector to map." ); } gsAsMatrix( std::vector & v) @@ -162,7 +162,7 @@ class gsAsConstMatrix : public Eigen::Map< const Eigen::Matrix > gsAsConstMatrix( const std::vector & v, index_t n, index_t m) : Base( v.data(), n, m) { - GISMO_ASSERT( m*n <= index_t(v.size()), "Not enough coefficients in vector to map." ); + GISMO_ASSERT( m*n <= (index_t)(v.size()), "Not enough coefficients in vector to map." 
); } gsAsConstMatrix( const std::vector & v) diff --git a/src/gsMatrix/gsSparseMatrix.h b/src/gsMatrix/gsSparseMatrix.h index 0f84806151..4b6b3f1666 100644 --- a/src/gsMatrix/gsSparseMatrix.h +++ b/src/gsMatrix/gsSparseMatrix.h @@ -400,7 +400,7 @@ class gsSparseMatrix : public Eigen::SparseMatrix return result; } - gsVector nonZerosPerInner(index_t upto = std::numeric_limits::max()) const + gsVector nonZerosPerInner(index_t upto = 2000000000) const { upto = math::min(upto, this->cols()); gsVector nz(upto); diff --git a/src/gsNurbs/gsBoehm.hpp b/src/gsNurbs/gsBoehm.hpp index 7499c741f8..590c0dfd49 100644 --- a/src/gsNurbs/gsBoehm.hpp +++ b/src/gsNurbs/gsBoehm.hpp @@ -39,7 +39,7 @@ void gsBoehm( if (r==1) return gsBoehmSingle(knots, coefs, val, update_knots); - GISMO_ASSERT( coefs.rows() == index_t(knots.size() - knots.degree()-1), + GISMO_ASSERT( coefs.rows() == (index_t)(knots.size() - knots.degree()-1), "Incompatible coefficients("< +#include namespace gismo { diff --git a/src/gsMpi/gsMpi.h b/src/gsParallel/gsMpi.h similarity index 98% rename from src/gsMpi/gsMpi.h rename to src/gsParallel/gsMpi.h index 029fc74ec5..eb3d1173e3 100644 --- a/src/gsMpi/gsMpi.h +++ b/src/gsParallel/gsMpi.h @@ -25,11 +25,11 @@ // # warning "The MPI version is older than MPI-2." 
// # endif //#endif -#include -#include +#include +#include #endif -#include +#include namespace gismo { diff --git a/src/gsMpi/gsMpiComm.h b/src/gsParallel/gsMpiComm.h similarity index 99% rename from src/gsMpi/gsMpiComm.h rename to src/gsParallel/gsMpiComm.h index 221409e8a4..1c2dee0665 100644 --- a/src/gsMpi/gsMpiComm.h +++ b/src/gsParallel/gsMpiComm.h @@ -19,6 +19,7 @@ namespace gismo { #ifndef GISMO_WITH_MPI +typedef int MPI_Comm; typedef int MPI_Group; typedef int MPI_Request; struct MPI_Status {}; @@ -335,7 +336,8 @@ class gsSerialComm #ifdef GISMO_WITH_MPI operator MPI_Comm () const { return MPI_COMM_SELF;} - //#else +#else + operator MPI_Comm () const { return 0;} // typedef int MPI_Group; // typedef int MPI_Request; // struct MPI_Status {}; @@ -387,7 +389,7 @@ class gsSerialComm operator< */ template - static T min (T& in) + static T (min) (T& in) { return in; } @@ -397,7 +399,7 @@ class gsSerialComm in every process. Assumes that T has an operator< */ template - static int min (T* inout, int len) + static int (min) (T* inout, int len) { return 0; } @@ -407,7 +409,7 @@ class gsSerialComm operator< */ template - static T max (T& in) + static T (max) (T& in) { return in; } @@ -417,7 +419,7 @@ class gsSerialComm process. Assumes that T has an operator< */ template - static int max (T* inout, int len) + static int (max) (T* inout, int len) { return 0; } diff --git a/src/gsMpi/gsMpiTraits.h b/src/gsParallel/gsMpiTraits.h similarity index 100% rename from src/gsMpi/gsMpiTraits.h rename to src/gsParallel/gsMpiTraits.h diff --git a/src/gsParallel/gsOpenMP.cpp b/src/gsParallel/gsOpenMP.cpp new file mode 100644 index 0000000000..192b2a0873 --- /dev/null +++ b/src/gsParallel/gsOpenMP.cpp @@ -0,0 +1,440 @@ +/** @file gsOpenMP.cpp + + @brief Implementation of OpenMP stub routines to be used when libomp is not available + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. 
If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#if !defined(_OPENMP) + +#include +#include + +#include +#include +#include +#include +#include + +void omp_set_num_threads(int num_threads) +{} + +int omp_get_num_threads(void) +{ + return 1; +} + +int omp_get_max_threads(void) +{ + return 1; +} + +int omp_get_thread_num(void) +{ + return 0; +} + +int omp_get_num_procs(void) +{ + return 1; +} + +int omp_in_parallel(void) +{ + return 0; +} + +void omp_set_dynamic(int dynamic_threads) +{} + +int omp_get_dynamic(void) +{ + return 0; +} + +int omp_get_cancellation(void) +{ + return 0; +} + +void omp_set_nested(int nested) +{} + +int omp_get_nested(void) +{ + return 0; +} + +void omp_set_schedule(omp_sched_t kind, int chunk_size) +{} + +void omp_get_schedule(omp_sched_t *kind, int *chunk_size) +{ + *kind = omp_sched_static; + *chunk_size = 0; +} + +int omp_get_thread_limit(void) +{ + return 1; +} + +void omp_set_max_active_levels(int max_active_levels) +{} + +int omp_get_max_active_levels(void) +{ + return 0; +} + +int omp_get_level(void) +{ + return 0; +} + +int omp_get_ancestor_thread_num(int level) +{ + return level == 0 ? 0 : -1; +} + +int omp_get_team_size(int level) +{ + return level == 0 ? 
1 : -1; +} + +int omp_get_active_level(void) +{ + return 0; +} + +int omp_in_final(void) +{ + return 1; +} + +omp_proc_bind_t omp_get_proc_bind(void) +{ + return omp_proc_bind_false; +} + +int omp_get_num_places(void) +{ + return 0; +} + +int omp_get_place_num_procs(int place_num) +{ + return 0; +} + +void omp_get_place_proc_ids(int place_num, int *ids) +{} + +int omp_get_place_num(void) +{ + return -1; +} + +int omp_get_partition_num_places(void) +{ + return 0; +} + +void omp_get_partition_place_nums(int *place_nums) +{} + +void omp_set_default_device(int device_num) +{} + +int omp_get_default_device(void) +{ + return 0; +} + +int omp_get_num_devices(void) +{ + return 0; +} + +int omp_get_num_teams(void) +{ + return 1; +} + +int omp_get_team_num(void) +{ + return 0; +} + +int omp_is_initial_device(void) +{ + return 1; +} + +int omp_get_initial_device(void) +{ + return -10; +} + +int omp_get_max_task_priority(void) +{ + return 0; +} + +void omp_init_lock(omp_lock_t *arg) +{ + arg->lock = OMP_UNLOCKED; +} + +void omp_init_lock_with_hint(omp_lock_t *arg, omp_lock_hint_t hint) +{ + omp_init_lock(arg); +} + +void omp_destroy_lock(omp_lock_t *arg) +{ + arg->lock = OMP_INIT; +} + +void omp_set_lock(omp_lock_t *arg) +{ + if (arg->lock == OMP_UNLOCKED) + { + arg->lock = OMP_LOCKED; + } + else if (arg->lock == OMP_LOCKED) + { + fprintf(stderr, "error: deadlock in using lock variable\n"); + exit(1); + } + else + { + exit(1); + } +} + +void omp_unset_lock(omp_lock_t *arg) +{ + if (arg->lock == OMP_LOCKED) + { + arg->lock = OMP_UNLOCKED; + } + else if (arg->lock == OMP_UNLOCKED) + { + fprintf(stderr, "error: lock not set\n"); + exit(1); + } + else + { + fprintf(stderr, "error: lock not initialized\n"); + exit(1); + } +} + +int omp_test_lock(omp_lock_t *arg) +{ + if (arg->lock == OMP_UNLOCKED) + { + arg->lock = OMP_LOCKED; + return 1; + } + else if (arg->lock == OMP_LOCKED) + { + return 0; + } + else { + fprintf(stderr, "error: lock not initialized\n"); + exit(1); + } +} + 
+void omp_init_nest_lock(omp_nest_lock_t *arg) +{ + arg->owner = OMP_NOOWNER; + arg->count = 0; +} + +void omp_init_nest_lock_with_hint(omp_nest_lock_t *arg, + omp_lock_hint_t hint) +{ + omp_init_nest_lock(arg); +} + +void omp_destroy_nest_lock(omp_nest_lock_t *arg) +{ + arg->owner = OMP_NOOWNER; + arg->count = OMP_UNLOCKED; +} + +void omp_set_nest_lock(omp_nest_lock_t *arg) +{ + if (arg->owner == OMP_MASTER && arg->count >= 1) + { + arg->count++; + } + else if (arg->owner == OMP_NOOWNER && arg->count == 0) + { + arg->owner = OMP_MASTER; + arg->count = 1; + } + else + { + fprintf(stderr, "error: lock corrupted or not initialized\n"); + exit(1); + } +} + +void omp_unset_nest_lock(omp_nest_lock_t *arg) +{ + if (arg->owner == OMP_MASTER && arg->count >= 1) + { + arg->count--; + if (arg->count == 0) + { + arg->owner = OMP_NOOWNER; + } + } + else if (arg->owner == OMP_NOOWNER && arg->count == 0) + { + fprintf(stderr, "error: lock not set\n"); + exit(1); + } + else + { + fprintf(stderr, "error: lock corrupted or not initialized\n"); + exit(1); + } +} + +int omp_test_nest_lock(omp_nest_lock_t *arg) +{ + omp_set_nest_lock(arg); + return arg->count; +} + +double omp_get_wtime(void) +{ + /* This function does not provide a working + * wallclock timer. Replace it with a version + * customized for the target machine. + */ + return 0.0; +} + +double omp_get_wtick(void) +{ + /* This function does not provide a working + * clock tick function. Replace it with + * a version customized for the target machine. + */ + return 365. 
* 86400.; +} + +void * omp_target_alloc(size_t size, int device_num) +{ + if (device_num != -10) + return NULL; + return malloc(size); +} + +void omp_target_free(void *device_ptr, int device_num) +{ + free(device_ptr); +} + +int omp_target_is_present(void *ptr, int device_num) +{ + return 1; +} + +int omp_target_memcpy(void *dst, void *src, size_t length, + size_t dst_offset, size_t src_offset, + int dst_device, int src_device) +{ + // only the default device is valid in a stub + if (dst_device != -10 || src_device != -10 + || ! dst || ! src ) + return EINVAL; + memcpy((char *)dst + dst_offset, + (char *)src + src_offset, + length); + return 0; +} + +int omp_target_memcpy_rect(void *dst, void *src, + size_t element_size, + int num_dims, + const size_t *volume, + const size_t *dst_offsets, + const size_t *src_offsets, + const size_t *dst_dimensions, + const size_t *src_dimensions, + int dst_device_num, int src_device_num) +{ + int ret=0; + // Both null, return number of dimensions supported, + // this stub supports an arbitrary number + if (dst == NULL && src == NULL) return INT_MAX; + + if (!volume || !dst_offsets || !src_offsets + || !dst_dimensions || !src_dimensions + || num_dims < 1 ) { + ret = EINVAL; + goto done; + } + if (num_dims == 1) { + ret = omp_target_memcpy(dst, src, + element_size * volume[0], + dst_offsets[0] * element_size, + src_offsets[0] * element_size, + dst_device_num, src_device_num); + if(ret) goto done; + } else { + size_t dst_slice_size = element_size; + size_t src_slice_size = element_size; + for (int i=1; i < num_dims; i++) { + dst_slice_size *= dst_dimensions[i]; + src_slice_size *= src_dimensions[i]; + } + size_t dst_off = dst_offsets[0] * dst_slice_size; + size_t src_off = src_offsets[0] * src_slice_size; + for (size_t i=0; i < volume[0]; i++) { + ret = omp_target_memcpy_rect( + (char *)dst + dst_off + dst_slice_size*i, + (char *)src + src_off + src_slice_size*i, + element_size, + num_dims - 1, + volume + 1, + dst_offsets + 1, + 
src_offsets + 1, + dst_dimensions + 1, + src_dimensions + 1, + dst_device_num, + src_device_num); + if (ret) goto done; + } + } +done: + return ret; +} + +int omp_target_associate_ptr(void *host_ptr, void *device_ptr, + size_t size, size_t device_offset, + int device_num) +{ + // No association is possible because all host pointers + // are considered present + return EINVAL; +} + +int omp_target_disassociate_ptr(void *ptr, int device_num) +{ + return EINVAL; +} +#endif // !defined(_OPENMP) diff --git a/src/gsParallel/gsOpenMP.h b/src/gsParallel/gsOpenMP.h new file mode 100644 index 0000000000..69a48911b8 --- /dev/null +++ b/src/gsParallel/gsOpenMP.h @@ -0,0 +1,254 @@ +/** @file gsOpenMP.h + + @brief OpenMP stub routines to be used when omp.h is not available + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. 
Moller +*/ + +#pragma once + +#ifdef _OPENMP + +#if _OPENMP >= 202111 +#define GISMO_HAS_OPENMP_52 1 +#else +#define GISMO_HAS_OPENMP_52 0 +#endif + +#if _OPENMP >= 202011 +#define GISMO_HAS_OPENMP_51 1 +#else +#define GISMO_HAS_OPENMP_51 0 +#endif + +#if _OPENMP >= 201811 +#define GISMO_HAS_OPENMP_50 1 +#else +#define GISMO_HAS_OPENMP_50 0 +#endif + +#if _OPENMP >= 201511 +#define GISMO_HAS_OPENMP_45 1 +#else +#define GISMO_HAS_OPENMP_45 0 +#endif + +#if _OPENMP >= 201307 +#define GISMO_HAS_OPENMP_40 1 +#else +#define GISMO_HAS_OPENMP_40 0 +#endif + +#if _OPENMP >= 201107 +#define GISMO_HAS_OPENMP_31 1 +#else +#define GISMO_HAS_OPENMP_31 0 +#endif + +#if _OPENMP >= 200805 +#define GISMO_HAS_OPENMP_30 1 +#else +#define GISMO_HAS_OPENMP_30 0 +#endif + +#if _OPENMP >= 200505 +#define GISMO_HAS_OPENMP_25 1 +#else +#define GISMO_HAS_OPENMP_25 0 +#endif + +#include + +#else + +#define GISMO_HAS_OPENMP_52 0 +#define GISMO_HAS_OPENMP_51 0 +#define GISMO_HAS_OPENMP_50 0 +#define GISMO_HAS_OPENMP_45 0 +#define GISMO_HAS_OPENMP_40 0 +#define GISMO_HAS_OPENMP_31 0 +#define GISMO_HAS_OPENMP_30 0 +#define GISMO_HAS_OPENMP_25 0 + +#include + +void GISMO_EXPORT omp_set_num_threads(int num_threads); + +int GISMO_EXPORT omp_get_num_threads(void); + +int GISMO_EXPORT omp_get_max_threads(void); + +int GISMO_EXPORT omp_get_thread_num(void); + +int GISMO_EXPORT omp_get_num_procs(void); + +int GISMO_EXPORT omp_in_parallel(void); + +void GISMO_EXPORT omp_set_dynamic(int dynamic_threads); + +int GISMO_EXPORT omp_get_dynamic(void); + +int GISMO_EXPORT omp_get_cancellation(void); + +void GISMO_EXPORT omp_set_nested(int nested); + +int GISMO_EXPORT omp_get_nested(void); + +typedef enum omp_sched_t { + omp_sched_static = 1, + omp_sched_dynamic = 2, + omp_sched_guided = 3, + omp_sched_auto = 4, + omp_sched_monotonic = 0x80000000 +} omp_sched_t; + +void GISMO_EXPORT omp_set_schedule(omp_sched_t kind, int chunk_size); + +void GISMO_EXPORT omp_get_schedule(omp_sched_t *kind, int *chunk_size); + 
+int GISMO_EXPORT omp_get_thread_limit(void); + +void GISMO_EXPORT omp_set_max_active_levels(int max_active_levels); + +int GISMO_EXPORT omp_get_max_active_levels(void); + +int GISMO_EXPORT omp_get_level(void); + +int GISMO_EXPORT omp_get_ancestor_thread_num(int level); + +int GISMO_EXPORT omp_get_team_size(int level); + +int GISMO_EXPORT omp_get_active_level(void); + +int GISMO_EXPORT omp_in_final(void); + +typedef enum omp_proc_bind_t { + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +} omp_proc_bind_t; + +omp_proc_bind_t omp_get_proc_bind(void); + +int GISMO_EXPORT omp_get_num_places(void); + +int GISMO_EXPORT omp_get_place_num_procs(int place_num); + +void GISMO_EXPORT omp_get_place_proc_ids(int place_num, int *ids); + +int GISMO_EXPORT omp_get_place_num(void); + +int GISMO_EXPORT omp_get_partition_num_places(void); + +void GISMO_EXPORT omp_get_partition_place_nums(int *place_nums); + +void GISMO_EXPORT omp_set_default_device(int device_num); + +int GISMO_EXPORT omp_get_default_device(void); + +int GISMO_EXPORT omp_get_num_devices(void); + +int GISMO_EXPORT omp_get_num_teams(void); + +int GISMO_EXPORT omp_get_team_num(void); + +int GISMO_EXPORT omp_is_initial_device(void); + +int GISMO_EXPORT omp_get_initial_device(void); + +int GISMO_EXPORT omp_get_max_task_priority(void); + +typedef struct omp_lock_t { + int lock; +} omp_lock_t; + +enum { OMP_UNLOCKED = -1, OMP_INIT, OMP_LOCKED }; + +void GISMO_EXPORT omp_init_lock(omp_lock_t *arg); + +typedef enum omp_sync_hint_t { + omp_sync_hint_none = 0, + omp_lock_hint_none = omp_sync_hint_none, + omp_sync_hint_uncontended = 1, + omp_lock_hint_uncontended = omp_sync_hint_uncontended, + omp_sync_hint_contended = (1<<1), + omp_lock_hint_contended = omp_sync_hint_contended, + omp_sync_hint_nonspeculative = (1<<2), + omp_lock_hint_nonspeculative = omp_sync_hint_nonspeculative, + omp_sync_hint_speculative = (1<<3), + 
omp_lock_hint_speculative = omp_sync_hint_speculative, + kmp_lock_hint_hle = (1<<16), + kmp_lock_hint_rtm = (1<<17), + kmp_lock_hint_adaptive = (1<<18) +} omp_sync_hint_t; + +typedef omp_sync_hint_t omp_lock_hint_t; + +void GISMO_EXPORT omp_init_lock_with_hint(omp_lock_t *arg, omp_lock_hint_t hint); + +void GISMO_EXPORT omp_destroy_lock(omp_lock_t *arg); + +void GISMO_EXPORT omp_set_lock(omp_lock_t *arg); + +void GISMO_EXPORT omp_unset_lock(omp_lock_t *arg); + +int GISMO_EXPORT omp_test_lock(omp_lock_t *arg); + +typedef struct omp_nest_lock_t { + int owner; + int count; +} omp_nest_lock_t; + +enum { OMP_NOOWNER = -1, OMP_MASTER = 0 }; + +void GISMO_EXPORT omp_init_nest_lock(omp_nest_lock_t *arg); + +void GISMO_EXPORT omp_init_nest_lock_with_hint(omp_nest_lock_t *arg, + omp_lock_hint_t hint); + +void GISMO_EXPORT omp_destroy_nest_lock(omp_nest_lock_t *arg); + +void GISMO_EXPORT omp_set_nest_lock(omp_nest_lock_t *arg); + +void GISMO_EXPORT omp_unset_nest_lock(omp_nest_lock_t *arg); + +int GISMO_EXPORT omp_test_nest_lock(omp_nest_lock_t *arg); + +double omp_get_wtime(void); + +double omp_get_wtick(void); + +void * omp_target_alloc(size_t size, int device_num); + +void GISMO_EXPORT omp_target_free(void *device_ptr, int device_num); + +int GISMO_EXPORT omp_target_is_present(void *ptr, int device_num); + +int GISMO_EXPORT omp_target_memcpy(void *dst, void *src, size_t length, + size_t dst_offset, size_t src_offset, + int dst_device, int src_device); + +int GISMO_EXPORT omp_target_memcpy_rect(void *dst, void *src, + size_t element_size, + int num_dims, + const size_t *volume, + const size_t *dst_offsets, + const size_t *src_offsets, + const size_t *dst_dimensions, + const size_t *src_dimensions, + int dst_device_num, int src_device_num); + +int GISMO_EXPORT omp_target_associate_ptr(void *host_ptr, void *device_ptr, + size_t size, size_t device_offset, + int device_num); + +int GISMO_EXPORT omp_target_disassociate_ptr(void *ptr, int device_num); +#endif // _OPENMP diff 
--git a/src/gsUtils/gsUtils.h b/src/gsUtils/gsUtils.h index 17900d6586..de0552b3e0 100644 --- a/src/gsUtils/gsUtils.h +++ b/src/gsUtils/gsUtils.h @@ -13,12 +13,17 @@ #pragma once -#include +#include +#include #include +#include +#include +#include #include #include #include +#include #ifdef __GNUC__ #include @@ -37,7 +42,7 @@ namespace gismo */ namespace util { - + #if __cplusplus >= 201103L || _MSC_VER >= 1600 template // we catch up char arrays std::string to_string(C (& value)[N]) @@ -59,6 +64,16 @@ std::string to_string(const C & value) return convert.str(); } +/// \brief Converts value to string, assuming "operator<<" defined on C +/// \ingroup Utils +template +std::string to_string(const C & value, int digits) +{ + std::ostringstream convert; + convert << std::scientific << std::setprecision(digits) << value; + return convert.str(); +} + /// \brief Checks if a string \a haystack begins with the string \a needle /// \ingroup Utils inline bool starts_with( const std::string & haystack, const std::string & needle ) @@ -283,6 +298,160 @@ size_t size(const T& t) } #endif +#if __cplusplus >= 201300L +template +using integer_sequence = std::integer_sequence; + +template +using index_sequence = std::index_sequence; + +template +using make_integer_sequence = std::make_integer_sequence; + +template +using make_index_sequence = std::make_index_sequence; + +template +using index_sequence_for = std::index_sequence_for; + +#else + +/// \brief Backport of std::integer_sequence from C++14 +template +struct integer_sequence +{ + typedef T value_type; + static constexpr std::size_t size() { return sizeof...(Ints); } +}; + +/// \brief Backport of std::index_sequence from C++14 +template +using index_sequence = integer_sequence; + +/// \brief Backport of std::make_integer_sequence from C++14 +//@{ +template +struct make_integer_sequence : make_integer_sequence {}; + +template +struct make_integer_sequence : integer_sequence {}; + +template +using make_index_sequence = 
make_integer_sequence; +//@} + +/// \brief Backport of std::index_sequence_for from C++14 +template +using index_sequence_for = make_index_sequence; +#endif + +namespace // anonymous namespace +{ + +template +class zip_helper { +public: + class iterator + : std::iterator().begin())...>> { + private: + std::tuple().begin())...> iters_; + + template + auto deref(index_sequence) + -> decltype(typename iterator::value_type{*std::get(iters_)...}) + const { + return typename iterator::value_type{*std::get(iters_)...}; + } + + template + void increment(index_sequence) { + auto l = {(++std::get(iters_), 0)...}; + GISMO_UNUSED(l); + } + + public: + explicit iterator(decltype(iters_) iters) : iters_{std::move(iters)} {} + + iterator& operator++() { + increment(index_sequence_for{}); + return *this; + } + + iterator operator++(int) { + auto saved{*this}; + increment(index_sequence_for{}); + return saved; + } + + bool operator!=(const iterator& other) const { + return iters_ != other.iters_; + } + + auto operator*() + -> decltype(deref(index_sequence_for{})) + const { return deref(index_sequence_for{}); } + }; + + zip_helper(T&... seqs) + : begin_{std::make_tuple(seqs.begin()...)}, + end_{std::make_tuple(seqs.end()...)} {} + + iterator begin() const { return begin_; } + iterator end() const { return end_; } + +private: + iterator begin_; + iterator end_; +}; + +} // end anonymous namespace + +/// \brief Creates a zip iterator +template +auto zip(T&&... seqs) + -> zip_helper +{ + return zip_helper{seqs...}; +} + +namespace // anonymous +{ +template +std::ostringstream& tuple_to_stream(std::ostringstream &oss, T &&arg) { + oss << arg; + return oss; +} + +template +std::ostringstream& tuple_to_stream(std::ostringstream &oss, First &&firstArg, Rest &&... restArgs) { + oss << firstArg << ", "; + return tuple_to_stream(oss, std::forward(restArgs)...); +} + +template +std::string tuple_to_string(Types &&... 
args) { + std::ostringstream oss; + oss << '['; + tuple_to_stream(oss, std::forward(args)...); + oss << ']'; + return oss.str(); +} + +template +std::string tuple_to_string_cxx11_compatibility(const Tuple &tuple, util::index_sequence) { + return tuple_to_string(std::get(tuple)...); +}; + +} // end anonymous namespace + +/// \brief Converts tuple to string, assuming "operator<<" defined on all items +/// \ingroup Utils +template +std::string to_string(const std::tuple &tuple) { + return tuple_to_string_cxx11_compatibility(tuple, util::make_index_sequence{}); +}; + } // end namespace util // This macro assumes the operators == and < to be present and