From 5316b3f3ef4dc1ab1a72ae5a87d980ad4980fa2d Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Tue, 3 Nov 2020 16:56:39 +0100 Subject: [PATCH 001/174] Added gsXBraid extension --- CMakeLists.txt | 7 +++ cmake/gsOptions.cmake | 5 ++ extensions/gsXBraid/CMakeLists.txt | 71 +++++++++++++++++++++ extensions/gsXBraid/gsXBraid.h | 99 ++++++++++++++++++++++++++++++ extensions/gsXBraid/gsXBraid.hpp | 18 ++++++ extensions/gsXBraid/gsXBraid_.cpp | 4 ++ 6 files changed, 204 insertions(+) create mode 100644 extensions/gsXBraid/CMakeLists.txt create mode 100644 extensions/gsXBraid/gsXBraid.h create mode 100644 extensions/gsXBraid/gsXBraid.hpp create mode 100644 extensions/gsXBraid/gsXBraid_.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 72a7661ef3..c7f8a129f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -290,6 +290,13 @@ if(GISMO_WITH_SMESH) #include_directories(${SMESH_INCLUDE_DIR}) endif() +if(GISMO_WITH_XBRAID) + add_subdirectory(extensions/gsXBraid) + set (GISMO_INCLUDE_DIRS ${GISMO_INCLUDE_DIRS} ${XBRAID_INCLUDE_DIR} + CACHE INTERNAL "${PROJECT_NAME} include directories") + #include_directories(${XBRAID_INCLUDE_DIR}) +endif(GISMO_WITH_XBRAID) + #second time include_directories(${GISMO_INCLUDE_DIRS}) diff --git a/cmake/gsOptions.cmake b/cmake/gsOptions.cmake index f2d114de0a..a4cff31029 100644 --- a/cmake/gsOptions.cmake +++ b/cmake/gsOptions.cmake @@ -174,6 +174,11 @@ if (${GISMO_WITH_UNUM}) message (" GISMO_WITH_UNUM ${GISMO_WITH_UNUM}") endif() +option(GISMO_WITH_XBRAID "With XBraid" false ) +if (${GISMO_WITH_XBRAID}) +message (" GISMO_WITH_XBRAID ${GISMO_WITH_XBRAID}") +endif() + ## ################################################################# ## Options list: Extra options ## ################################################################# diff --git a/extensions/gsXBraid/CMakeLists.txt b/extensions/gsXBraid/CMakeLists.txt new file mode 100644 index 0000000000..44a1567e38 --- /dev/null +++ b/extensions/gsXBraid/CMakeLists.txt @@ -0,0 +1,71 @@ 
+### CMakeLists.txt --- +## +## Author: Angelos Mantzaflaris +## Copyright (C) 2016 - RICAM-Linz. +###################################################################### + +## XBraid extension +project(gsXBraidExtension) + +# Collect file names +aux_header_directory(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_HEADERS) +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_SOURCES) +aux_tmpl_header_directory(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_HPPFILES) + +# Apply same configuration as G+Smo +include(gsConfig) + +# Look for pre-installed XBraid libraries +find_package(XBRAID QUIET) + +if (NOT XBRAID_FOUND) + + set(XBRAID_VER "v3.0.0") + gismo_fetch_directory(XBraid + URL https://github.com/XBraid/xbraid/archive/${XBRAID_VER}.zip + DESTINATION external + ) + + if( (NOT GISMO_BUILD_LIB) ) + aux_instance_directory (${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_INS) + if(${PROJECT_NAME}_INS) + LIST( REMOVE_ITEM ${PROJECT_NAME}_CPP ${${PROJECT_NAME}_INS}) + endif() + endif() + + aux_header_directory(${gismo_externals}/XBraid/braid ${PROJECT_NAME}_HEADERS_XBRAID) + aux_source_directory(${gismo_externals}/XBraid/braid ${PROJECT_NAME}_SOURCES_XBRAID) + aux_tmpl_header_directory(${gismo_externals} ${PROJECT_NAME}_HPPFILES_XBRAID) + + include_directories(${gismo_externals}/XBraid/braid) + +endif (NOT XBRAID_FOUND) + +# Add object library +add_library(${PROJECT_NAME} OBJECT + ${${PROJECT_NAME}_HEADERS} + ${${PROJECT_NAME}_HPPFILES} + ${${PROJECT_NAME}_SOURCES} + ${${PROJECT_NAME}_HEADERS_XBRAID} + ${${PROJECT_NAME}_HPPFILES_XBRAID} + ${${PROJECT_NAME}_SOURCES_XBRAID} + ) + +set_target_properties(${PROJECT_NAME} PROPERTIES + COMPILE_DEFINITIONS gismo_EXPORTS + POSITION_INDEPENDENT_CODE ON + LINKER_LANGUAGE CXX + FOLDER "G+Smo extensions" ) + +if( GISMO_WITH_MPI ) + target_include_directories(${PROJECT_NAME} PRIVATE ${MPI_INCLUDE_PATH}) +else() + add_definitions("-Dbraid_SEQUENTIAL") +endif() + +set(gismo_EXTENSIONS ${gismo_EXTENSIONS} $ + CACHE INTERNAL "Gismo 
extensions to be included") + +install(DIRECTORY ${PROJECT_SOURCE_DIR} + DESTINATION include/gismo/gsXBraid + FILES_MATCHING PATTERN "*.h") diff --git a/extensions/gsXBraid/gsXBraid.h b/extensions/gsXBraid/gsXBraid.h new file mode 100644 index 0000000000..0744576f16 --- /dev/null +++ b/extensions/gsXBraid/gsXBraid.h @@ -0,0 +1,99 @@ +/** @file gsXBraid.h + + @brief Provides declarations of the XBraid wrapper + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#include + +namespace gismo { + + /** + \brief Class defining the XBraid wrapper + */ + + class gsXBraid : public BraidApp + { + public: + + // Default constructor + gsXBraid() = delete; + + // Constructor + gsXBraid(MPI_Comm comm, + int rank, + double start, + double stop, + int timesteps); + + // Destructor + virtual ~gsXBraid(); + + // Define all the Braid Wrapper routines + virtual int Step(braid_Vector u_, + braid_Vector ustop_, + braid_Vector fstop_, + BraidStepStatus &pstatus) = 0; + + virtual int Clone(braid_Vector u_, + braid_Vector *v_ptr) = 0 ; + + virtual int Init(double t, + braid_Vector *u_ptr) = 0; + + virtual int Free(braid_Vector u_) = 0; + + virtual int Sum(double alpha, + braid_Vector x_, + double beta, + braid_Vector y_) = 0; + + virtual int SpatialNorm(braid_Vector u_, + double *norm_ptr) = 0; + + virtual int BufSize(int *size_ptr, + BraidBufferStatus &status) = 0; + + virtual int BufPack(braid_Vector u_, + void *buffer, + BraidBufferStatus &status) = 0; + + virtual int BufUnpack(void *buffer, + braid_Vector *u_ptr, + BraidBufferStatus &status) = 0; + + virtual int Access(braid_Vector u_, + BraidAccessStatus &astatus) = 0; + + // Not needed in this example + virtual int Residual(braid_Vector u_, + braid_Vector r_, + BraidStepStatus &pstatus) = 0; + + // Not needed in 
this example + virtual int Coarsen(braid_Vector fu_, + braid_Vector *cu_ptr, + BraidCoarsenRefStatus &status) = 0; + + // Not needed in this example + virtual int Refine(braid_Vector cu_, + braid_Vector *fu_ptr, + BraidCoarsenRefStatus &status) = 0; + + protected: + int rank; + + }; + +}// namespace gismo + +#ifndef GISMO_BUILD_LIB +#include GISMO_HPP_HEADER(gsXBraid.hpp) +#endif diff --git a/extensions/gsXBraid/gsXBraid.hpp b/extensions/gsXBraid/gsXBraid.hpp new file mode 100644 index 0000000000..05581177b0 --- /dev/null +++ b/extensions/gsXBraid/gsXBraid.hpp @@ -0,0 +1,18 @@ +/** @file gsXBraid.hpp + + @brief Provides implementations of the XBraid wrapper. + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#pragma once + +namespace gismo { + +}// namespace gismo diff --git a/extensions/gsXBraid/gsXBraid_.cpp b/extensions/gsXBraid/gsXBraid_.cpp new file mode 100644 index 0000000000..d67c74258c --- /dev/null +++ b/extensions/gsXBraid/gsXBraid_.cpp @@ -0,0 +1,4 @@ + +#include +#include +#include From ca17c7acc6ee3c541f099aee8e4a18cb153c0a29 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Tue, 3 Nov 2020 16:56:39 +0100 Subject: [PATCH 002/174] Added gsXBraid extension --- CMakeLists.txt | 7 +++ cmake/gsOptions.cmake | 5 ++ extensions/gsXBraid/CMakeLists.txt | 71 +++++++++++++++++++++ extensions/gsXBraid/gsXBraid.h | 99 ++++++++++++++++++++++++++++++ extensions/gsXBraid/gsXBraid.hpp | 18 ++++++ extensions/gsXBraid/gsXBraid_.cpp | 4 ++ 6 files changed, 204 insertions(+) create mode 100644 extensions/gsXBraid/CMakeLists.txt create mode 100644 extensions/gsXBraid/gsXBraid.h create mode 100644 extensions/gsXBraid/gsXBraid.hpp create mode 100644 extensions/gsXBraid/gsXBraid_.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 
dd5fcd0ec7..988e8a80bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -265,6 +265,13 @@ if(GISMO_WITH_SMESH) #include_directories(${SMESH_INCLUDE_DIR}) endif() +if(GISMO_WITH_XBRAID) + add_subdirectory(extensions/gsXBraid) + set (GISMO_INCLUDE_DIRS ${GISMO_INCLUDE_DIRS} ${XBRAID_INCLUDE_DIR} + CACHE INTERNAL "${PROJECT_NAME} include directories") + #include_directories(${XBRAID_INCLUDE_DIR}) +endif(GISMO_WITH_XBRAID) + #second time include_directories(${GISMO_INCLUDE_DIRS}) diff --git a/cmake/gsOptions.cmake b/cmake/gsOptions.cmake index 2e96735736..2006a6a1e1 100644 --- a/cmake/gsOptions.cmake +++ b/cmake/gsOptions.cmake @@ -174,6 +174,11 @@ if (${GISMO_WITH_UNUM}) message (" GISMO_WITH_UNUM ${GISMO_WITH_UNUM}") endif() +option(GISMO_WITH_XBRAID "With XBraid" false ) +if (${GISMO_WITH_XBRAID}) +message (" GISMO_WITH_XBRAID ${GISMO_WITH_XBRAID}") +endif() + ## ################################################################# ## Options list: Extra options ## ################################################################# diff --git a/extensions/gsXBraid/CMakeLists.txt b/extensions/gsXBraid/CMakeLists.txt new file mode 100644 index 0000000000..44a1567e38 --- /dev/null +++ b/extensions/gsXBraid/CMakeLists.txt @@ -0,0 +1,71 @@ +### CMakeLists.txt --- +## +## Author: Angelos Mantzaflaris +## Copyright (C) 2016 - RICAM-Linz. 
+###################################################################### + +## XBraid extension +project(gsXBraidExtension) + +# Collect file names +aux_header_directory(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_HEADERS) +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_SOURCES) +aux_tmpl_header_directory(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_HPPFILES) + +# Apply same configuration as G+Smo +include(gsConfig) + +# Look for pre-installed XBraid libraries +find_package(XBRAID QUIET) + +if (NOT XBRAID_FOUND) + + set(XBRAID_VER "v3.0.0") + gismo_fetch_directory(XBraid + URL https://github.com/XBraid/xbraid/archive/${XBRAID_VER}.zip + DESTINATION external + ) + + if( (NOT GISMO_BUILD_LIB) ) + aux_instance_directory (${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_INS) + if(${PROJECT_NAME}_INS) + LIST( REMOVE_ITEM ${PROJECT_NAME}_CPP ${${PROJECT_NAME}_INS}) + endif() + endif() + + aux_header_directory(${gismo_externals}/XBraid/braid ${PROJECT_NAME}_HEADERS_XBRAID) + aux_source_directory(${gismo_externals}/XBraid/braid ${PROJECT_NAME}_SOURCES_XBRAID) + aux_tmpl_header_directory(${gismo_externals} ${PROJECT_NAME}_HPPFILES_XBRAID) + + include_directories(${gismo_externals}/XBraid/braid) + +endif (NOT XBRAID_FOUND) + +# Add object library +add_library(${PROJECT_NAME} OBJECT + ${${PROJECT_NAME}_HEADERS} + ${${PROJECT_NAME}_HPPFILES} + ${${PROJECT_NAME}_SOURCES} + ${${PROJECT_NAME}_HEADERS_XBRAID} + ${${PROJECT_NAME}_HPPFILES_XBRAID} + ${${PROJECT_NAME}_SOURCES_XBRAID} + ) + +set_target_properties(${PROJECT_NAME} PROPERTIES + COMPILE_DEFINITIONS gismo_EXPORTS + POSITION_INDEPENDENT_CODE ON + LINKER_LANGUAGE CXX + FOLDER "G+Smo extensions" ) + +if( GISMO_WITH_MPI ) + target_include_directories(${PROJECT_NAME} PRIVATE ${MPI_INCLUDE_PATH}) +else() + add_definitions("-Dbraid_SEQUENTIAL") +endif() + +set(gismo_EXTENSIONS ${gismo_EXTENSIONS} $ + CACHE INTERNAL "Gismo extensions to be included") + +install(DIRECTORY ${PROJECT_SOURCE_DIR} + DESTINATION 
include/gismo/gsXBraid + FILES_MATCHING PATTERN "*.h") diff --git a/extensions/gsXBraid/gsXBraid.h b/extensions/gsXBraid/gsXBraid.h new file mode 100644 index 0000000000..0744576f16 --- /dev/null +++ b/extensions/gsXBraid/gsXBraid.h @@ -0,0 +1,99 @@ +/** @file gsXBraid.h + + @brief Provides declarations of the XBraid wrapper + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#include + +namespace gismo { + + /** + \brief Class defining the XBraid wrapper + */ + + class gsXBraid : public BraidApp + { + public: + + // Default constructor + gsXBraid() = delete; + + // Constructor + gsXBraid(MPI_Comm comm, + int rank, + double start, + double stop, + int timesteps); + + // Destructor + virtual ~gsXBraid(); + + // Define all the Braid Wrapper routines + virtual int Step(braid_Vector u_, + braid_Vector ustop_, + braid_Vector fstop_, + BraidStepStatus &pstatus) = 0; + + virtual int Clone(braid_Vector u_, + braid_Vector *v_ptr) = 0 ; + + virtual int Init(double t, + braid_Vector *u_ptr) = 0; + + virtual int Free(braid_Vector u_) = 0; + + virtual int Sum(double alpha, + braid_Vector x_, + double beta, + braid_Vector y_) = 0; + + virtual int SpatialNorm(braid_Vector u_, + double *norm_ptr) = 0; + + virtual int BufSize(int *size_ptr, + BraidBufferStatus &status) = 0; + + virtual int BufPack(braid_Vector u_, + void *buffer, + BraidBufferStatus &status) = 0; + + virtual int BufUnpack(void *buffer, + braid_Vector *u_ptr, + BraidBufferStatus &status) = 0; + + virtual int Access(braid_Vector u_, + BraidAccessStatus &astatus) = 0; + + // Not needed in this example + virtual int Residual(braid_Vector u_, + braid_Vector r_, + BraidStepStatus &pstatus) = 0; + + // Not needed in this example + virtual int Coarsen(braid_Vector fu_, + braid_Vector *cu_ptr, + 
BraidCoarsenRefStatus &status) = 0; + + // Not needed in this example + virtual int Refine(braid_Vector cu_, + braid_Vector *fu_ptr, + BraidCoarsenRefStatus &status) = 0; + + protected: + int rank; + + }; + +}// namespace gismo + +#ifndef GISMO_BUILD_LIB +#include GISMO_HPP_HEADER(gsXBraid.hpp) +#endif diff --git a/extensions/gsXBraid/gsXBraid.hpp b/extensions/gsXBraid/gsXBraid.hpp new file mode 100644 index 0000000000..05581177b0 --- /dev/null +++ b/extensions/gsXBraid/gsXBraid.hpp @@ -0,0 +1,18 @@ +/** @file gsXBraid.hpp + + @brief Provides implementations of the XBraid wrapper. + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#pragma once + +namespace gismo { + +}// namespace gismo diff --git a/extensions/gsXBraid/gsXBraid_.cpp b/extensions/gsXBraid/gsXBraid_.cpp new file mode 100644 index 0000000000..d67c74258c --- /dev/null +++ b/extensions/gsXBraid/gsXBraid_.cpp @@ -0,0 +1,4 @@ + +#include +#include +#include From 29477e1e4915a812e618224fab84ea3f871fe741 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Fri, 13 Nov 2020 17:48:34 +0100 Subject: [PATCH 003/174] Fixed bug in XBraid extension --- extensions/gsXBraid/CMakeLists.txt | 46 ++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/extensions/gsXBraid/CMakeLists.txt b/extensions/gsXBraid/CMakeLists.txt index 44a1567e38..a93f6e5f71 100644 --- a/extensions/gsXBraid/CMakeLists.txt +++ b/extensions/gsXBraid/CMakeLists.txt @@ -33,9 +33,46 @@ if (NOT XBRAID_FOUND) endif() endif() - aux_header_directory(${gismo_externals}/XBraid/braid ${PROJECT_NAME}_HEADERS_XBRAID) - aux_source_directory(${gismo_externals}/XBraid/braid ${PROJECT_NAME}_SOURCES_XBRAID) - aux_tmpl_header_directory(${gismo_externals} ${PROJECT_NAME}_HPPFILES_XBRAID) + 
set(${PROJECT_NAME}_HEADERS "${${PROJECT_NAME}_HEADERS}" + ${gismo_externals}/XBraid/braid/_braid.h + ${gismo_externals}/XBraid/braid/base.h + ${gismo_externals}/XBraid/braid/status.h + ${gismo_externals}/XBraid/braid/tape.h + ${gismo_externals}/XBraid/braid/util.h + ${gismo_externals}/XBraid/braid/braid.h + ${gismo_externals}/XBraid/braid/braid_status.h + ${gismo_externals}/XBraid/braid/braid_test.h) + + if(NOT GISMO_WITH_MPI ) + set(${PROJECT_NAME}_HEADERS "${${PROJECT_NAME}_HEADERS}" + ${gismo_externals}/XBraid/braid/mpistubs.h) + set(${PROJECT_NAME}_SOURCES "${${PROJECT_NAME}_SOURCES}" + ${gismo_externals}/XBraid/braid/mpistubs.c) + endif() + + set(${PROJECT_NAME}_SOURCES "${${PROJECT_NAME}_SOURCES}" + ${gismo_externals}/XBraid/braid/access.c + ${gismo_externals}/XBraid/braid/adjoint.c + ${gismo_externals}/XBraid/braid/base.c + ${gismo_externals}/XBraid/braid/braid.c + ${gismo_externals}/XBraid/braid/braid_status.c + ${gismo_externals}/XBraid/braid/braid_test.c + ${gismo_externals}/XBraid/braid/communication.c + ${gismo_externals}/XBraid/braid/distribution.c + ${gismo_externals}/XBraid/braid/drive.c + ${gismo_externals}/XBraid/braid/grid.c + ${gismo_externals}/XBraid/braid/hierarchy.c + ${gismo_externals}/XBraid/braid/interp.c + ${gismo_externals}/XBraid/braid/norm.c + ${gismo_externals}/XBraid/braid/refine.c + ${gismo_externals}/XBraid/braid/relax.c + ${gismo_externals}/XBraid/braid/residual.c + ${gismo_externals}/XBraid/braid/restrict.c + ${gismo_externals}/XBraid/braid/space.c + ${gismo_externals}/XBraid/braid/step.c + ${gismo_externals}/XBraid/braid/tape.c + ${gismo_externals}/XBraid/braid/util.c + ${gismo_externals}/XBraid/braid/uvector.c) include_directories(${gismo_externals}/XBraid/braid) @@ -46,9 +83,6 @@ add_library(${PROJECT_NAME} OBJECT ${${PROJECT_NAME}_HEADERS} ${${PROJECT_NAME}_HPPFILES} ${${PROJECT_NAME}_SOURCES} - ${${PROJECT_NAME}_HEADERS_XBRAID} - ${${PROJECT_NAME}_HPPFILES_XBRAID} - ${${PROJECT_NAME}_SOURCES_XBRAID} ) 
set_target_properties(${PROJECT_NAME} PROPERTIES From de932727af980f3b541d63f9d5be5ae8b18669cb Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Fri, 20 Nov 2020 23:23:58 +0100 Subject: [PATCH 004/174] Updates gsXBraid extension and added xbraid_example --- CMakeLists.txt | 3 - examples/xbraid_example.cpp | 147 +++++++++++++++++++++++++ extensions/gsXBraid/CMakeLists.txt | 32 +++++- extensions/gsXBraid/gsXBraid.h | 171 +++++++++++++++++++++-------- extensions/gsXBraid/gsXBraid.hpp | 21 ++++ extensions/gsXBraid/gsXBraid_.cpp | 7 ++ src/gsCore/gsConfig.h.in | 1 + 7 files changed, 326 insertions(+), 56 deletions(-) create mode 100644 examples/xbraid_example.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 988e8a80bf..3034a474c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -267,9 +267,6 @@ endif() if(GISMO_WITH_XBRAID) add_subdirectory(extensions/gsXBraid) - set (GISMO_INCLUDE_DIRS ${GISMO_INCLUDE_DIRS} ${XBRAID_INCLUDE_DIR} - CACHE INTERNAL "${PROJECT_NAME} include directories") - #include_directories(${XBRAID_INCLUDE_DIR}) endif(GISMO_WITH_XBRAID) #second time diff --git a/examples/xbraid_example.cpp b/examples/xbraid_example.cpp new file mode 100644 index 0000000000..b48cb209d8 --- /dev/null +++ b/examples/xbraid_example.cpp @@ -0,0 +1,147 @@ +/** @file xbraid_example.cpp + + @brief XBraid integration + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): A. Mantzaflaris, M. 
Moeller +*/ + +#include +#include + +using namespace gismo; + +#ifdef GISMO_WITH_XBRAID + +namespace gismo { + +/** + \brief Derived class implementing the XBraid wrapper for the heat equation +*/ +template +class gsXBraid_app : public gsXBraid +{ + public: + /// Inherit all constructors from base class + using gsXBraid::gsXBraid; + + /// Creates instance from command line argument + static inline gsXBraid_app create(const gsMpiComm& comm, + int argc, + char** argv) + { + index_t numRefine = 5; + index_t numElevate = 0; + index_t numTime = 1; + T tfinal = 1.0; + std::string fn("pde/poisson2d_bvp.xml"); + + gsCmdLine cmd("Tutorial on solving a Heat equation problem using parallel-in-time multigrid."); + + cmd.addInt( "e", "degreeElevation", + "Number of degree elevation steps to perform before solving (0: equalize degree in all directions)", numElevate ); + cmd.addInt( "r", "uniformRefine", "Number of uniform h-refinement steps to perform before solving", numRefine ); + cmd.addInt( "n", "timeSteps", "Number of parallel-in-time steps", numTime ); + cmd.addString( "f", "file", "Input XML file", fn ); + cmd.addReal( "t", "time", "Final time", tfinal ); + + cmd.getValues(argc,argv); + + return gsXBraid_app(comm, 0.0, tfinal, numTime); + } + + /// Destructor + ~gsXBraid_app() + {} + + int Step(braid_Vector u, + braid_Vector ustop, + braid_Vector fstop, + BraidStepStatus &pstatus) override + {} + + int Clone(braid_Vector u, + braid_Vector *v_ptr) override + {} + + int Init(T t, + braid_Vector *u_ptr) override + {} + + int Free(braid_Vector u) override + {} + + int Sum(T alpha, + braid_Vector x, + T beta, + braid_Vector y) override + {} + + int SpatialNorm(braid_Vector u, + T *norm_ptr) override + {} + + int BufSize(index_t *size_ptr, + BraidBufferStatus &status) override + {} + + int BufPack(braid_Vector u, + void *buffer, + BraidBufferStatus &status) override + {} + + int BufUnpack(void *buffer, + braid_Vector *u_ptr, + BraidBufferStatus &status) override + {} + + int 
Access(braid_Vector u, + BraidAccessStatus &astatus) override + {} + + // Not needed in this example + int Residual(braid_Vector u, + braid_Vector r, + BraidStepStatus &pstatus) override + {} + + // Not needed in this example + int Coarsen(braid_Vector fu, + braid_Vector *cu_ptr, + BraidCoarsenRefStatus &status) override + {} + + // Not needed in this example + int Refine(braid_Vector cu, + braid_Vector *fu_ptr, + BraidCoarsenRefStatus &status) override + {} +}; + +} // ending namespace gismo + +#endif + +int main(int argc, char**argv) +{ + // Initialize the MPI environment and obtain the world communicator + gsMpiComm comm = gsMpi::init(argc, argv).worldComm(); + +#ifdef GISMO_WITH_XBRAID + + // Set up app structure + gsXBraid_app app = gsXBraid_app::create(comm, argc, argv); + + // Perform parallel-in-time multigrid + app.solve(); + +#endif + + return 0; + +} diff --git a/extensions/gsXBraid/CMakeLists.txt b/extensions/gsXBraid/CMakeLists.txt index a93f6e5f71..ac95437874 100644 --- a/extensions/gsXBraid/CMakeLists.txt +++ b/extensions/gsXBraid/CMakeLists.txt @@ -15,14 +15,22 @@ aux_tmpl_header_directory(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_HPPFILES) # Apply same configuration as G+Smo include(gsConfig) +if(CMAKE_C_COMPILER_ID MATCHES "MSVC") + add_definitions(-D_CRT_NONSTDC_NO_WARNINGS) + add_definitions(-D_CRT_SECURE_NO_WARNINGS) +endif() + # Look for pre-installed XBraid libraries find_package(XBRAID QUIET) if (NOT XBRAID_FOUND) - + # Set XBraid version set(XBRAID_VER "v3.0.0") + + # Download XBraid sources at configure time + include(gsFetch) gismo_fetch_directory(XBraid - URL https://github.com/XBraid/xbraid/archive/${XBRAID_VER}.zip + URL https://github.com/XBraid/xbraid/archive/${XBRAID_VER}.zip DESTINATION external ) @@ -33,6 +41,7 @@ if (NOT XBRAID_FOUND) endif() endif() + # Set XBraid library header files set(${PROJECT_NAME}_HEADERS "${${PROJECT_NAME}_HEADERS}" ${gismo_externals}/XBraid/braid/_braid.h ${gismo_externals}/XBraid/braid/base.h @@ 
-50,6 +59,7 @@ if (NOT XBRAID_FOUND) ${gismo_externals}/XBraid/braid/mpistubs.c) endif() + # Set XBraid library sources files set(${PROJECT_NAME}_SOURCES "${${PROJECT_NAME}_SOURCES}" ${gismo_externals}/XBraid/braid/access.c ${gismo_externals}/XBraid/braid/adjoint.c @@ -74,22 +84,26 @@ if (NOT XBRAID_FOUND) ${gismo_externals}/XBraid/braid/util.c ${gismo_externals}/XBraid/braid/uvector.c) - include_directories(${gismo_externals}/XBraid/braid) +# Set XBraid library include files + set(XBRAID_INCLUDE_DIR ${gismo_externals}/XBraid/braid CACHE INTERNAL "") + include_directories(${XBRAID_INCLUDE_DIR}) endif (NOT XBRAID_FOUND) -# Add object library +# Compile gsXBraid extension as part of the G+Smo library add_library(${PROJECT_NAME} OBJECT ${${PROJECT_NAME}_HEADERS} ${${PROJECT_NAME}_HPPFILES} ${${PROJECT_NAME}_SOURCES} ) +# Set standard properties for all G+Smo extensions set_target_properties(${PROJECT_NAME} PROPERTIES COMPILE_DEFINITIONS gismo_EXPORTS POSITION_INDEPENDENT_CODE ON LINKER_LANGUAGE CXX - FOLDER "G+Smo extensions" ) + FOLDER "G+Smo extensions" + ) if( GISMO_WITH_MPI ) target_include_directories(${PROJECT_NAME} PRIVATE ${MPI_INCLUDE_PATH}) @@ -97,9 +111,15 @@ else() add_definitions("-Dbraid_SEQUENTIAL") endif() +# Add gsXBraid extension to the list of G+Smo extensions set(gismo_EXTENSIONS ${gismo_EXTENSIONS} $ - CACHE INTERNAL "Gismo extensions to be included") + CACHE INTERNAL "gismo extensions to be included") + +# Add XBraid include directories to G+Smo standard include directories +set (GISMO_INCLUDE_DIRS ${GISMO_INCLUDE_DIRS} ${XBRAID_INCLUDE_DIR} + CACHE INTERNAL "gismo include directories") +# Install gsXBraid header files install(DIRECTORY ${PROJECT_SOURCE_DIR} DESTINATION include/gismo/gsXBraid FILES_MATCHING PATTERN "*.h") diff --git a/extensions/gsXBraid/gsXBraid.h b/extensions/gsXBraid/gsXBraid.h index 0744576f16..ed43585490 100644 --- a/extensions/gsXBraid/gsXBraid.h +++ b/extensions/gsXBraid/gsXBraid.h @@ -11,6 +11,15 @@ Author(s): M. 
Moller */ +#pragma once + +#include +#include + +#if !defined(GISMO_WITH_MPI) +#define braid_SEQUENTIAL 1 +#endif + #include namespace gismo { @@ -19,77 +28,145 @@ namespace gismo { \brief Class defining the XBraid wrapper */ + template class gsXBraid : public BraidApp { public: - // Default constructor - gsXBraid() = delete; - - // Constructor - gsXBraid(MPI_Comm comm, - int rank, - double start, - double stop, - int timesteps); + /// Constructor + gsXBraid(const gsMpiComm& comm, + const T& tstart, + const T& tstop, + int ntime); - // Destructor + /// Destructor virtual ~gsXBraid(); - // Define all the Braid Wrapper routines - virtual int Step(braid_Vector u_, - braid_Vector ustop_, - braid_Vector fstop_, + // Performs one time step + virtual int Step(braid_Vector u, + braid_Vector ustop, + braid_Vector fstop, BraidStepStatus &pstatus) = 0; - - virtual int Clone(braid_Vector u_, + + // Clones the given vectors + virtual int Clone(braid_Vector u, braid_Vector *v_ptr) = 0 ; - - virtual int Init(double t, + + // Initializes the given vector + virtual int Init(T t, braid_Vector *u_ptr) = 0; - - virtual int Free(braid_Vector u_) = 0; - - virtual int Sum(double alpha, - braid_Vector x_, - double beta, - braid_Vector y_) = 0; - virtual int SpatialNorm(braid_Vector u_, - double *norm_ptr) = 0; + // Finalizes the given vector + virtual int Free(braid_Vector u) = 0; + + // Computes the weighted sum of two given vectors + virtual int Sum(T alpha, + braid_Vector x, + T beta, + braid_Vector y) = 0; - virtual int BufSize(int *size_ptr, - BraidBufferStatus &status) = 0; + // Computes the spatial norm of the given vector + virtual int SpatialNorm(braid_Vector u, + T *norm_ptr) = 0; - virtual int BufPack(braid_Vector u_, - void *buffer, - BraidBufferStatus &status) = 0; + // Computes the buffer size + virtual int BufSize(index_t *size_ptr, + BraidBufferStatus &status) = 0; - virtual int BufUnpack(void *buffer, - braid_Vector *u_ptr, - BraidBufferStatus &status) = 0; + // Packs
the given vector into the given buffer + virtual int BufPack(braid_Vector u, + void *buffer, + BraidBufferStatus &status) = 0; - virtual int Access(braid_Vector u_, + // Unpacks the given buffer into the given vector + virtual int BufUnpack(void *buffer, + braid_Vector *u_ptr, + BraidBufferStatus &status) = 0; + + // Accesses the given vector + virtual int Access(braid_Vector u, BraidAccessStatus &astatus) = 0; - // Not needed in this example - virtual int Residual(braid_Vector u_, - braid_Vector r_, + // Calculates the residual + virtual int Residual(braid_Vector u, + braid_Vector r, BraidStepStatus &pstatus) = 0; - // Not needed in this example - virtual int Coarsen(braid_Vector fu_, - braid_Vector *cu_ptr, + /// Performs coarsening in time + virtual int Coarsen(braid_Vector fu, + braid_Vector *cu_ptr, BraidCoarsenRefStatus &status) = 0; - // Not needed in this example - virtual int Refine(braid_Vector cu_, - braid_Vector *fu_ptr, + /// Performs refinement in time + virtual int Refine(braid_Vector cu, + braid_Vector *fu_ptr, BraidCoarsenRefStatus &status) = 0; + + /// Runs the parallel-in-time multigrid solver + void solve() { core.Drive(); } - protected: - int rank; + public: + void SetMaxLevels(int max_levels) { core.SetMaxLevels(max_levels); } + + void SetIncrMaxLevels() { core.SetIncrMaxLevels(); } + + void SetSkip(int skip) { core.SetSkip(skip); } + + void SetMinCoarse(int min_coarse) { core.SetMinCoarse(min_coarse); } + + void SetNRelax(int level, int nrelax) { core.SetNRelax(level, nrelax); } + + void SetAbsTol(T tol) { core.SetAbsTol(tol); } + + void SetRelTol(T tol) { core.SetRelTol(tol); } + + void SetTemporalNorm(int tnorm) { core.SetTemporalNorm(tnorm); } + void SetCFactor(int level, int cfactor) { core.SetCFactor(level, cfactor); } + + void SetAggCFactor(int cfactor0) { core.SetAggCFactor(cfactor0); } + + void SetSpatialCoarsenAndRefine() { core.SetSpatialCoarsenAndRefine(); } + + void SetPeriodic(int periodic) { core.SetPeriodic(periodic); } + + 
void SetSync() { core.SetSync(); } + + void SetResidual() { core.SetResidual(); } + + void SetMaxIter(int max_iter) { core.SetMaxIter(max_iter); } + + void SetPrintLevel(int print_level) { core.SetPrintLevel(print_level); } + + void SetSeqSoln(int use_seq_soln) { core.SetSeqSoln(use_seq_soln); } + + void SetPrintFile(const char *printfile_name) { core.SetPrintFile(printfile_name); } + + void SetAccessLevel(int access_level) { core.SetAccessLevel(access_level); } + + void SetFMG() { core.SetFMG(); } + + void SetNFMG(int k) { core.SetNFMG(k); } + + void SetNFMGVcyc(int nfmg_Vcyc) { core.SetNFMGVcyc(nfmg_Vcyc); } + + void SetStorage(int storage) { core.SetStorage(storage); } + + void SetRefine(int refine) {core.SetRefine(refine);} + + void SetMaxRefinements(int max_refinements) {core.SetMaxRefinements(max_refinements);} + + void SetRichardsonEstimation(int est_error, int richardson, int local_order) { core.SetRichardsonEstimation(est_error, richardson, local_order); } + + void GetNumIter(int *niter_ptr) { core.GetNumIter(niter_ptr); } + + void GetRNorms(int *nrequest_ptr, double *rnorms) { core.GetRNorms(nrequest_ptr, rnorms); } + + void GetNLevels(int *nlevels_ptr) { core.GetNLevels(nlevels_ptr); } + + protected: + /// Braid Core object + BraidCore core; }; }// namespace gismo diff --git a/extensions/gsXBraid/gsXBraid.hpp b/extensions/gsXBraid/gsXBraid.hpp index 05581177b0..b7a4369c77 100644 --- a/extensions/gsXBraid/gsXBraid.hpp +++ b/extensions/gsXBraid/gsXBraid.hpp @@ -13,6 +13,27 @@ #pragma once +#include + namespace gismo { + // Constructor + template + gsXBraid::gsXBraid(const gsMpiComm& comm, + const T& tstart, + const T& tstop, + int ntime) + : BraidApp(static_cast(comm), double(tstart), double(tstop), ntime), + core(static_cast(comm), this) + { + std::cout << "gsXBraid constructor called\n"; + } + + // Destructor + template + gsXBraid::~gsXBraid() + { + std::cout << "gsXBraid destructor called\n"; + } + }// namespace gismo diff --git 
a/extensions/gsXBraid/gsXBraid_.cpp b/extensions/gsXBraid/gsXBraid_.cpp index d67c74258c..06a4edfd56 100644 --- a/extensions/gsXBraid/gsXBraid_.cpp +++ b/extensions/gsXBraid/gsXBraid_.cpp @@ -2,3 +2,10 @@ #include #include #include + +namespace gismo +{ + +CLASS_TEMPLATE_INST gsXBraid; + +} diff --git a/src/gsCore/gsConfig.h.in b/src/gsCore/gsConfig.h.in index b5858be5a8..2dc11d059a 100644 --- a/src/gsCore/gsConfig.h.in +++ b/src/gsCore/gsConfig.h.in @@ -58,6 +58,7 @@ #cmakedefine GISMO_WITH_TRILINOS #cmakedefine GISMO_WITH_UMFPACK #cmakedefine GISMO_WITH_UNUM +#cmakedefine GISMO_WITH_XBRAID /* Only include new types here that can be set as real_t */ From 2939e4ec2540749fa97598f8cd4e8633e5f7848b Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Tue, 3 Nov 2020 16:56:39 +0100 Subject: [PATCH 005/174] Added gsXBraid extension --- CMakeLists.txt | 7 +++ cmake/gsOptions.cmake | 5 ++ extensions/gsXBraid/CMakeLists.txt | 71 +++++++++++++++++++++ extensions/gsXBraid/gsXBraid.h | 99 ++++++++++++++++++++++++++++++ extensions/gsXBraid/gsXBraid.hpp | 18 ++++++ extensions/gsXBraid/gsXBraid_.cpp | 4 ++ 6 files changed, 204 insertions(+) create mode 100644 extensions/gsXBraid/CMakeLists.txt create mode 100644 extensions/gsXBraid/gsXBraid.h create mode 100644 extensions/gsXBraid/gsXBraid.hpp create mode 100644 extensions/gsXBraid/gsXBraid_.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index dd5fcd0ec7..988e8a80bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -265,6 +265,13 @@ if(GISMO_WITH_SMESH) #include_directories(${SMESH_INCLUDE_DIR}) endif() +if(GISMO_WITH_XBRAID) + add_subdirectory(extensions/gsXBraid) + set (GISMO_INCLUDE_DIRS ${GISMO_INCLUDE_DIRS} ${XBRAID_INCLUDE_DIR} + CACHE INTERNAL "${PROJECT_NAME} include directories") + #include_directories(${XBRAID_INCLUDE_DIR}) +endif(GISMO_WITH_XBRAID) + #second time include_directories(${GISMO_INCLUDE_DIRS}) diff --git a/cmake/gsOptions.cmake b/cmake/gsOptions.cmake index 2e96735736..2006a6a1e1 100644 --- 
a/cmake/gsOptions.cmake +++ b/cmake/gsOptions.cmake @@ -174,6 +174,11 @@ if (${GISMO_WITH_UNUM}) message (" GISMO_WITH_UNUM ${GISMO_WITH_UNUM}") endif() +option(GISMO_WITH_XBRAID "With XBraid" false ) +if (${GISMO_WITH_XBRAID}) +message (" GISMO_WITH_XBRAID ${GISMO_WITH_XBRAID}") +endif() + ## ################################################################# ## Options list: Extra options ## ################################################################# diff --git a/extensions/gsXBraid/CMakeLists.txt b/extensions/gsXBraid/CMakeLists.txt new file mode 100644 index 0000000000..44a1567e38 --- /dev/null +++ b/extensions/gsXBraid/CMakeLists.txt @@ -0,0 +1,71 @@ +### CMakeLists.txt --- +## +## Author: Angelos Mantzaflaris +## Copyright (C) 2016 - RICAM-Linz. +###################################################################### + +## XBraid extension +project(gsXBraidExtension) + +# Collect file names +aux_header_directory(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_HEADERS) +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_SOURCES) +aux_tmpl_header_directory(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_HPPFILES) + +# Apply same configuration as G+Smo +include(gsConfig) + +# Look for pre-installed XBraid libraries +find_package(XBRAID QUIET) + +if (NOT XBRAID_FOUND) + + set(XBRAID_VER "v3.0.0") + gismo_fetch_directory(XBraid + URL https://github.com/XBraid/xbraid/archive/${XBRAID_VER}.zip + DESTINATION external + ) + + if( (NOT GISMO_BUILD_LIB) ) + aux_instance_directory (${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_INS) + if(${PROJECT_NAME}_INS) + LIST( REMOVE_ITEM ${PROJECT_NAME}_CPP ${${PROJECT_NAME}_INS}) + endif() + endif() + + aux_header_directory(${gismo_externals}/XBraid/braid ${PROJECT_NAME}_HEADERS_XBRAID) + aux_source_directory(${gismo_externals}/XBraid/braid ${PROJECT_NAME}_SOURCES_XBRAID) + aux_tmpl_header_directory(${gismo_externals} ${PROJECT_NAME}_HPPFILES_XBRAID) + + include_directories(${gismo_externals}/XBraid/braid) + +endif (NOT 
XBRAID_FOUND) + +# Add object library +add_library(${PROJECT_NAME} OBJECT + ${${PROJECT_NAME}_HEADERS} + ${${PROJECT_NAME}_HPPFILES} + ${${PROJECT_NAME}_SOURCES} + ${${PROJECT_NAME}_HEADERS_XBRAID} + ${${PROJECT_NAME}_HPPFILES_XBRAID} + ${${PROJECT_NAME}_SOURCES_XBRAID} + ) + +set_target_properties(${PROJECT_NAME} PROPERTIES + COMPILE_DEFINITIONS gismo_EXPORTS + POSITION_INDEPENDENT_CODE ON + LINKER_LANGUAGE CXX + FOLDER "G+Smo extensions" ) + +if( GISMO_WITH_MPI ) + target_include_directories(${PROJECT_NAME} PRIVATE ${MPI_INCLUDE_PATH}) +else() + add_definitions("-Dbraid_SEQUENTIAL") +endif() + +set(gismo_EXTENSIONS ${gismo_EXTENSIONS} $ + CACHE INTERNAL "Gismo extensions to be included") + +install(DIRECTORY ${PROJECT_SOURCE_DIR} + DESTINATION include/gismo/gsXBraid + FILES_MATCHING PATTERN "*.h") diff --git a/extensions/gsXBraid/gsXBraid.h b/extensions/gsXBraid/gsXBraid.h new file mode 100644 index 0000000000..0744576f16 --- /dev/null +++ b/extensions/gsXBraid/gsXBraid.h @@ -0,0 +1,99 @@ +/** @file gsXBraid.h + + @brief Provides declarations of the XBraid wrapper + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. 
Moller +*/ + +#include + +namespace gismo { + + /** + \brief Class defining the XBraid wrapper + */ + + class gsXBraid : public BraidApp + { + public: + + // Default constructor + gsXBraid() = delete; + + // Constructor + gsXBraid(MPI_Comm comm, + int rank, + double start, + double stop, + int timesteps); + + // Destructor + virtual ~gsXBraid(); + + // Define all the Braid Wrapper routines + virtual int Step(braid_Vector u_, + braid_Vector ustop_, + braid_Vector fstop_, + BraidStepStatus &pstatus) = 0; + + virtual int Clone(braid_Vector u_, + braid_Vector *v_ptr) = 0 ; + + virtual int Init(double t, + braid_Vector *u_ptr) = 0; + + virtual int Free(braid_Vector u_) = 0; + + virtual int Sum(double alpha, + braid_Vector x_, + double beta, + braid_Vector y_) = 0; + + virtual int SpatialNorm(braid_Vector u_, + double *norm_ptr) = 0; + + virtual int BufSize(int *size_ptr, + BraidBufferStatus &status) = 0; + + virtual int BufPack(braid_Vector u_, + void *buffer, + BraidBufferStatus &status) = 0; + + virtual int BufUnpack(void *buffer, + braid_Vector *u_ptr, + BraidBufferStatus &status) = 0; + + virtual int Access(braid_Vector u_, + BraidAccessStatus &astatus) = 0; + + // Not needed in this example + virtual int Residual(braid_Vector u_, + braid_Vector r_, + BraidStepStatus &pstatus) = 0; + + // Not needed in this example + virtual int Coarsen(braid_Vector fu_, + braid_Vector *cu_ptr, + BraidCoarsenRefStatus &status) = 0; + + // Not needed in this example + virtual int Refine(braid_Vector cu_, + braid_Vector *fu_ptr, + BraidCoarsenRefStatus &status) = 0; + + protected: + int rank; + + }; + +}// namespace gismo + +#ifndef GISMO_BUILD_LIB +#include GISMO_HPP_HEADER(gsXBraid.hpp) +#endif diff --git a/extensions/gsXBraid/gsXBraid.hpp b/extensions/gsXBraid/gsXBraid.hpp new file mode 100644 index 0000000000..05581177b0 --- /dev/null +++ b/extensions/gsXBraid/gsXBraid.hpp @@ -0,0 +1,18 @@ +/** @file gsXBraid.hpp + + @brief Provides implementations of the XBraid wrapper. 
+ + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#pragma once + +namespace gismo { + +}// namespace gismo diff --git a/extensions/gsXBraid/gsXBraid_.cpp b/extensions/gsXBraid/gsXBraid_.cpp new file mode 100644 index 0000000000..d67c74258c --- /dev/null +++ b/extensions/gsXBraid/gsXBraid_.cpp @@ -0,0 +1,4 @@ + +#include +#include +#include From ae8cdf23c65bb94a9ed9d6201ddad57a54c2e86b Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Fri, 13 Nov 2020 17:48:34 +0100 Subject: [PATCH 006/174] Fixed bug in XBraid extension --- extensions/gsXBraid/CMakeLists.txt | 46 ++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/extensions/gsXBraid/CMakeLists.txt b/extensions/gsXBraid/CMakeLists.txt index 44a1567e38..a93f6e5f71 100644 --- a/extensions/gsXBraid/CMakeLists.txt +++ b/extensions/gsXBraid/CMakeLists.txt @@ -33,9 +33,46 @@ if (NOT XBRAID_FOUND) endif() endif() - aux_header_directory(${gismo_externals}/XBraid/braid ${PROJECT_NAME}_HEADERS_XBRAID) - aux_source_directory(${gismo_externals}/XBraid/braid ${PROJECT_NAME}_SOURCES_XBRAID) - aux_tmpl_header_directory(${gismo_externals} ${PROJECT_NAME}_HPPFILES_XBRAID) + set(${PROJECT_NAME}_HEADERS "${${PROJECT_NAME}_HEADERS}" + ${gismo_externals}/XBraid/braid/_braid.h + ${gismo_externals}/XBraid/braid/base.h + ${gismo_externals}/XBraid/braid/status.h + ${gismo_externals}/XBraid/braid/tape.h + ${gismo_externals}/XBraid/braid/util.h + ${gismo_externals}/XBraid/braid/braid.h + ${gismo_externals}/XBraid/braid/braid_status.h + ${gismo_externals}/XBraid/braid/braid_test.h) + + if(NOT GISMO_WITH_MPI ) + set(${PROJECT_NAME}_HEADERS "${${PROJECT_NAME}_HEADERS}" + ${gismo_externals}/XBraid/braid/mpistubs.h) + set(${PROJECT_NAME}_SOURCES "${${PROJECT_NAME}_SOURCES}" + 
${gismo_externals}/XBraid/braid/mpistubs.c) + endif() + + set(${PROJECT_NAME}_SOURCES "${${PROJECT_NAME}_SOURCES}" + ${gismo_externals}/XBraid/braid/access.c + ${gismo_externals}/XBraid/braid/adjoint.c + ${gismo_externals}/XBraid/braid/base.c + ${gismo_externals}/XBraid/braid/braid.c + ${gismo_externals}/XBraid/braid/braid_status.c + ${gismo_externals}/XBraid/braid/braid_test.c + ${gismo_externals}/XBraid/braid/communication.c + ${gismo_externals}/XBraid/braid/distribution.c + ${gismo_externals}/XBraid/braid/drive.c + ${gismo_externals}/XBraid/braid/grid.c + ${gismo_externals}/XBraid/braid/hierarchy.c + ${gismo_externals}/XBraid/braid/interp.c + ${gismo_externals}/XBraid/braid/norm.c + ${gismo_externals}/XBraid/braid/refine.c + ${gismo_externals}/XBraid/braid/relax.c + ${gismo_externals}/XBraid/braid/residual.c + ${gismo_externals}/XBraid/braid/restrict.c + ${gismo_externals}/XBraid/braid/space.c + ${gismo_externals}/XBraid/braid/step.c + ${gismo_externals}/XBraid/braid/tape.c + ${gismo_externals}/XBraid/braid/util.c + ${gismo_externals}/XBraid/braid/uvector.c) include_directories(${gismo_externals}/XBraid/braid) @@ -46,9 +83,6 @@ add_library(${PROJECT_NAME} OBJECT ${${PROJECT_NAME}_HEADERS} ${${PROJECT_NAME}_HPPFILES} ${${PROJECT_NAME}_SOURCES} - ${${PROJECT_NAME}_HEADERS_XBRAID} - ${${PROJECT_NAME}_HPPFILES_XBRAID} - ${${PROJECT_NAME}_SOURCES_XBRAID} ) set_target_properties(${PROJECT_NAME} PROPERTIES From 514464c4192702efe754e88e2edfac84d60a505c Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Fri, 20 Nov 2020 23:23:58 +0100 Subject: [PATCH 007/174] Updates gsXBraid extension and added xbraid_example --- CMakeLists.txt | 3 - examples/xbraid_example.cpp | 147 +++++++++++++++++++++++++ extensions/gsXBraid/CMakeLists.txt | 32 +++++- extensions/gsXBraid/gsXBraid.h | 171 +++++++++++++++++++++-------- extensions/gsXBraid/gsXBraid.hpp | 21 ++++ extensions/gsXBraid/gsXBraid_.cpp | 7 ++ src/gsCore/gsConfig.h.in | 1 + 7 files changed, 326 insertions(+), 56 
deletions(-) create mode 100644 examples/xbraid_example.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 988e8a80bf..3034a474c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -267,9 +267,6 @@ endif() if(GISMO_WITH_XBRAID) add_subdirectory(extensions/gsXBraid) - set (GISMO_INCLUDE_DIRS ${GISMO_INCLUDE_DIRS} ${XBRAID_INCLUDE_DIR} - CACHE INTERNAL "${PROJECT_NAME} include directories") - #include_directories(${XBRAID_INCLUDE_DIR}) endif(GISMO_WITH_XBRAID) #second time diff --git a/examples/xbraid_example.cpp b/examples/xbraid_example.cpp new file mode 100644 index 0000000000..b48cb209d8 --- /dev/null +++ b/examples/xbraid_example.cpp @@ -0,0 +1,147 @@ +/** @file xbraid_example.cpp + + @brief XBraid integration + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): A. Mantzaflaris, M. 
Moeller +*/ + +#include +#include + +using namespace gismo; + +#ifdef GISMO_WITH_XBRAID + +namespace gismo { + +/** + \brief Derived class implementing the XBraid wrapper for the heat equation +*/ +template +class gsXBraid_app : public gsXBraid +{ + public: + /// Inherit all constructors from base class + using gsXBraid::gsXBraid; + + /// Creates instance from command line argument + static inline gsXBraid_app create(const gsMpiComm& comm, + int argc, + char** argv) + { + index_t numRefine = 5; + index_t numElevate = 0; + index_t numTime = 1; + T tfinal = 1.0; + std::string fn("pde/poisson2d_bvp.xml"); + + gsCmdLine cmd("Tutorial on solving a Heat equation problem using parallel-in-time multigrid."); + + cmd.addInt( "e", "degreeElevation", + "Number of degree elevation steps to perform before solving (0: equalize degree in all directions)", numElevate ); + cmd.addInt( "r", "uniformRefine", "Number of uniform h-refinement steps to perform before solving", numRefine ); + cmd.addInt( "n", "timeSteps", "Number of parallel-in-time steps", numTime ); + cmd.addString( "f", "file", "Input XML file", fn ); + cmd.addReal( "t", "time", "Final time", tfinal ); + + cmd.getValues(argc,argv); + + return gsXBraid_app(comm, 0.0, tfinal, numTime); + } + + /// Destructor + ~gsXBraid_app() + {} + + int Step(braid_Vector u, + braid_Vector ustop, + braid_Vector fstop, + BraidStepStatus &pstatus) override + {} + + int Clone(braid_Vector u, + braid_Vector *v_ptr) override + {} + + int Init(T t, + braid_Vector *u_ptr) override + {} + + int Free(braid_Vector u) override + {} + + int Sum(T alpha, + braid_Vector x, + T beta, + braid_Vector y) override + {} + + int SpatialNorm(braid_Vector u, + T *norm_ptr) override + {} + + int BufSize(index_t *size_ptr, + BraidBufferStatus &status) override + {} + + int BufPack(braid_Vector u, + void *buffer, + BraidBufferStatus &status) override + {} + + int BufUnpack(void *buffer, + braid_Vector *u_ptr, + BraidBufferStatus &status) override + {} + + int 
Access(braid_Vector u, + BraidAccessStatus &astatus) override + {} + + // Not needed in this example + int Residual(braid_Vector u, + braid_Vector r, + BraidStepStatus &pstatus) override + {} + + // Not needed in this example + int Coarsen(braid_Vector fu, + braid_Vector *cu_ptr, + BraidCoarsenRefStatus &status) override + {} + + // Not needed in this example + int Refine(braid_Vector cu, + braid_Vector *fu_ptr, + BraidCoarsenRefStatus &status) override + {} +}; + +} // ending namespace gismo + +#endif + +int main(int argc, char**argv) +{ + // Initialize the MPI environment and obtain the world communicator + gsMpiComm comm = gsMpi::init(argc, argv).worldComm(); + +#ifdef GISMO_WITH_XBRAID + + // Set up app structure + gsXBraid_app app = gsXBraid_app::create(comm, argc, argv); + + // Perform parallel-in-time multigrid + app.solve(); + +#endif + + return 0; + +} diff --git a/extensions/gsXBraid/CMakeLists.txt b/extensions/gsXBraid/CMakeLists.txt index a93f6e5f71..ac95437874 100644 --- a/extensions/gsXBraid/CMakeLists.txt +++ b/extensions/gsXBraid/CMakeLists.txt @@ -15,14 +15,22 @@ aux_tmpl_header_directory(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_NAME}_HPPFILES) # Apply same configuration as G+Smo include(gsConfig) +if(CMAKE_C_COMPILER_ID MATCHES "MSVC") + add_definitions(-D_CRT_NONSTDC_NO_WARNINGS) + add_definitions(-D_CRT_SECURE_NO_WARNINGS) +endif() + # Look for pre-installed XBraid libraries find_package(XBRAID QUIET) if (NOT XBRAID_FOUND) - + # Set XBraid version set(XBRAID_VER "v3.0.0") + + # Download XBraid sources at configure time + include(gsFetch) gismo_fetch_directory(XBraid - URL https://github.com/XBraid/xbraid/archive/${XBRAID_VER}.zip + URL https://github.com/XBraid/xbraid/archive/${XBRAID_VER}.zip DESTINATION external ) @@ -33,6 +41,7 @@ if (NOT XBRAID_FOUND) endif() endif() + # Set XBraid library header files set(${PROJECT_NAME}_HEADERS "${${PROJECT_NAME}_HEADERS}" ${gismo_externals}/XBraid/braid/_braid.h ${gismo_externals}/XBraid/braid/base.h @@ 
-50,6 +59,7 @@ if (NOT XBRAID_FOUND) ${gismo_externals}/XBraid/braid/mpistubs.c) endif() + # Set XBraid library sources files set(${PROJECT_NAME}_SOURCES "${${PROJECT_NAME}_SOURCES}" ${gismo_externals}/XBraid/braid/access.c ${gismo_externals}/XBraid/braid/adjoint.c @@ -74,22 +84,26 @@ if (NOT XBRAID_FOUND) ${gismo_externals}/XBraid/braid/util.c ${gismo_externals}/XBraid/braid/uvector.c) - include_directories(${gismo_externals}/XBraid/braid) +# Set XBraid library include files + set(XBRAID_INCLUDE_DIR ${gismo_externals}/XBraid/braid CACHE INTERNAL "") + include_directories(${XBRAID_INCLUDE_DIR}) endif (NOT XBRAID_FOUND) -# Add object library +# Compile gsXBraid extension as part of the G+Smo library add_library(${PROJECT_NAME} OBJECT ${${PROJECT_NAME}_HEADERS} ${${PROJECT_NAME}_HPPFILES} ${${PROJECT_NAME}_SOURCES} ) +# Set standard properties for all G+Smo extensions set_target_properties(${PROJECT_NAME} PROPERTIES COMPILE_DEFINITIONS gismo_EXPORTS POSITION_INDEPENDENT_CODE ON LINKER_LANGUAGE CXX - FOLDER "G+Smo extensions" ) + FOLDER "G+Smo extensions" + ) if( GISMO_WITH_MPI ) target_include_directories(${PROJECT_NAME} PRIVATE ${MPI_INCLUDE_PATH}) @@ -97,9 +111,15 @@ else() add_definitions("-Dbraid_SEQUENTIAL") endif() +# Add gsXBraid extension to the list of G+Smo extensions set(gismo_EXTENSIONS ${gismo_EXTENSIONS} $ - CACHE INTERNAL "Gismo extensions to be included") + CACHE INTERNAL "gismo extensions to be included") + +# Add XBraid include directories to G+Smo standard include directories +set (GISMO_INCLUDE_DIRS ${GISMO_INCLUDE_DIRS} ${XBRAID_INCLUDE_DIR} + CACHE INTERNAL "gismo include directories") +# Install gsXBraid header files install(DIRECTORY ${PROJECT_SOURCE_DIR} DESTINATION include/gismo/gsXBraid FILES_MATCHING PATTERN "*.h") diff --git a/extensions/gsXBraid/gsXBraid.h b/extensions/gsXBraid/gsXBraid.h index 0744576f16..ed43585490 100644 --- a/extensions/gsXBraid/gsXBraid.h +++ b/extensions/gsXBraid/gsXBraid.h @@ -11,6 +11,15 @@ Author(s): M. 
Moller */ +#pragma once + +#include +#include + +#if !defined(GISMO_WITH_MPI) +#define braid_SEQUENTIAL 1 +#endif + #include namespace gismo { @@ -19,77 +28,145 @@ namespace gismo { \brief Class defining the XBraid wrapper */ + template class gsXBraid : public BraidApp { public: - // Default constructor - gsXBraid() = delete; - - // Constructor - gsXBraid(MPI_Comm comm, - int rank, - double start, - double stop, - int timesteps); + /// Constructor + gsXBraid(const gsMpiComm& comm, + const T& tstart, + const T& tstop, + int ntime); - // Destructor + /// Destructor virtual ~gsXBraid(); - // Define all the Braid Wrapper routines - virtual int Step(braid_Vector u_, - braid_Vector ustop_, - braid_Vector fstop_, + // Performs one time step + virtual int Step(braid_Vector u, + braid_Vector ustop, + braid_Vector fstop, BraidStepStatus &pstatus) = 0; - - virtual int Clone(braid_Vector u_, + + // Clones the given vectors + virtual int Clone(braid_Vector u, braid_Vector *v_ptr) = 0 ; - - virtual int Init(double t, + + // Initializes the given vector + virtual int Init(T t, braid_Vector *u_ptr) = 0; - - virtual int Free(braid_Vector u_) = 0; - - virtual int Sum(double alpha, - braid_Vector x_, - double beta, - braid_Vector y_) = 0; - virtual int SpatialNorm(braid_Vector u_, - double *norm_ptr) = 0; + // Fianlizes the given vector + virtual int Free(braid_Vector u) = 0; + + // Computes the weighted sum of two given vectors + virtual int Sum(T alpha, + braid_Vector x, + T beta, + braid_Vector y) = 0; - virtual int BufSize(int *size_ptr, - BraidBufferStatus &status) = 0; + // Computes the spatial norm of the given vector + virtual int SpatialNorm(braid_Vector u, + T *norm_ptr) = 0; - virtual int BufPack(braid_Vector u_, - void *buffer, - BraidBufferStatus &status) = 0; + // Computes the buffer size + virtual int BufSize(index_t *size_ptr, + BraidBufferStatus &status) = 0; - virtual int BufUnpack(void *buffer, - braid_Vector *u_ptr, - BraidBufferStatus &status) = 0; + // Packes 
the given vector into the given buffer + virtual int BufPack(braid_Vector u, + void *buffer, + BraidBufferStatus &status) = 0; - virtual int Access(braid_Vector u_, + // Unpacks the given buffer into the given vector + virtual int BufUnpack(void *buffer, + braid_Vector *u_ptr, + BraidBufferStatus &status) = 0; + + // Accesses the given vector + virtual int Access(braid_Vector u, BraidAccessStatus &astatus) = 0; - // Not needed in this example - virtual int Residual(braid_Vector u_, - braid_Vector r_, + // Calculates the residual + virtual int Residual(braid_Vector u, + braid_Vector r, BraidStepStatus &pstatus) = 0; - // Not needed in this example - virtual int Coarsen(braid_Vector fu_, - braid_Vector *cu_ptr, + /// Performs coarsening in time + virtual int Coarsen(braid_Vector fu, + braid_Vector *cu_ptr, BraidCoarsenRefStatus &status) = 0; - // Not needed in this example - virtual int Refine(braid_Vector cu_, - braid_Vector *fu_ptr, + /// Performs refinement in time + virtual int Refine(braid_Vector cu, + braid_Vector *fu_ptr, BraidCoarsenRefStatus &status) = 0; + + /// Runs the parallel-in-time multigrid solver + void solve() { core.Drive(); } - protected: - int rank; + public: + void SetMaxLevels(int max_levels) { core.SetMaxLevels(max_levels); } + + void SetIncrMaxLevels() { core.SetIncrMaxLevels(); } + + void SetSkip(int skip) { core.SetSkip(skip); } + + void SetMinCoarse(int min_coarse) { core.SetMinCoarse(min_coarse); } + + void SetNRelax(int level, int nrelax) { core.SetNRelax(level, nrelax); } + + void SetAbsTol(T tol) { core.SetAbsTol(tol); } + + void SetRelTol(T tol) { core.SetRelTol(tol); } + + void SetTemporalNorm(int tnorm) { core.SetTemporalNorm(tnorm); } + void SetCFactor(int level, int cfactor) { core.SetCFactor(level, cfactor); } + + void SetAggCFactor(int cfactor0) { core.SetAggCFactor(cfactor0); } + + void SetSpatialCoarsenAndRefine() { core.SetSpatialCoarsenAndRefine(); } + + void SetPeriodic(int periodic) { core.SetPeriodic(periodic); } + + 
void SetSync() { core.SetSync(); } + + void SetResidual() { core.SetResidual(); } + + void SetMaxIter(int max_iter) { core.SetMaxIter(max_iter); } + + void SetPrintLevel(int print_level) { core.SetPrintLevel(print_level); } + + void SetSeqSoln(int use_seq_soln) { core.SetSeqSoln(use_seq_soln); } + + void SetPrintFile(const char *printfile_name) { core.SetPrintFile(printfile_name); } + + void SetAccessLevel(int access_level) { core.SetAccessLevel(access_level); } + + void SetFMG() { core.SetFMG(); } + + void SetNFMG(int k) { core.SetNFMG(k); } + + void SetNFMGVcyc(int nfmg_Vcyc) { core.SetNFMGVcyc(nfmg_Vcyc); } + + void SetStorage(int storage) { core.SetStorage(storage); } + + void SetRefine(int refine) {core.SetRefine(refine);} + + void SetMaxRefinements(int max_refinements) {core.SetMaxRefinements(max_refinements);} + + void SetRichardsonEstimation(int est_error, int richardson, int local_order) { core.SetRichardsonEstimation(est_error, richardson, local_order); } + + void GetNumIter(int *niter_ptr) { core.GetNumIter(niter_ptr); } + + void GetRNorms(int *nrequest_ptr, double *rnorms) { core.GetRNorms(nrequest_ptr, rnorms); } + + void GetNLevels(int *nlevels_ptr) { core.GetNLevels(nlevels_ptr); } + + protected: + /// Braid Core object + BraidCore core; }; }// namespace gismo diff --git a/extensions/gsXBraid/gsXBraid.hpp b/extensions/gsXBraid/gsXBraid.hpp index 05581177b0..b7a4369c77 100644 --- a/extensions/gsXBraid/gsXBraid.hpp +++ b/extensions/gsXBraid/gsXBraid.hpp @@ -13,6 +13,27 @@ #pragma once +#include + namespace gismo { + // Constructor + template + gsXBraid::gsXBraid(const gsMpiComm& comm, + const T& tstart, + const T& tstop, + int ntime) + : BraidApp(static_cast(comm), double(tstart), double(tstop), ntime), + core(static_cast(comm), this) + { + std::cout << "gsXBraid constructor called\n"; + } + + // Destructor + template + gsXBraid::~gsXBraid() + { + std::cout << "gsXBraid destructor called\n"; + } + }// namespace gismo diff --git 
a/extensions/gsXBraid/gsXBraid_.cpp b/extensions/gsXBraid/gsXBraid_.cpp index d67c74258c..06a4edfd56 100644 --- a/extensions/gsXBraid/gsXBraid_.cpp +++ b/extensions/gsXBraid/gsXBraid_.cpp @@ -2,3 +2,10 @@ #include #include #include + +namespace gismo +{ + +CLASS_TEMPLATE_INST gsXBraid; + +} diff --git a/src/gsCore/gsConfig.h.in b/src/gsCore/gsConfig.h.in index b5858be5a8..2dc11d059a 100644 --- a/src/gsCore/gsConfig.h.in +++ b/src/gsCore/gsConfig.h.in @@ -58,6 +58,7 @@ #cmakedefine GISMO_WITH_TRILINOS #cmakedefine GISMO_WITH_UMFPACK #cmakedefine GISMO_WITH_UNUM +#cmakedefine GISMO_WITH_XBRAID /* Only include new types here that can be set as real_t */ From a518a5775c45a7900e78b450efb8ce15869d848f Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Sat, 21 Nov 2020 08:53:04 +0100 Subject: [PATCH 008/174] Added dummy MPI_Comm in sequential mode --- src/gsMpi/gsMpiComm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gsMpi/gsMpiComm.h b/src/gsMpi/gsMpiComm.h index 221409e8a4..bc63bef716 100644 --- a/src/gsMpi/gsMpiComm.h +++ b/src/gsMpi/gsMpiComm.h @@ -19,6 +19,7 @@ namespace gismo { #ifndef GISMO_WITH_MPI +typedef int MPI_Comm; typedef int MPI_Group; typedef int MPI_Request; struct MPI_Status {}; @@ -335,7 +336,8 @@ class gsSerialComm #ifdef GISMO_WITH_MPI operator MPI_Comm () const { return MPI_COMM_SELF;} - //#else +#else + operator MPI_Comm () const { return 0;} // typedef int MPI_Group; // typedef int MPI_Request; // struct MPI_Status {}; From ac54a6396fdf46dc296b9417607295c4d6d56f7d Mon Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Sat, 21 Nov 2020 09:49:39 +0100 Subject: [PATCH 009/174] exort symbol from xbraid --- extensions/gsXBraid/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/extensions/gsXBraid/CMakeLists.txt b/extensions/gsXBraid/CMakeLists.txt index ac95437874..57244598bc 100644 --- a/extensions/gsXBraid/CMakeLists.txt +++ b/extensions/gsXBraid/CMakeLists.txt @@ -102,6 +102,11 @@ 
set_target_properties(${PROJECT_NAME} PROPERTIES COMPILE_DEFINITIONS gismo_EXPORTS POSITION_INDEPENDENT_CODE ON LINKER_LANGUAGE CXX + #START Export all symbols from this extension + CXX_VISIBILITY_PRESET default + C_VISIBILITY_PRESET default + VISIBILITY_INLINES_HIDDEN 0 + #END Export all symbols from this extension FOLDER "G+Smo extensions" ) From 976f6d7844d0e4f859311b72fbeadb09e1b51c09 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Sun, 22 Nov 2020 09:46:18 +0100 Subject: [PATCH 010/174] Updated xbraid example application --- examples/xbraid_example.cpp | 147 --------------- examples/xbraid_heatEquation_example.cpp | 228 +++++++++++++++++++++++ extensions/gsXBraid/gsXBraid.h | 129 ++++++++++--- 3 files changed, 332 insertions(+), 172 deletions(-) delete mode 100644 examples/xbraid_example.cpp create mode 100644 examples/xbraid_heatEquation_example.cpp diff --git a/examples/xbraid_example.cpp b/examples/xbraid_example.cpp deleted file mode 100644 index b48cb209d8..0000000000 --- a/examples/xbraid_example.cpp +++ /dev/null @@ -1,147 +0,0 @@ -/** @file xbraid_example.cpp - - @brief XBraid integration - - This file is part of the G+Smo library. - - This Source Code Form is subject to the terms of the Mozilla Public - License, v. 2.0. If a copy of the MPL was not distributed with this - file, You can obtain one at http://mozilla.org/MPL/2.0/. - - Author(s): A. Mantzaflaris, M. 
Moeller -*/ - -#include -#include - -using namespace gismo; - -#ifdef GISMO_WITH_XBRAID - -namespace gismo { - -/** - \brief Derived class implementing the XBraid wrapper for the heat equation -*/ -template -class gsXBraid_app : public gsXBraid -{ - public: - /// Inherit all constructors from base class - using gsXBraid::gsXBraid; - - /// Creates instance from command line argument - static inline gsXBraid_app create(const gsMpiComm& comm, - int argc, - char** argv) - { - index_t numRefine = 5; - index_t numElevate = 0; - index_t numTime = 1; - T tfinal = 1.0; - std::string fn("pde/poisson2d_bvp.xml"); - - gsCmdLine cmd("Tutorial on solving a Heat equation problem using parallel-in-time multigrid."); - - cmd.addInt( "e", "degreeElevation", - "Number of degree elevation steps to perform before solving (0: equalize degree in all directions)", numElevate ); - cmd.addInt( "r", "uniformRefine", "Number of uniform h-refinement steps to perform before solving", numRefine ); - cmd.addInt( "n", "timeSteps", "Number of parallel-in-time steps", numTime ); - cmd.addString( "f", "file", "Input XML file", fn ); - cmd.addReal( "t", "time", "Final time", tfinal ); - - cmd.getValues(argc,argv); - - return gsXBraid_app(comm, 0.0, tfinal, numTime); - } - - /// Destructor - ~gsXBraid_app() - {} - - int Step(braid_Vector u, - braid_Vector ustop, - braid_Vector fstop, - BraidStepStatus &pstatus) override - {} - - int Clone(braid_Vector u, - braid_Vector *v_ptr) override - {} - - int Init(T t, - braid_Vector *u_ptr) override - {} - - int Free(braid_Vector u) override - {} - - int Sum(T alpha, - braid_Vector x, - T beta, - braid_Vector y) override - {} - - int SpatialNorm(braid_Vector u, - T *norm_ptr) override - {} - - int BufSize(index_t *size_ptr, - BraidBufferStatus &status) override - {} - - int BufPack(braid_Vector u, - void *buffer, - BraidBufferStatus &status) override - {} - - int BufUnpack(void *buffer, - braid_Vector *u_ptr, - BraidBufferStatus &status) override - {} - - int 
Access(braid_Vector u, - BraidAccessStatus &astatus) override - {} - - // Not needed in this example - int Residual(braid_Vector u, - braid_Vector r, - BraidStepStatus &pstatus) override - {} - - // Not needed in this example - int Coarsen(braid_Vector fu, - braid_Vector *cu_ptr, - BraidCoarsenRefStatus &status) override - {} - - // Not needed in this example - int Refine(braid_Vector cu, - braid_Vector *fu_ptr, - BraidCoarsenRefStatus &status) override - {} -}; - -} // ending namespace gismo - -#endif - -int main(int argc, char**argv) -{ - // Initialize the MPI environment and obtain the world communicator - gsMpiComm comm = gsMpi::init(argc, argv).worldComm(); - -#ifdef GISMO_WITH_XBRAID - - // Set up app structure - gsXBraid_app app = gsXBraid_app::create(comm, argc, argv); - - // Perform parallel-in-time multigrid - app.solve(); - -#endif - - return 0; - -} diff --git a/examples/xbraid_heatEquation_example.cpp b/examples/xbraid_heatEquation_example.cpp new file mode 100644 index 0000000000..6425cdef75 --- /dev/null +++ b/examples/xbraid_heatEquation_example.cpp @@ -0,0 +1,228 @@ +/** @file xbraid_example.cpp + + @brief XBraid integration + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): A. Mantzaflaris, M. 
Moeller +*/ + +#include +#include + +using namespace gismo; + +#ifdef GISMO_WITH_XBRAID + +namespace gismo { + +/** + \brief Derived class implementing the XBraid wrapper for the heat equation +*/ +template +class gsXBraid_app : public gsXBraid +{ + public: + /// Inherit all constructors from base class + using gsXBraid::gsXBraid; + + /// Creates instance from command line argument + static inline gsXBraid_app create(const gsMpiComm& comm, + int argc, + char** argv) + { + // Problem parameters + std::string fn("pde/poisson2d_bvp.xml"); + + // Spatial discretisation parameters + index_t numRefine = 5; + index_t numElevate = 0; + + // Temporal discretisation parameters + index_t numTime = 1; + T tfinal = 1.0; + + // Parallel-in-time multigrid parameters + index_t CFactor = 2; + index_t info = 2; + index_t maxIter = 100; + index_t maxLevel = 30; + index_t minCLevel = 2; + index_t numFMG = 1; + index_t numFMGVcyc = 1; + index_t numMaxRef = 1; + index_t numRelax = 1; + index_t numStorage =-1; + index_t tnorm = 2; // 1-norm, 2-norm, inf-norm + + T absTol = 1e-10; + T relTol = 1e-3; + + bool fmg = false; + bool incrMaxLevels = false; + bool periodic = false; + bool refine = false; + bool sequential = false; + bool skip = true; + + gsCmdLine cmd("Tutorial on solving a Heat equation problem using parallel-in-time multigrid."); + + // Problem parameters + cmd.addString( "f", "file", "Input XML file", fn ); + + // Spatial discretisation parameters + cmd.addInt( "e", "degreeElevation", + "Number of degree elevation steps to perform before solving (0: equalize degree in all directions)", numElevate ); + cmd.addInt( "r", "uniformRefine", "Number of uniform h-refinement steps to perform before solving", numRefine ); + + // Temporal diescretisation parameters + cmd.addInt( "n", "timeSteps", "Number of parallel-in-time steps", numTime ); + cmd.addReal( "t", "time", "Final time", tfinal ); + + // Parallel-in-time multigrid parameters + cmd.addInt( "C", "CFactor", "Coarsening factor 
of the parallel-in-time multigrid solver", CFactor ); + cmd.addInt( "I", "info", "Print level (no output [=0], =runtime inforation [=1], run statistics [=2(default)], debug [=3])", info ); + cmd.addInt( "M", "maxIter", "Maximum iteration numbers of the parallel-in-time multigrid solver", maxIter ); + cmd.addInt( "L", "maxLevel", "Maximum numbers of parallel-in-time multigrid levels", maxLevel ); + cmd.addInt( "l", "minCLevel", "Minimum level of the parallel-in-time multigrid solver", minCLevel ); + cmd.addInt( "F", "numFMG", "Number of full multigrid steps of the parallel-in-time multigrid solver", numFMG ); + cmd.addInt( "V", "numFMGVcyc", "Number of full multigrid V-cycles of the parallel-in-time multigrid solver", numFMGVcyc ); + cmd.addInt( "R", "numMaxRef", "Maximum number of refinements of the parallel-in-time multigrid solver", numMaxRef ); + cmd.addInt( "X", "numRelax", "Number of relaxation steps of the parallel-in-time multigrid solver", numRelax ); + cmd.addInt( "", "numStorage", "Number of storage of the parallel-in-time multigrid solver", numStorage ); + cmd.addInt( "T", "tnorm", "Temporal norm of the parallel-in-time multigrid solver (1-norm [=1], 2-norm [=2(default)], inf-norm [=3])", tnorm ); + + cmd.addReal( "", "absTol", "Absolute tolerance of the parallel-in-time multigrid solver", absTol ); + cmd.addReal( "", "relTol", "Relative tolerance of the parallel-in-time multigrid solver", relTol ); + + cmd.addSwitch( "fmg" , "Perform full multigrid (default is off)", fmg); + cmd.addSwitch( "incrMaxLevels" , "Increase the maximum number of parallel-in-time multigrid levels after performing a refinement (default is off)", incrMaxLevels); + cmd.addSwitch( "periodic" , "Periodic time grid (default is off)", periodic); + cmd.addSwitch( "refine" , "Perform refinement in time (default off)", refine); + cmd.addSwitch( "sequential", "Set the initial guess of the parallel-in-time multigrid solver as the sequential time stepping solution (default is off)", 
sequential); + cmd.addSwitch( "skip" , "Skip all work on the first down cycle of the parallel-in-time multigrid solver (default on)", skip); + + cmd.getValues(argc,argv); + + // Create instance + gsXBraid_app app(comm, 0.0, tfinal, numTime); + + app.SetAbsTol(absTol); + app.SetRelTol(relTol); + + app.SetCFactor(CFactor); + app.SetMaxIter(maxIter); + app.SetMaxLevels(maxLevel); + app.SetMaxRefinements(numMaxRef); + app.SetMinCoarse(minCLevel); + app.SetNFMG(numFMG); + app.SetNFMGVcyc(numFMGVcyc); + app.SetNRelax(numRelax); + app.SetPrintLevel(info); + app.SetStorage(numStorage); + app.SetTemporalNorm(tnorm); + + if (fmg) app.SetFMG(); + if (incrMaxLevels) app.SetIncrMaxLevels(); + if (periodic) app.SetPeriodic(1); else app.SetPeriodic(0); + if (refine) app.SetRefine(1); else app.SetRefine(0); + if (sequential) app.SetSeqSoln(1); else app.SetSeqSoln(0); + if (skip) app.SetSkip(1); else app.SetSkip(0); + + return app; + } + + /// Destructor + ~gsXBraid_app() override + {} + + int Step(braid_Vector u, + braid_Vector ustop, + braid_Vector fstop, + BraidStepStatus &pstatus) override + {} + + int Clone(braid_Vector u, + braid_Vector *v_ptr) override + {} + + int Init(T t, + braid_Vector *u_ptr) override + {} + + int Free(braid_Vector u) override + {} + + int Sum(T alpha, + braid_Vector x, + T beta, + braid_Vector y) override + {} + + int SpatialNorm(braid_Vector u, + T *norm_ptr) override + {} + + int BufSize(index_t *size_ptr, + BraidBufferStatus &status) override + {} + + int BufPack(braid_Vector u, + void *buffer, + BraidBufferStatus &status) override + {} + + int BufUnpack(void *buffer, + braid_Vector *u_ptr, + BraidBufferStatus &status) override + {} + + int Access(braid_Vector u, + BraidAccessStatus &astatus) override + {} + + // Not needed in this example + int Residual(braid_Vector u, + braid_Vector r, + BraidStepStatus &pstatus) override + {} + + // Not needed in this example + int Coarsen(braid_Vector fu, + braid_Vector *cu_ptr, + BraidCoarsenRefStatus &status) 
override + {} + + // Not needed in this example + int Refine(braid_Vector cu, + braid_Vector *fu_ptr, + BraidCoarsenRefStatus &status) override + {} +}; + +} // ending namespace gismo + +#endif + +int main(int argc, char**argv) +{ + // Initialize the MPI environment and obtain the world communicator + gsMpiComm comm = gsMpi::init(argc, argv).worldComm(); + +#ifdef GISMO_WITH_XBRAID + + // Set up app structure + gsXBraid_app app = gsXBraid_app::create(comm, argc, argv); + + // Perform parallel-in-time multigrid + app.solve(); + +#endif + + return 0; + +} diff --git a/extensions/gsXBraid/gsXBraid.h b/extensions/gsXBraid/gsXBraid.h index ed43585490..95b77a0d64 100644 --- a/extensions/gsXBraid/gsXBraid.h +++ b/extensions/gsXBraid/gsXBraid.h @@ -32,7 +32,6 @@ namespace gismo { class gsXBraid : public BraidApp { public: - /// Constructor gsXBraid(const gsMpiComm& comm, const T& tstart, @@ -106,63 +105,143 @@ namespace gismo { void solve() { core.Drive(); } public: + // Sets the maximum number of multigrid levels. void SetMaxLevels(int max_levels) { core.SetMaxLevels(max_levels); } + // Increases the max number of multigrid levels after performing a refinement. void SetIncrMaxLevels() { core.SetIncrMaxLevels(); } - + + // Sets whether to skip all work on the first down cycle (skip = 1). On by default. void SetSkip(int skip) { core.SetSkip(skip); } - + + // Sets the minimum allowed coarse grid size. gsXBraid stops + // coarsening whenever creating the next coarser grid will result + // in a grid smaller than min_coarse. The maximum possible coarse + // grid size will be min_coarse*coarsening_factor. void SetMinCoarse(int min_coarse) { core.SetMinCoarse(min_coarse); } + // Sets the number of relaxation sweeps *nrelax* on grid + // *level*. Level 0 is the finest grid. One sweep is a CF + // relaxation sweep. void SetNRelax(int level, int nrelax) { core.SetNRelax(level, nrelax); } + // Sets the number of relaxation sweeps *nrelax* on all grid + // levels. 
One sweep is a CF relaxation sweep. + void SetNRelax(int nrelax) { core.SetNRelax(-1, nrelax); } + + // Sets absolute stopping tolerance. void SetAbsTol(T tol) { core.SetAbsTol(tol); } - + + // Sets relative stopping tolerance. void SetRelTol(T tol) { core.SetRelTol(tol); } - + + // Sets the temporal norm: 1-norm (1), 2-norm (2:default), inf-norm (3) void SetTemporalNorm(int tnorm) { core.SetTemporalNorm(tnorm); } - - void SetCFactor(int level, int cfactor) { core.SetCFactor(level, cfactor); } - void SetAggCFactor(int cfactor0) { core.SetAggCFactor(cfactor0); } + // Sets the coarsening factor *cfactor* on grid *level* (default is 2) + void SetCFactor(int level, int cfactor) { core.SetCFactor(level, cfactor); } - void SetSpatialCoarsenAndRefine() { core.SetSpatialCoarsenAndRefine(); } + // Sets the coarsening factor *cfactor* on all grid levels + void SetCFactor( int cfactor) { core.SetCFactor(-1, cfactor); } + // Sets periodic time grid (default is 0) void SetPeriodic(int periodic) { core.SetPeriodic(periodic); } - - void SetSync() { core.SetSync(); } - - void SetResidual() { core.SetResidual(); } - + + // Sets max number of multigrid iterations. void SetMaxIter(int max_iter) { core.SetMaxIter(max_iter); } - + + // Sets the print level for runtime print message. + // - Level 0: no output + // - Level 1: print runtime information like the residual history + // - Level 2: level 1 output, plus post-Braid run statistics (default) + // - Level 3: level 2 output, plus debug level output. void SetPrintLevel(int print_level) { core.SetPrintLevel(print_level); } - - void SetSeqSoln(int use_seq_soln) { core.SetSeqSoln(use_seq_soln); } - + + // Sets the output file for runtime print message. void SetPrintFile(const char *printfile_name) { core.SetPrintFile(printfile_name); } + // Sets the initial guess to gsXBraid as the sequential time stepping solution. 
+ // - 0: The user's Init() function initializes the state vector (default) + // - 1: Sequential time stepping, with the user's initial condition from + // Init(t=0) initializes the state vector + void SetSeqSoln(int use_seq_soln) { core.SetSeqSoln(use_seq_soln); } + + // Sets the acces level for gsXBraid. This controls how often the + // user's access routine is called. + // - Level 0: Never call the user's access routine + // - Level 1: Only call the user's access routine after gsXBraid is finished (default) + // - Level 2: Call the user's access routine every iteration and on every level. + // This is during _braid_FRestrict, during the down-cycle part of a + // gsXBraid iteration. void SetAccessLevel(int access_level) { core.SetAccessLevel(access_level); } - + + // Sets FMG (F-cycle) void SetFMG() { core.SetFMG(); } - + + // Sets the number of initial F-cycles to do before switching to V-cycles void SetNFMG(int k) { core.SetNFMG(k); } - + + // Sets the number of V-cycles to do at each FMG level (default is 1) void SetNFMGVcyc(int nfmg_Vcyc) { core.SetNFMGVcyc(nfmg_Vcyc); } - + + // Sets the storage properties of the code. + // -1 : Default, store only C-points + // 0 : Full storage of C- and F-Points on all levels + // x > 0 : Full storage on all levels >= x void SetStorage(int storage) { core.SetStorage(storage); } - + + // Turns time refinement on (refine = 1) or off (refine = 0). void SetRefine(int refine) {core.SetRefine(refine);} - + + // Sets the max number of time grid refinement levels allowed. void SetMaxRefinements(int max_refinements) {core.SetMaxRefinements(max_refinements);} - + + // Turns on built-in Richardson-based error estimation and/or + // extrapolation with gsXBraid. When enabled, the Richardson + // extrapolation (RE) option (richardson == 1) is used to improve + // the accuracy of the solution at the C-points on the finest + // level. 
When the built-in error estimate option is turned on + // (est_error == 1), RE is used to estimate the local truncation + // error at each point. These estimates can be accessed through + // StepStatus and AccessStatus functions. The last parameter is + // local_order, which represents the LOCAL order of the* time + // integration scheme. e.g. local_order = 2 for Backward Euler. + // Also, the Richardson error estimate is only available after + // roughly 1 Braid iteration. The estimate is given a dummy value + // of -1.0, until an actual estimate is available. Thus after an + // adaptive refinement, and a new hierarchy is formed, another + // iteration must pass before the error estimates are available + // again. void SetRichardsonEstimation(int est_error, int richardson, int local_order) { core.SetRichardsonEstimation(est_error, richardson, local_order); } + + public: + // Sets user-defined residual routine. + void SetResidual() { core.SetResidual(); } + // Sets user-defined coarsening and refinement routine. + void SetSpatialCoarsenAndRefine() { core.SetSpatialCoarsenAndRefine(); } + + // Sets user-defined sync routine. 
+ void SetSync() { core.SetSync(); } + + public: void GetNumIter(int *niter_ptr) { core.GetNumIter(niter_ptr); } void GetRNorms(int *nrequest_ptr, double *rnorms) { core.GetRNorms(nrequest_ptr, rnorms); } void GetNLevels(int *nlevels_ptr) { core.GetNLevels(nlevels_ptr); } + + int iterations() { + int niter; + GetNumIter(&niter); + return niter; + } + + int levels() { + int nlevels; + GetNLevels(&nlevels); + return nlevels; + } protected: /// Braid Core object From fdfd980c9a03a9118d68f61ece306f7b189df564 Mon Sep 17 00:00:00 2001 From: roeltielen Date: Tue, 24 Nov 2020 14:08:03 +0100 Subject: [PATCH 011/174] copy of heatEquation.cpp added (not yet adjusted) --- examples/heatEquation_example 2.cpp | 145 ++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 examples/heatEquation_example 2.cpp diff --git a/examples/heatEquation_example 2.cpp b/examples/heatEquation_example 2.cpp new file mode 100644 index 0000000000..fdd1b0462a --- /dev/null +++ b/examples/heatEquation_example 2.cpp @@ -0,0 +1,145 @@ +/** @file heatEquation_example.cpp + + @brief Solves the heat equation using time-stepping + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): S. Moore, A. 
Mantzaflaris +*/ + +#include + + +using namespace gismo; + +int main(int argc, char *argv[]) +{ + bool plot = false; + gsCmdLine cmd("Testing the heat equation."); + cmd.addSwitch("plot", "Plot the result in ParaView.", plot); + try { cmd.getValues(argc,argv); } catch (int rv) { return rv; } + + // Source function + gsConstantFunction<> f(0,2); + gsInfo<<"Source function is: "<< f << "\n"; + + // Define Geometry, must be a gsMultiPatch object + gsMultiPatch<> patches(*gsNurbsCreator<>::BSplineSquareDeg(2)); + patches.computeTopology(); + + // Boundary conditions + gsBoundaryConditions<> bcInfo; + gsConstantFunction<> g_N(1,2); // Neumann + gsConstantFunction<> g_D(0,2); // (Dirichlet + bcInfo.addCondition(0, boundary::west, condition_type::neumann , &g_N); + bcInfo.addCondition(0, boundary::east, condition_type::dirichlet, &g_D); + bcInfo.addCondition(0, boundary::north, condition_type::dirichlet, &g_D); + bcInfo.addCondition(0, boundary::south, condition_type::dirichlet, &g_D); + + gsMultiBasis<> refine_bases( patches ); + // Number for h-refinement of the computational (trial/test) basis. + int numRefine = 2; + + // Number for p-refinement of the computational (trial/test) basis. + int numElevate = 0; + + // Elevate and p-refine the basis to order k + numElevate + // where k is the highest degree in the bases + if ( numElevate > -1 ) + { + // Find maximum degree with respect to all the variables + int tmp = refine_bases.maxDegree(0); + for (short_t j = 1; j < patches.parDim(); ++j ) + if ( tmp < refine_bases.maxDegree(j) ) + tmp = refine_bases.maxDegree(j); + + // Elevate all degrees uniformly + tmp += numElevate; + refine_bases.setDegree(tmp); + } + + // h-refine the basis + for (int i = 0; i < numRefine; ++i) + refine_bases.uniformRefine(); + + // Determines the theta-scheme used for time integration + // (eg. 
Forward/backward Euler or Crank Nicolson(theta=0.5) + real_t theta = 0.5; + + gsPoissonPde<> pde(patches, bcInfo, f); + // Assembler (constructs stationary matrix and right-hand side vector) + gsPoissonAssembler<> stationary(pde, refine_bases); + stationary.options().setInt("DirichletStrategy", dirichlet::elimination); + stationary.options().setInt("InterfaceStrategy", iFace::glue); + gsHeatEquation assembler(stationary); + assembler.setTheta(theta); + gsInfo<::CGDiagonal solver; + + // Generate system matrix and load vector + gsInfo<<"Assembling mass and stiffness...\n"; + assembler.assemble(); + + gsMatrix<> Sol, Rhs; + int ndof = assembler.numDofs(); + real_t endTime = 0.1; + int numSteps = 40; + Sol.setZero(ndof, 1); // Initial solution + + real_t Dt = endTime / numSteps ; + + const std::string baseName("heat_eq_solution"); + gsParaviewCollection collection(baseName); + + std::string fileName; + + if ( plot ) + { + //sol = assembler.constructSolution(Sol); // same as next line + gsField<> sol = stationary.constructSolution(Sol); + fileName = baseName + "0"; + gsWriteParaview<>(sol, fileName, 1000, true); + collection.addTimestep(fileName,0,"0.vts"); + } + + for ( int i = 1; i<=numSteps; ++i) // for all timesteps + { + // Compute the system for the timestep i (rhs is assumed constant wrt time) + assembler.nextTimeStep(Sol, Dt); + gsInfo<<"Solving timestep "<< i*Dt<<".\n"; + + // Solve for current timestep, overwrite previous solution + Sol = solver.compute( assembler.matrix() ).solve( assembler.rhs() ); + + // Obtain current solution as an isogeometric field + //sol = assembler.constructSolution(Sol); // same as next line + gsField<> sol = stationary.constructSolution(Sol); + + if ( plot ) + { + // Plot the snapshot to paraview + fileName = baseName + util::to_string(i); + gsWriteParaview<>(sol, fileName, 1000, true); + collection.addTimestep(fileName,i,"0.vts"); + } + } + + //gsInfo<< " time = "< Date: Wed, 25 Nov 2020 13:36:06 +0100 Subject: [PATCH 012/174] 
example deleted --- examples/heatEquation_example 2.cpp | 145 ---------------------------- 1 file changed, 145 deletions(-) delete mode 100644 examples/heatEquation_example 2.cpp diff --git a/examples/heatEquation_example 2.cpp b/examples/heatEquation_example 2.cpp deleted file mode 100644 index fdd1b0462a..0000000000 --- a/examples/heatEquation_example 2.cpp +++ /dev/null @@ -1,145 +0,0 @@ -/** @file heatEquation_example.cpp - - @brief Solves the heat equation using time-stepping - - This file is part of the G+Smo library. - - This Source Code Form is subject to the terms of the Mozilla Public - License, v. 2.0. If a copy of the MPL was not distributed with this - file, You can obtain one at http://mozilla.org/MPL/2.0/. - - Author(s): S. Moore, A. Mantzaflaris -*/ - -#include - - -using namespace gismo; - -int main(int argc, char *argv[]) -{ - bool plot = false; - gsCmdLine cmd("Testing the heat equation."); - cmd.addSwitch("plot", "Plot the result in ParaView.", plot); - try { cmd.getValues(argc,argv); } catch (int rv) { return rv; } - - // Source function - gsConstantFunction<> f(0,2); - gsInfo<<"Source function is: "<< f << "\n"; - - // Define Geometry, must be a gsMultiPatch object - gsMultiPatch<> patches(*gsNurbsCreator<>::BSplineSquareDeg(2)); - patches.computeTopology(); - - // Boundary conditions - gsBoundaryConditions<> bcInfo; - gsConstantFunction<> g_N(1,2); // Neumann - gsConstantFunction<> g_D(0,2); // (Dirichlet - bcInfo.addCondition(0, boundary::west, condition_type::neumann , &g_N); - bcInfo.addCondition(0, boundary::east, condition_type::dirichlet, &g_D); - bcInfo.addCondition(0, boundary::north, condition_type::dirichlet, &g_D); - bcInfo.addCondition(0, boundary::south, condition_type::dirichlet, &g_D); - - gsMultiBasis<> refine_bases( patches ); - // Number for h-refinement of the computational (trial/test) basis. - int numRefine = 2; - - // Number for p-refinement of the computational (trial/test) basis. 
- int numElevate = 0; - - // Elevate and p-refine the basis to order k + numElevate - // where k is the highest degree in the bases - if ( numElevate > -1 ) - { - // Find maximum degree with respect to all the variables - int tmp = refine_bases.maxDegree(0); - for (short_t j = 1; j < patches.parDim(); ++j ) - if ( tmp < refine_bases.maxDegree(j) ) - tmp = refine_bases.maxDegree(j); - - // Elevate all degrees uniformly - tmp += numElevate; - refine_bases.setDegree(tmp); - } - - // h-refine the basis - for (int i = 0; i < numRefine; ++i) - refine_bases.uniformRefine(); - - // Determines the theta-scheme used for time integration - // (eg. Forward/backward Euler or Crank Nicolson(theta=0.5) - real_t theta = 0.5; - - gsPoissonPde<> pde(patches, bcInfo, f); - // Assembler (constructs stationary matrix and right-hand side vector) - gsPoissonAssembler<> stationary(pde, refine_bases); - stationary.options().setInt("DirichletStrategy", dirichlet::elimination); - stationary.options().setInt("InterfaceStrategy", iFace::glue); - gsHeatEquation assembler(stationary); - assembler.setTheta(theta); - gsInfo<::CGDiagonal solver; - - // Generate system matrix and load vector - gsInfo<<"Assembling mass and stiffness...\n"; - assembler.assemble(); - - gsMatrix<> Sol, Rhs; - int ndof = assembler.numDofs(); - real_t endTime = 0.1; - int numSteps = 40; - Sol.setZero(ndof, 1); // Initial solution - - real_t Dt = endTime / numSteps ; - - const std::string baseName("heat_eq_solution"); - gsParaviewCollection collection(baseName); - - std::string fileName; - - if ( plot ) - { - //sol = assembler.constructSolution(Sol); // same as next line - gsField<> sol = stationary.constructSolution(Sol); - fileName = baseName + "0"; - gsWriteParaview<>(sol, fileName, 1000, true); - collection.addTimestep(fileName,0,"0.vts"); - } - - for ( int i = 1; i<=numSteps; ++i) // for all timesteps - { - // Compute the system for the timestep i (rhs is assumed constant wrt time) - assembler.nextTimeStep(Sol, Dt); - 
gsInfo<<"Solving timestep "<< i*Dt<<".\n"; - - // Solve for current timestep, overwrite previous solution - Sol = solver.compute( assembler.matrix() ).solve( assembler.rhs() ); - - // Obtain current solution as an isogeometric field - //sol = assembler.constructSolution(Sol); // same as next line - gsField<> sol = stationary.constructSolution(Sol); - - if ( plot ) - { - // Plot the snapshot to paraview - fileName = baseName + util::to_string(i); - gsWriteParaview<>(sol, fileName, 1000, true); - collection.addTimestep(fileName,i,"0.vts"); - } - } - - //gsInfo<< " time = "< Date: Wed, 25 Nov 2020 13:42:59 +0100 Subject: [PATCH 013/174] Example using expression assembler added --- examples/heatEquation_example2.cpp | 133 +++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 examples/heatEquation_example2.cpp diff --git a/examples/heatEquation_example2.cpp b/examples/heatEquation_example2.cpp new file mode 100644 index 0000000000..d6742dba64 --- /dev/null +++ b/examples/heatEquation_example2.cpp @@ -0,0 +1,133 @@ +/** @file heatEquation_example.cpp + + @brief Solves the heat equation using time-stepping + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): S. Moore, A. 
Mantzaflaris +*/ + +#include + + +using namespace gismo; + +int main(int argc, char *argv[]) +{ + gsCmdLine cmd("Testing the heat equation."); + // Source function + gsConstantFunction<> f(1,2); + gsInfo<<"Source function is: "<< f << "\n"; + + // Define Geometry, must be a gsMultiPatch object + gsMultiPatch<> patches(*gsNurbsCreator<>::BSplineSquareDeg(2)); + patches.computeTopology(); + + // Boundary conditions + gsBoundaryConditions<> bcInfo; + gsConstantFunction<> g_N(1,2); // Neumann + gsConstantFunction<> g_D(0,2); // Dirichlet + bcInfo.addCondition(0, boundary::west, condition_type::neumann , &g_N); + bcInfo.addCondition(0, boundary::east, condition_type::dirichlet, &g_D); + bcInfo.addCondition(0, boundary::north, condition_type::dirichlet, &g_D); + bcInfo.addCondition(0, boundary::south, condition_type::dirichlet, &g_D); + + gsMultiBasis<> refine_bases( patches ); + + // Number for h-refinement of the computational (trial/test) basis. + int numRefine = 2; + + // Number for p-refinement of the computational (trial/test) basis. 
+ int numElevate = 0; + + // Elevate and p-refine the basis to order k + numElevate + // where k is the highest degree in the bases + if ( numElevate > -1 ) + { + // Find maximum degree with respect to all the variables + int tmp = refine_bases.maxDegree(0); + for (short_t j = 1; j < patches.parDim(); ++j ) + if ( tmp < refine_bases.maxDegree(j) ) + tmp = refine_bases.maxDegree(j); + + // Elevate all degrees uniformly + tmp += numElevate; + refine_bases.setDegree(tmp); + } + + // h-refine the basis + for (int i = 0; i < numRefine; ++i) + refine_bases.uniformRefine(); + + // A Conjugate Gradient linear solver with a diagonal (Jacobi) preconditionner + gsSparseSolver<>::CGDiagonal solver; + + real_t theta = 0.0; + gsMatrix<> Sol; + real_t endTime = 0.1; + int numSteps = 40; + + real_t Dt = endTime / numSteps ; + + const std::string baseName("heat_eq_solution"); + gsParaviewCollection collection(baseName); + + std::string fileName; + + // Generate system matrix and load vector + gsInfo<<"Assembling mass and stiffness...\n"; + + gsExprAssembler<> K(1,1); + gsExprAssembler<> M(1,1); + + typedef gsExprAssembler<>::geometryMap geometryMap; + typedef gsExprAssembler<>::variable variable; + typedef gsExprAssembler<>::space space; + typedef gsExprAssembler<>::solution solution; + + K.setIntegrationElements(refine_bases); + M.setIntegrationElements(refine_bases); + gsExprEvaluator<> ev_K(K); + gsExprEvaluator<> ev_M(M); + + // Set the geometry map + geometryMap G_K = K.getMap(patches); + geometryMap G_M = M.getMap(patches); + + // Set the discretization space + space u_K = K.getSpace(refine_bases); + space u_M = M.getSpace(refine_bases); + u_K.setInterfaceCont(0); + u_M.setInterfaceCont(0); + u_K.addBc( bcInfo.get("Dirichlet") ); + u_M.addBc( bcInfo.get("Dirichlet") ); + + // Set the source term + variable ff_K = K.getCoeff(f, G_K); + variable ff_M = M.getCoeff(f, G_M); + + K.initSystem(); + M.initSystem(); + K.assemble( igrad(u_K, G_K) * igrad(u_K, G_K).tr() * meas(G_K), u_K 
* ff_K * meas(G_K) ); + M.assemble( u_M * u_M.tr() * meas(G_M), u_M * ff_M * meas(G_M) ); + + // Enforce Neumann conditions to right-hand side + variable g_Neumann = K.getBdrFunction(); + K.assembleRhsBc(u_K * g_Neumann.val() * nv(G_K).norm(), bcInfo.neumannSides() ); + + for ( int i = 1; i<=numSteps; ++i) // for all timesteps + { + // Compute the system for the timestep i (rhs is assumed constant wrt time) + gsInfo<<"Solving timestep "<< i*Dt<<".\n"; + Sol = solver.compute(M.matrix()+Dt*theta*K.matrix()).solve(Dt*K.rhs()+(M.matrix()-Dt*(1-theta)*K.matrix())*Sol); + } + + gsInfo << "Norm of the solution" << std::endl; + gsInfo << Sol.norm() << std::endl; + + return EXIT_SUCCESS; +} From 18ec8ae8e4c61a5cc77b075cc90db39c0beb965c Mon Sep 17 00:00:00 2001 From: roeltielen Date: Fri, 27 Nov 2020 10:16:41 +0100 Subject: [PATCH 014/174] heatEquation_example2 merged into xbraid_heatEquation_example --- examples/xbraid_heatEquation_example.cpp | 106 ++++++++++++++++++++++- 1 file changed, 103 insertions(+), 3 deletions(-) diff --git a/examples/xbraid_heatEquation_example.cpp b/examples/xbraid_heatEquation_example.cpp index 6425cdef75..6f025916d9 100644 --- a/examples/xbraid_heatEquation_example.cpp +++ b/examples/xbraid_heatEquation_example.cpp @@ -39,12 +39,12 @@ class gsXBraid_app : public gsXBraid std::string fn("pde/poisson2d_bvp.xml"); // Spatial discretisation parameters - index_t numRefine = 5; + index_t numRefine = 2; index_t numElevate = 0; // Temporal discretisation parameters - index_t numTime = 1; - T tfinal = 1.0; + index_t numTime = 40; + T tfinal = 0.1; // Parallel-in-time multigrid parameters index_t CFactor = 2; @@ -133,6 +133,106 @@ class gsXBraid_app : public gsXBraid if (sequential) app.SetSeqSoln(1); else app.SetSeqSoln(0); if (skip) app.SetSkip(1); else app.SetSkip(0); + ///////////////////////////////////////////////////////////////////////////////////////////// + // Code for heat equation starts here // + 
///////////////////////////////////////////////////////////////////////////////////////////// + + // Source function + gsConstantFunction<> f(1,2); + gsInfo<<"Source function is: "<< f << "\n"; + + // Define Geometry, must be a gsMultiPatch object + gsMultiPatch<> patches(*gsNurbsCreator<>::BSplineSquareDeg(2)); + patches.computeTopology(); + + // Boundary conditions + gsBoundaryConditions<> bcInfo; + gsConstantFunction<> g_N(1,2); // Neumann + gsConstantFunction<> g_D(0,2); // Dirichlet + bcInfo.addCondition(0, boundary::west, condition_type::neumann , &g_N); + bcInfo.addCondition(0, boundary::east, condition_type::dirichlet, &g_D); + bcInfo.addCondition(0, boundary::north, condition_type::dirichlet, &g_D); + bcInfo.addCondition(0, boundary::south, condition_type::dirichlet, &g_D); + + gsMultiBasis<> refine_bases( patches ); + + // Elevate and p-refine the basis to order k + numElevate + // where k is the highest degree in the bases + if ( numElevate > -1 ) + { + // Find maximum degree with respect to all the variables + int tmp = refine_bases.maxDegree(0); + for (short_t j = 1; j < patches.parDim(); ++j ) + if ( tmp < refine_bases.maxDegree(j) ) + tmp = refine_bases.maxDegree(j); + + // Elevate all degrees uniformly + tmp += numElevate; + refine_bases.setDegree(tmp); + } + + // h-refine the basis + for (int i = 0; i < numRefine; ++i) + refine_bases.uniformRefine(); + + // A Conjugate Gradient linear solver with a diagonal (Jacobi) preconditionner + gsSparseSolver<>::CGDiagonal solver; + + real_t theta = 0.0; + gsMatrix<> Sol; + real_t Dt = tfinal / numTime ; + + // Generate system matrix and load vector + gsInfo<<"Assembling mass and stiffness...\n"; + + gsExprAssembler<> K(1,1); + gsExprAssembler<> M(1,1); + + typedef gsExprAssembler<>::geometryMap geometryMap; + typedef gsExprAssembler<>::variable variable; + typedef gsExprAssembler<>::space space; + typedef gsExprAssembler<>::solution solution; + + K.setIntegrationElements(refine_bases); + 
M.setIntegrationElements(refine_bases); + gsExprEvaluator<> ev_K(K); + gsExprEvaluator<> ev_M(M); + + // Set the geometry map + geometryMap G_K = K.getMap(patches); + geometryMap G_M = M.getMap(patches); + + // Set the discretization space + space u_K = K.getSpace(refine_bases); + space u_M = M.getSpace(refine_bases); + u_K.setInterfaceCont(0); + u_M.setInterfaceCont(0); + u_K.addBc( bcInfo.get("Dirichlet") ); + u_M.addBc( bcInfo.get("Dirichlet") ); + + // Set the source term + variable ff_K = K.getCoeff(f, G_K); + variable ff_M = M.getCoeff(f, G_M); + + K.initSystem(); + M.initSystem(); + K.assemble( igrad(u_K, G_K) * igrad(u_K, G_K).tr() * meas(G_K), u_K * ff_K * meas(G_K) ); + M.assemble( u_M * u_M.tr() * meas(G_M), u_M * ff_M * meas(G_M) ); + + // Enforce Neumann conditions to right-hand side + variable g_Neumann = K.getBdrFunction(); + K.assembleRhsBc(u_K * g_Neumann.val() * nv(G_K).norm(), bcInfo.neumannSides() ); + + for ( int i = 1; i<=numTime; ++i) // for all timesteps + { + // Compute the system for the timestep i (rhs is assumed constant wrt time) + gsInfo<<"Solving timestep "<< i*Dt<<".\n"; + Sol = solver.compute(M.matrix()+Dt*theta*K.matrix()).solve(Dt*K.rhs()+(M.matrix()-Dt*(1-theta)*K.matrix())*Sol); + } + + gsInfo << "Norm of the solution" << std::endl; + gsInfo << Sol.norm() << std::endl; + return app; } From d4782a17f655d19768d3749e5e63f39ff91c9524 Mon Sep 17 00:00:00 2001 From: roeltielen Date: Wed, 2 Dec 2020 09:57:32 +0100 Subject: [PATCH 015/174] Variables needed in Init,Step etc made private --- examples/xbraid_heatEquation_example.cpp | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/examples/xbraid_heatEquation_example.cpp b/examples/xbraid_heatEquation_example.cpp index 6f025916d9..d8084f3128 100644 --- a/examples/xbraid_heatEquation_example.cpp +++ b/examples/xbraid_heatEquation_example.cpp @@ -26,6 +26,15 @@ namespace gismo { template class gsXBraid_app : public gsXBraid { + private: + // 
Variables, matrices that should be accessible in Step, Init etc. + real_t theta; + real_t Dt; + gsMatrix<> Sol; + gsSparseMatrix<> Stiffness_matrix; + gsSparseMatrix<> Mass_matrix; + gsMatrix<> Rhs; + public: /// Inherit all constructors from base class using gsXBraid::gsXBraid; @@ -223,6 +232,10 @@ class gsXBraid_app : public gsXBraid variable g_Neumann = K.getBdrFunction(); K.assembleRhsBc(u_K * g_Neumann.val() * nv(G_K).norm(), bcInfo.neumannSides() ); + gsSparseMatrix<> Stiffness_matrix = K.matrix(); + gsSparseMatrix<> Mass_matrix = M.matrix(); + gsMatrix<> Rhs = K.rhs(); + for ( int i = 1; i<=numTime; ++i) // for all timesteps { // Compute the system for the timestep i (rhs is assumed constant wrt time) @@ -244,7 +257,10 @@ class gsXBraid_app : public gsXBraid braid_Vector ustop, braid_Vector fstop, BraidStepStatus &pstatus) override - {} + { + gsSparseSolver<>::CGDiagonal solver; + Sol = solver.compute(Mass_matrix+Dt*theta*Stiffness_matrix).solve(Dt*Rhs+(Mass_matrix-Dt*(1-theta)*Stiffness_matrix)*Sol); + } int Clone(braid_Vector u, braid_Vector *v_ptr) override @@ -289,7 +305,9 @@ class gsXBraid_app : public gsXBraid int Residual(braid_Vector u, braid_Vector r, BraidStepStatus &pstatus) override - {} + { + + } // Not needed in this example int Coarsen(braid_Vector fu, From 29bcc70b8db6a1ca1fabed686f045a6a172c0462 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Tue, 19 Jan 2021 14:53:08 +0100 Subject: [PATCH 016/174] Complete implementation of gsXBraid extension --- extensions/gsXBraid/gsXBraid.h | 506 +++++++++++++++++++++---------- extensions/gsXBraid/gsXBraid.hpp | 16 +- 2 files changed, 352 insertions(+), 170 deletions(-) diff --git a/extensions/gsXBraid/gsXBraid.h b/extensions/gsXBraid/gsXBraid.h index 95b77a0d64..511474cab5 100644 --- a/extensions/gsXBraid/gsXBraid.h +++ b/extensions/gsXBraid/gsXBraid.h @@ -23,6 +23,12 @@ #include namespace gismo { + + class gsXBraidAccessStatus; + class gsXBraidSyncStatus; + class gsXBraidStepStatus; + class 
gsXBraidCoarsenRefStatus; + class gsXBraidBufferStatus; /** \brief Class defining the XBraid wrapper @@ -34,211 +40,224 @@ namespace gismo { public: /// Constructor gsXBraid(const gsMpiComm& comm, - const T& tstart, - const T& tstop, - int ntime); + const braid_Real tstart, + const braid_Real tstop, + braid_Int ntime); /// Destructor virtual ~gsXBraid(); - // Performs one time step - virtual int Step(braid_Vector u, - braid_Vector ustop, - braid_Vector fstop, - BraidStepStatus &pstatus) = 0; - - // Clones the given vectors - virtual int Clone(braid_Vector u, - braid_Vector *v_ptr) = 0 ; - - // Initializes the given vector - virtual int Init(T t, - braid_Vector *u_ptr) = 0; - - // Fianlizes the given vector - virtual int Free(braid_Vector u) = 0; - - // Computes the weighted sum of two given vectors - virtual int Sum(T alpha, - braid_Vector x, - T beta, - braid_Vector y) = 0; - - // Computes the spatial norm of the given vector - virtual int SpatialNorm(braid_Vector u, - T *norm_ptr) = 0; - - // Computes the buffer size - virtual int BufSize(index_t *size_ptr, - BraidBufferStatus &status) = 0; - - // Packes the given vector into the given buffer - virtual int BufPack(braid_Vector u, - void *buffer, - BraidBufferStatus &status) = 0; - - // Unpacks the given buffer into the given vector - virtual int BufUnpack(void *buffer, - braid_Vector *u_ptr, - BraidBufferStatus &status) = 0; - - // Accesses the given vector - virtual int Access(braid_Vector u, - BraidAccessStatus &astatus) = 0; - - // Calculates the residual - virtual int Residual(braid_Vector u, - braid_Vector r, - BraidStepStatus &pstatus) = 0; + /// Performs one time step + virtual braid_Int Step(braid_Vector u, + braid_Vector ustop, + braid_Vector fstop, + BraidStepStatus &pstatus) = 0; + + /// Clones the given vectors + virtual braid_Int Clone(braid_Vector u, + braid_Vector *v_ptr) = 0 ; + + /// Initializes the given vector + virtual braid_Int Init(braid_Real t, + braid_Vector *u_ptr) = 0; + + /// Fianlizes 
the given vector + virtual braid_Int Free(braid_Vector u) = 0; + + /// Computes the weighted sum of two given vectors + virtual braid_Int Sum(braid_Real alpha, + braid_Vector x, + braid_Real beta, + braid_Vector y) = 0; + + /// Computes the spatial norm of the given vector + virtual braid_Int SpatialNorm(braid_Vector u, + braid_Real *norm_ptr) = 0; + + /// Computes the buffer size + virtual braid_Int BufSize(index_t *size_ptr, + BraidBufferStatus &status) = 0; + + /// Packes the given vector into the given buffer + virtual braid_Int BufPack(braid_Vector u, + void *buffer, + BraidBufferStatus &status) = 0; + + /// Unpacks the given buffer into the given vector + virtual braid_Int BufUnpack(void *buffer, + braid_Vector *u_ptr, + BraidBufferStatus &status) = 0; + + /// Accesses the given vector + virtual braid_Int Access(braid_Vector u, + BraidAccessStatus &astatus) = 0; + + /// Calculates the residual + virtual braid_Int Residual(braid_Vector u, + braid_Vector r, + BraidStepStatus &pstatus) = 0; /// Performs coarsening in time - virtual int Coarsen(braid_Vector fu, - braid_Vector *cu_ptr, - BraidCoarsenRefStatus &status) = 0; + virtual braid_Int Coarsen(braid_Vector fu, + braid_Vector *cu_ptr, + BraidCoarsenRefStatus &status) = 0; /// Performs refinement in time - virtual int Refine(braid_Vector cu, - braid_Vector *fu_ptr, - BraidCoarsenRefStatus &status) = 0; - + virtual braid_Int Refine(braid_Vector cu, + braid_Vector *fu_ptr, + BraidCoarsenRefStatus &status) = 0; + /// Runs the parallel-in-time multigrid solver void solve() { core.Drive(); } public: - // Sets the maximum number of multigrid levels. - void SetMaxLevels(int max_levels) { core.SetMaxLevels(max_levels); } + /// Sets the maximum number of multigrid levels. + void SetMaxLevels(braid_Int max_levels) { core.SetMaxLevels(max_levels); } - // Increases the max number of multigrid levels after performing a refinement. + /// Increases the max number of multigrid levels after performing a refinement. 
void SetIncrMaxLevels() { core.SetIncrMaxLevels(); } - // Sets whether to skip all work on the first down cycle (skip = 1). On by default. - void SetSkip(int skip) { core.SetSkip(skip); } + /// Sets whether to skip all work on the first down cycle (skip = 1). On by default. + void SetSkip(braid_Int skip) { core.SetSkip(skip); } - // Sets the minimum allowed coarse grid size. gsXBraid stops - // coarsening whenever creating the next coarser grid will result - // in a grid smaller than min_coarse. The maximum possible coarse - // grid size will be min_coarse*coarsening_factor. - void SetMinCoarse(int min_coarse) { core.SetMinCoarse(min_coarse); } + /// Sets the minimum allowed coarse grid size. gsXBraid stops + /// coarsening whenever creating the next coarser grid will result + /// in a grid smaller than min_coarse. The maximum possible coarse + /// grid size will be min_coarse*coarsening_factor. + void SetMinCoarse(braid_Int min_coarse) { core.SetMinCoarse(min_coarse); } - // Sets the number of relaxation sweeps *nrelax* on grid - // *level*. Level 0 is the finest grid. One sweep is a CF - // relaxation sweep. - void SetNRelax(int level, int nrelax) { core.SetNRelax(level, nrelax); } + /// Sets the number of relaxation sweeps *nrelax* on grid + /// *level*. Level 0 is the finest grid. One sweep is a CF + /// relaxation sweep. + void SetNRelax(braid_Int level, braid_Int nrelax) { core.SetNRelax(level, nrelax); } - // Sets the number of relaxation sweeps *nrelax* on all grid - // levels. One sweep is a CF relaxation sweep. - void SetNRelax(int nrelax) { core.SetNRelax(-1, nrelax); } + /// Sets the number of relaxation sweeps *nrelax* on all grid + /// levels. One sweep is a CF relaxation sweep. + void SetNRelax(braid_Int nrelax) { core.SetNRelax(-1, nrelax); } - // Sets absolute stopping tolerance. - void SetAbsTol(T tol) { core.SetAbsTol(tol); } + /// Sets absolute stopping tolerance. 
+ void SetAbsTol(braid_Real tol) { core.SetAbsTol(tol); } - // Sets relative stopping tolerance. - void SetRelTol(T tol) { core.SetRelTol(tol); } + /// Sets relative stopping tolerance. + void SetRelTol(braid_Real tol) { core.SetRelTol(tol); } - // Sets the temporal norm: 1-norm (1), 2-norm (2:default), inf-norm (3) - void SetTemporalNorm(int tnorm) { core.SetTemporalNorm(tnorm); } + /// Sets the temporal norm: 1-norm (1), 2-norm (2:default), inf-norm (3) + void SetTemporalNorm(braid_Int tnorm) { core.SetTemporalNorm(tnorm); } - // Sets the coarsening factor *cfactor* on grid *level* (default is 2) - void SetCFactor(int level, int cfactor) { core.SetCFactor(level, cfactor); } + /// Sets the coarsening factor *cfactor* on grid *level* (default is 2) + void SetCFactor(braid_Int level, braid_Int cfactor) { core.SetCFactor(level, cfactor); } - // Sets the coarsening factor *cfactor* on all grid levels - void SetCFactor( int cfactor) { core.SetCFactor(-1, cfactor); } + /// Sets the coarsening factor *cfactor* on all grid levels + void SetCFactor(braid_Int cfactor) { core.SetCFactor(-1, cfactor); } - // Sets periodic time grid (default is 0) - void SetPeriodic(int periodic) { core.SetPeriodic(periodic); } + /// Sets periodic time grid (default is 0) + void SetPeriodic(braid_Int periodic) { core.SetPeriodic(periodic); } - // Sets max number of multigrid iterations. - void SetMaxIter(int max_iter) { core.SetMaxIter(max_iter); } + /// Sets max number of multigrid iterations. + void SetMaxIter(braid_Int max_iter) { core.SetMaxIter(max_iter); } - // Sets the print level for runtime print message. - // - Level 0: no output - // - Level 1: print runtime information like the residual history - // - Level 2: level 1 output, plus post-Braid run statistics (default) - // - Level 3: level 2 output, plus debug level output. - void SetPrintLevel(int print_level) { core.SetPrintLevel(print_level); } + /// Sets the print level for runtime print message. 
+ /// - Level 0: no output + /// - Level 1: print runtime information like the residual history + /// - Level 2: level 1 output, plus post-Braid run statistics (default) + /// - Level 3: level 2 output, plus debug level output. + void SetPrintLevel(braid_Int print_level) { core.SetPrintLevel(print_level); } - // Sets the output file for runtime print message. + /// Sets the output file for runtime print message. void SetPrintFile(const char *printfile_name) { core.SetPrintFile(printfile_name); } - // Sets the initial guess to gsXBraid as the sequential time stepping solution. - // - 0: The user's Init() function initializes the state vector (default) - // - 1: Sequential time stepping, with the user's initial condition from - // Init(t=0) initializes the state vector - void SetSeqSoln(int use_seq_soln) { core.SetSeqSoln(use_seq_soln); } - - // Sets the acces level for gsXBraid. This controls how often the - // user's access routine is called. - // - Level 0: Never call the user's access routine - // - Level 1: Only call the user's access routine after gsXBraid is finished (default) - // - Level 2: Call the user's access routine every iteration and on every level. - // This is during _braid_FRestrict, during the down-cycle part of a - // gsXBraid iteration. - void SetAccessLevel(int access_level) { core.SetAccessLevel(access_level); } - - // Sets FMG (F-cycle) + /// Sets the initial guess to gsXBraid as the sequential time stepping solution. + /// - 0: The user's Init() function initializes the state vector (default) + /// - 1: Sequential time stepping, with the user's initial condition from + /// Init(t=0) initializes the state vector + void SetSeqSoln(braid_Int use_seq_soln) { core.SetSeqSoln(use_seq_soln); } + + /// Sets the acces level for gsXBraid. This controls how often the + /// user's access routine is called. 
+ /// - Level 0: Never call the user's access routine + /// - Level 1: Only call the user's access routine after gsXBraid is finished (default) + /// - Level 2: Call the user's access routine every iteration and on every level. + /// This is during _braid_FRestrict, during the down-cycle part of a + /// gsXBraid iteration. + void SetAccessLevel(braid_Int access_level) { core.SetAccessLevel(access_level); } + + /// Sets FMG (F-cycle) void SetFMG() { core.SetFMG(); } - // Sets the number of initial F-cycles to do before switching to V-cycles - void SetNFMG(int k) { core.SetNFMG(k); } - - // Sets the number of V-cycles to do at each FMG level (default is 1) - void SetNFMGVcyc(int nfmg_Vcyc) { core.SetNFMGVcyc(nfmg_Vcyc); } - - // Sets the storage properties of the code. - // -1 : Default, store only C-points - // 0 : Full storage of C- and F-Points on all levels - // x > 0 : Full storage on all levels >= x - void SetStorage(int storage) { core.SetStorage(storage); } - - // Turns time refinement on (refine = 1) or off (refine = 0). - void SetRefine(int refine) {core.SetRefine(refine);} - - // Sets the max number of time grid refinement levels allowed. - void SetMaxRefinements(int max_refinements) {core.SetMaxRefinements(max_refinements);} - - // Turns on built-in Richardson-based error estimation and/or - // extrapolation with gsXBraid. When enabled, the Richardson - // extrapolation (RE) option (richardson == 1) is used to improve - // the accuracy of the solution at the C-points on the finest - // level. When the built-in error estimate option is turned on - // (est_error == 1), RE is used to estimate the local truncation - // error at each point. These estimates can be accessed through - // StepStatus and AccessStatus functions. The last parameter is - // local_order, which represents the LOCAL order of the* time - // integration scheme. e.g. local_order = 2 for Backward Euler. 
- // Also, the Richardson error estimate is only available after - // roughly 1 Braid iteration. The estimate is given a dummy value - // of -1.0, until an actual estimate is available. Thus after an - // adaptive refinement, and a new hierarchy is formed, another - // iteration must pass before the error estimates are available - // again. - void SetRichardsonEstimation(int est_error, int richardson, int local_order) { core.SetRichardsonEstimation(est_error, richardson, local_order); } + /// Sets the number of initial F-cycles to do before switching to V-cycles + void SetNFMG(braid_Int k) { core.SetNFMG(k); } + + /// Sets the number of V-cycles to do at each FMG level (default is 1) + void SetNFMGVcyc(braid_Int nfmg_Vcyc) { core.SetNFMGVcyc(nfmg_Vcyc); } + + /// Sets the storage properties of the code. + /// -1 : Default, store only C-points + /// 0 : Full storage of C- and F-Points on all levels + /// x > 0 : Full storage on all levels >= x + void SetStorage(braid_Int storage) { core.SetStorage(storage); } + + /// Turns time refinement on (refine = 1) or off (refine = 0). + void SetRefine(braid_Int refine) {core.SetRefine(refine);} + + /// Sets the max number of time grid refinement levels allowed. + void SetMaxRefinements(braid_Int max_refinements) {core.SetMaxRefinements(max_refinements);} + + /// Turns on built-in Richardson-based error estimation and/or + /// extrapolation with gsXBraid. When enabled, the Richardson + /// extrapolation (RE) option (richardson == 1) is used to improve + /// the accuracy of the solution at the C-points on the finest + /// level. When the built-in error estimate option is turned on + /// (est_error == 1), RE is used to estimate the local truncation + /// error at each point. These estimates can be accessed through + /// StepStatus and AccessStatus functions. The last parameter is + /// local_order, which represents the LOCAL order of the* time + /// integration scheme. e.g. local_order = 2 for Backward Euler. 
+ /// Also, the Richardson error estimate is only available after + /// roughly 1 Braid iteration. The estimate is given a dummy value + /// of -1.0, until an actual estimate is available. Thus after an + /// adaptive refinement, and a new hierarchy is formed, another + /// iteration must pass before the error estimates are available + /// again. + void SetRichardsonEstimation(braid_Int est_error, braid_Int richardson, braid_Int local_order) + { core.SetRichardsonEstimation(est_error, richardson, local_order); } public: - // Sets user-defined residual routine. + /// Sets user-defined residual routine. void SetResidual() { core.SetResidual(); } - // Sets user-defined coarsening and refinement routine. + /// Sets user-defined coarsening and refinement routine. void SetSpatialCoarsenAndRefine() { core.SetSpatialCoarsenAndRefine(); } - // Sets user-defined sync routine. + /// Sets user-defined sync routine. void SetSync() { core.SetSync(); } public: - void GetNumIter(int *niter_ptr) { core.GetNumIter(niter_ptr); } - - void GetRNorms(int *nrequest_ptr, double *rnorms) { core.GetRNorms(nrequest_ptr, rnorms); } - - void GetNLevels(int *nlevels_ptr) { core.GetNLevels(nlevels_ptr); } + /// Gets the number of iterations (XBraid style) + void GetNumIter(braid_Int *niter_ptr) { core.GetNumIter(niter_ptr); } + + /// Gets the residual norm (XBraid style) + void GetRNorms(braid_Int *nrequest_ptr, braid_Real *rnorms) { core.GetRNorms(nrequest_ptr, rnorms); } - int iterations() { - int niter; + /// Gets the total number of levels (XBraid style) + void GetNLevels(braid_Int *nlevels_ptr) { core.GetNLevels(nlevels_ptr); } + + /// Returns the number of iterations + braid_Int iterations() { + braid_Int niter; GetNumIter(&niter); return niter; } - int levels() { - int nlevels; + /// Returns the residual norm + braid_Real rnorm(braid_Int nrequest) { + braid_Real norm; + GetRNorms(&nrequest, &norm); + return norm; + } + + /// Returns the total number of levels + braid_Int levels() { + 
braid_Int nlevels; GetNLevels(&nlevels); return nlevels; } @@ -247,6 +266,173 @@ namespace gismo { /// Braid Core object BraidCore core; }; + + /** + \brief Class defining the XBraid access status wrapper + + The wrapper provides all functionality of the BraidAccessStatus + class plus some functions that return the information by value + */ + class gsXBraidAccessStatus : public BraidAccessStatus + { + public: + /// Returns the number of iterations + braid_Int iterations() { + braid_Int iter; + GetIter(&iter); + return iter; + } + + /// Returns the current multigrid level + braid_Int level() { + braid_Int level; + GetLevel(&level); + return level; + } + + /// Returns the total number of multigrid levels + braid_Int levels() { + braid_Int nlevels; + GetNLevels(&nlevels); + return nlevels; + } + + /// Returns the total number of refinements + braid_Int refines() { + braid_Int nref; + GetNRefine(&nref); + return nref; + } + + /// Returns the total number of time instances + braid_Int times() { + braid_Int ntpoints; + GetNTPoints(&ntpoints); + return ntpoints; + } + + /// Returns true if XBraid has completed + bool done() { + braid_Int status; + GetDone(&status); + return bool(status); + } + + /// ??? + braid_Int callingFunction() { + braid_Int callingfcn; + GetCallingFunction(&callingfcn); + return callingfcn; + } + + /// Returns the current time instance + braid_Real time() { + braid_Real t; + GetT(&t); + return t; + } + + /// Returns the index of the time instance + braid_Int timeIndex() { + braid_Int tindex; + GetTIndex(&tindex); + return tindex; + } + + /// ??? 
+ braid_Int test() { + braid_Int wtest; + GetWrapperTest(&wtest); + return wtest; + } + + /// Returns the residual norm + braid_Real norm() { + braid_Real rnorm; + GetResidual(&rnorm); + return rnorm; + } + + /// Returns the estimated error + braid_Real error() { + braid_Real errorest; + GetSingleErrorEstAccess(&errorest); + return errorest; + } + }; + + /** + \brief Class defining the XBraid sync status wrapper + + The wrapper provides all functionality of the BraidSyncStatus + class plus some functions that return the information by value + */ + class gsXBraidSyncStatus : public BraidSyncStatus + { + public: + /// Returns the number of iterations + braid_Int iterations() { + braid_Int iter; + GetIter(&iter); + return iter; + } + + /// Returns the current multigrid level + braid_Int level() { + braid_Int level; + GetLevel(&level); + return level; + } + + /// Returns the total number of multigrid levels + braid_Int levels() { + braid_Int nlevels; + GetNLevels(&nlevels); + return nlevels; + } + + /// Returns the total number of refinements + braid_Int refines() { + braid_Int nref; + GetNRefine(&nref); + return nref; + } + + /// Returns the total number of time instances + braid_Int times() { + braid_Int ntpoints; + GetNTPoints(&ntpoints); + return ntpoints; + } + + /// Returns true if XBraid is completed + bool done() { + braid_Int status; + GetDone(&status); + return bool(status); + } + + /// ??? 
+ braid_Int callingFunction() { + braid_Int callingfcn; + GetCallingFunction(&callingfcn); + return callingfcn; + } + + /// Returns the estimated errors + braid_Real errors() { + braid_Real errorest; + GetAllErrorEst(&errorest); + return errorest; + } + + /// Returns the number of estimated errors + braid_Int nerrors() { + braid_Int numerrorest; + GetNumErrorEst(&numerrorest); + return numerrorest; + } + }; }// namespace gismo diff --git a/extensions/gsXBraid/gsXBraid.hpp b/extensions/gsXBraid/gsXBraid.hpp index b7a4369c77..2d0ff0dccf 100644 --- a/extensions/gsXBraid/gsXBraid.hpp +++ b/extensions/gsXBraid/gsXBraid.hpp @@ -20,20 +20,16 @@ namespace gismo { // Constructor template gsXBraid::gsXBraid(const gsMpiComm& comm, - const T& tstart, - const T& tstop, - int ntime) - : BraidApp(static_cast(comm), double(tstart), double(tstop), ntime), + const braid_Real tstart, + const braid_Real tstop, + braid_Int ntime) + : BraidApp(static_cast(comm), tstart, tstop, ntime), core(static_cast(comm), this) - { - std::cout << "gsXBraid constructor called\n"; - } + {} // Destructor template gsXBraid::~gsXBraid() - { - std::cout << "gsXBraid destructor called\n"; - } + {} }// namespace gismo From d84d9ab420cebe211d53d3409811eb49e567242c Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Tue, 19 Jan 2021 14:53:47 +0100 Subject: [PATCH 017/174] XBraid implementation of heat equation --- examples/xbraid_heatEquation_example.cpp | 461 ++++++++++++++--------- 1 file changed, 289 insertions(+), 172 deletions(-) diff --git a/examples/xbraid_heatEquation_example.cpp b/examples/xbraid_heatEquation_example.cpp index d8084f3128..f5f17e13c6 100644 --- a/examples/xbraid_heatEquation_example.cpp +++ b/examples/xbraid_heatEquation_example.cpp @@ -26,19 +26,145 @@ namespace gismo { template class gsXBraid_app : public gsXBraid { - private: - // Variables, matrices that should be accessible in Step, Init etc. 
- real_t theta; - real_t Dt; - gsMatrix<> Sol; - gsSparseMatrix<> Stiffness_matrix; - gsSparseMatrix<> Mass_matrix; - gsMatrix<> Rhs; +private: + // Spatial discretisation parameters + index_t numRefine, numElevate; + // Temporal discretisation parameters + index_t numTime; + T tstart, tstop, theta, tstep; + + // Spatial discretization + gsMultiPatch patches; + gsMultiBasis bases; + + // Boundary conditions + gsBoundaryConditions bcInfo; + gsConstantFunction g_D, g_N; + + // Expression assembler + gsExprAssembler K, M; + gsConstantFunction f; + + // Solution + gsMatrix sol; + + typedef typename gsExprAssembler::geometryMap geometryMap; + typedef typename gsExprAssembler::variable variable; + typedef typename gsExprAssembler::space space; + typedef typename gsExprAssembler::solution solution; + public: - /// Inherit all constructors from base class - using gsXBraid::gsXBraid; + /// Contructor + gsXBraid_app(const gsMpiComm& comm, + const T& tstart, + const T& tstop, + index_t numTime, + index_t numRefine, + index_t numElevate) + : gsXBraid::gsXBraid(comm, tstart, tstop, (int)numTime), + numRefine(numRefine), + numElevate(numElevate), + numTime(numTime), + tstart(tstart), + tstop(tstop), + theta(0.0), + tstep( (tstop-tstart)/numTime ), + patches(*gsNurbsCreator<>::BSplineSquareDeg(2)), + bases(patches), + g_D(0,2), g_N(1,2), + K(1,1), M(1,1), f(1,2) + { + ///////////////////////////////////////////////////////////////////////////////////////////// + // Code for heat equation starts here // + ///////////////////////////////////////////////////////////////////////////////////////////// + + // Source function + gsInfo << "Source function is: "<< f << "\n"; + + // Define Geometry, must be a gsMultiPatch object + patches.computeTopology(); + + // Boundary conditions + bcInfo.addCondition(0, boundary::west, condition_type::neumann , &g_N); + bcInfo.addCondition(0, boundary::east, condition_type::dirichlet, &g_D); + bcInfo.addCondition(0, boundary::north, 
condition_type::dirichlet, &g_D); + bcInfo.addCondition(0, boundary::south, condition_type::dirichlet, &g_D); + + // Elevate and p-refine the basis to order k + numElevate + // where k is the highest degree in the bases + if ( numElevate > -1 ) + { + // Find maximum degree with respect to all the variables + int tmp = bases.maxDegree(0); + for (short_t j = 1; j < patches.parDim(); ++j ) + if ( tmp < bases.maxDegree(j) ) + tmp = bases.maxDegree(j); + + // Elevate all degrees uniformly + tmp += numElevate; + bases.setDegree(tmp); + } + + // h-refine the basis + for (int i = 0; i < numRefine; ++i) + bases.uniformRefine(); + + // Generate system matrix and load vector + gsInfo << "Assembling mass and stiffness...\n"; + + // Set the basis + K.setIntegrationElements(bases); + M.setIntegrationElements(bases); + + // Set the geometry map + geometryMap G_K = K.getMap(patches); + geometryMap G_M = M.getMap(patches); + + // Set the discretization space + space u_K = K.getSpace(bases); + space u_M = M.getSpace(bases); + u_K.setInterfaceCont(0); + u_M.setInterfaceCont(0); + u_K.addBc( bcInfo.get("Dirichlet") ); + u_M.addBc( bcInfo.get("Dirichlet") ); + + // Set the source term + variable ff_K = K.getCoeff(f, G_K); + variable ff_M = M.getCoeff(f, G_M); + + // Initialize and assemble the system matrix + K.initSystem(); + K.assemble( igrad(u_K, G_K) * igrad(u_K, G_K).tr() * meas(G_K), u_K * ff_K * meas(G_K) ); + + // Initialize and assemble the mass matrix + M.initSystem(); + M.assemble( u_M * u_M.tr() * meas(G_M), u_M * ff_M * meas(G_M) ); + + // Enforce Neumann conditions to right-hand side + variable g_Neumann = K.getBdrFunction(); + K.assembleRhsBc(u_K * g_Neumann.val() * nv(G_K).norm(), bcInfo.neumannSides() ); + + gsSparseSolver<>::CGDiagonal solver; + sol.setZero(M.numDofs(), 1); + + for ( int i = 1; i<=numTime; ++i) // for all timesteps + { + // Compute the system for the timestep i (rhs is assumed constant wrt time) + gsInfo << "Solving timestep " << i*tstep << ".\n"; + 
sol = solver.compute(M.matrix() + + tstep*theta*K.matrix() + ).solve(tstep*K.rhs() + + (M.matrix()-tstep*(1.0-theta)*K.matrix())*sol); + } + + gsInfo << "Norm of the solution" << std::endl; + gsInfo << sol.norm() << std::endl; + } + /// Destructor + ~gsXBraid_app() {} + /// Creates instance from command line argument static inline gsXBraid_app create(const gsMpiComm& comm, int argc, @@ -50,7 +176,7 @@ class gsXBraid_app : public gsXBraid // Spatial discretisation parameters index_t numRefine = 2; index_t numElevate = 0; - + // Temporal discretisation parameters index_t numTime = 40; T tfinal = 0.1; @@ -118,10 +244,14 @@ class gsXBraid_app : public gsXBraid cmd.getValues(argc,argv); // Create instance - gsXBraid_app app(comm, 0.0, tfinal, numTime); + gsXBraid_app app(comm, 0.0, tfinal, numTime, numRefine, numElevate); - app.SetAbsTol(absTol); - app.SetRelTol(relTol); + if (absTol != 1e-10) + app.SetAbsTol(absTol); + else if (relTol != 1e-3) + app.SetRelTol(relTol); + else + app.SetAbsTol(absTol); app.SetCFactor(CFactor); app.SetMaxIter(maxIter); @@ -141,185 +271,168 @@ class gsXBraid_app : public gsXBraid if (refine) app.SetRefine(1); else app.SetRefine(0); if (sequential) app.SetSeqSoln(1); else app.SetSeqSoln(0); if (skip) app.SetSkip(1); else app.SetSkip(0); - - ///////////////////////////////////////////////////////////////////////////////////////////// - // Code for heat equation starts here // - ///////////////////////////////////////////////////////////////////////////////////////////// - - // Source function - gsConstantFunction<> f(1,2); - gsInfo<<"Source function is: "<< f << "\n"; - - // Define Geometry, must be a gsMultiPatch object - gsMultiPatch<> patches(*gsNurbsCreator<>::BSplineSquareDeg(2)); - patches.computeTopology(); - - // Boundary conditions - gsBoundaryConditions<> bcInfo; - gsConstantFunction<> g_N(1,2); // Neumann - gsConstantFunction<> g_D(0,2); // Dirichlet - bcInfo.addCondition(0, boundary::west, condition_type::neumann , &g_N); - 
bcInfo.addCondition(0, boundary::east, condition_type::dirichlet, &g_D); - bcInfo.addCondition(0, boundary::north, condition_type::dirichlet, &g_D); - bcInfo.addCondition(0, boundary::south, condition_type::dirichlet, &g_D); - - gsMultiBasis<> refine_bases( patches ); - - // Elevate and p-refine the basis to order k + numElevate - // where k is the highest degree in the bases - if ( numElevate > -1 ) - { - // Find maximum degree with respect to all the variables - int tmp = refine_bases.maxDegree(0); - for (short_t j = 1; j < patches.parDim(); ++j ) - if ( tmp < refine_bases.maxDegree(j) ) - tmp = refine_bases.maxDegree(j); - - // Elevate all degrees uniformly - tmp += numElevate; - refine_bases.setDegree(tmp); - } - - // h-refine the basis - for (int i = 0; i < numRefine; ++i) - refine_bases.uniformRefine(); - - // A Conjugate Gradient linear solver with a diagonal (Jacobi) preconditionner - gsSparseSolver<>::CGDiagonal solver; - - real_t theta = 0.0; - gsMatrix<> Sol; - real_t Dt = tfinal / numTime ; - - // Generate system matrix and load vector - gsInfo<<"Assembling mass and stiffness...\n"; - gsExprAssembler<> K(1,1); - gsExprAssembler<> M(1,1); + return app; + } + + /// Performs a single step of the parallel-in-time multigrid + braid_Int Step(braid_Vector u, + braid_Vector ustop, + braid_Vector fstop, + BraidStepStatus &pstatus) override + { + gsMatrix* _u = (gsMatrix*) u; + T tstart, tstop; - typedef gsExprAssembler<>::geometryMap geometryMap; - typedef gsExprAssembler<>::variable variable; - typedef gsExprAssembler<>::space space; - typedef gsExprAssembler<>::solution solution; - - K.setIntegrationElements(refine_bases); - M.setIntegrationElements(refine_bases); - gsExprEvaluator<> ev_K(K); - gsExprEvaluator<> ev_M(M); - - // Set the geometry map - geometryMap G_K = K.getMap(patches); - geometryMap G_M = M.getMap(patches); - - // Set the discretization space - space u_K = K.getSpace(refine_bases); - space u_M = M.getSpace(refine_bases); - 
u_K.setInterfaceCont(0); - u_M.setInterfaceCont(0); - u_K.addBc( bcInfo.get("Dirichlet") ); - u_M.addBc( bcInfo.get("Dirichlet") ); + // Get time step information + pstatus.GetTstartTstop(&tstart, &tstop); + T tstep(tstop - tstart); - // Set the source term - variable ff_K = K.getCoeff(f, G_K); - variable ff_M = M.getCoeff(f, G_M); + // Solve spatial problem + gsSparseSolver<>::CGDiagonal solver; + *_u = solver.compute(M.matrix() + + tstep*theta*K.matrix() + ).solve(tstep*K.rhs() + + (M.matrix()-tstep*(1.0-theta)*K.matrix())*(*_u)); + // no refinement + pstatus.SetRFactor(1); + return braid_Int(0); + } - K.initSystem(); - M.initSystem(); - K.assemble( igrad(u_K, G_K) * igrad(u_K, G_K).tr() * meas(G_K), u_K * ff_K * meas(G_K) ); - M.assemble( u_M * u_M.tr() * meas(G_M), u_M * ff_M * meas(G_M) ); + /// Clones a given vector + braid_Int Clone(braid_Vector u, + braid_Vector *v_ptr) override + { + gsMatrix* _u = (gsMatrix*) u; + gsMatrix* v = new gsMatrix(); + (*v) = (*_u); + *v_ptr = (braid_Vector) v; + return braid_Int(0); + } - // Enforce Neumann conditions to right-hand side - variable g_Neumann = K.getBdrFunction(); - K.assembleRhsBc(u_K * g_Neumann.val() * nv(G_K).norm(), bcInfo.neumannSides() ); + /// Initializes a vector + braid_Int Init(braid_Real t, + braid_Vector *u_ptr) override + { + gsMatrix* u = new gsMatrix(M.numDofs(), 1); + + if (t != tstart) { + // Intermediate solution + u->setZero(M.numDofs(), 1); + } else { + // Initial solution + u->setZero(M.numDofs(), 1); + } - gsSparseMatrix<> Stiffness_matrix = K.matrix(); - gsSparseMatrix<> Mass_matrix = M.matrix(); - gsMatrix<> Rhs = K.rhs(); + *u_ptr = (braid_Vector) u; + return braid_Int(0); + } - for ( int i = 1; i<=numTime; ++i) // for all timesteps - { - // Compute the system for the timestep i (rhs is assumed constant wrt time) - gsInfo<<"Solving timestep "<< i*Dt<<".\n"; - Sol = solver.compute(M.matrix()+Dt*theta*K.matrix()).solve(Dt*K.rhs()+(M.matrix()-Dt*(1-theta)*K.matrix())*Sol); - } + /// Frees a 
given vector + braid_Int Free(braid_Vector u) override + { + gsMatrix* _u = (gsMatrix*) u; + delete _u; + return braid_Int(0); + } - gsInfo << "Norm of the solution" << std::endl; - gsInfo << Sol.norm() << std::endl; - - return app; + /// Computes the sum of two given vectors + braid_Int Sum(braid_Real alpha, + braid_Vector x, + braid_Real beta, + braid_Vector y) override + { + gsMatrix* _x = (gsMatrix*) x; + gsMatrix* _y = (gsMatrix*) y; + *_y = (T)alpha * (*_x) + (T)beta * (*_y); + return braid_Int(0); } - - /// Destructor - ~gsXBraid_app() override - {} - int Step(braid_Vector u, - braid_Vector ustop, - braid_Vector fstop, - BraidStepStatus &pstatus) override + /// Computes the spatial norm of a given vector + braid_Int SpatialNorm(braid_Vector u, + braid_Real *norm_ptr) override { - gsSparseSolver<>::CGDiagonal solver; - Sol = solver.compute(Mass_matrix+Dt*theta*Stiffness_matrix).solve(Dt*Rhs+(Mass_matrix-Dt*(1-theta)*Stiffness_matrix)*Sol); + gsMatrix *_u = (gsMatrix*) u; + *norm_ptr = _u->norm(); + return braid_Int(0); } - int Clone(braid_Vector u, - braid_Vector *v_ptr) override - {} - - int Init(T t, - braid_Vector *u_ptr) override - {} - - int Free(braid_Vector u) override - {} - - int Sum(T alpha, - braid_Vector x, - T beta, - braid_Vector y) override - {} - - int SpatialNorm(braid_Vector u, - T *norm_ptr) override - {} - - int BufSize(index_t *size_ptr, - BraidBufferStatus &status) override - {} + braid_Int BufSize(braid_Int *size_ptr, + BraidBufferStatus &status) override + { + *size_ptr = sizeof(T)*(M.numDofs()+1); + return braid_Int(0); + } - int BufPack(braid_Vector u, - void *buffer, - BraidBufferStatus &status) override - {} + braid_Int BufPack(braid_Vector u, + void *buffer, + BraidBufferStatus &status) override + { + gsMatrix *_u = (gsMatrix*) u; + T* _buffer = (T*) buffer; + T* _data = _u->data(); + index_t size = _u->rows()*_u->cols(); + + _buffer[0] = size; + for (index_t idx = 0; idx < size; ++idx) + _buffer[idx+1] = _data[idx]; + + 
status.SetSize(sizeof(T)*(size+1)); + return braid_Int(0); + } - int BufUnpack(void *buffer, - braid_Vector *u_ptr, - BraidBufferStatus &status) override - {} + braid_Int BufUnpack(void *buffer, + braid_Vector *u_ptr, + BraidBufferStatus &status) override + { + T* _buffer = (T*) buffer; + index_t size = _buffer[0]; + gsMatrix* u = new gsMatrix(size,1); + T* _data = u->data(); + + for (index_t idx = 0; idx < size; ++idx) + _data[idx] = _buffer[idx+1]; + + *u_ptr = (braid_Vector) u; + return braid_Int(0); + } - int Access(braid_Vector u, - BraidAccessStatus &astatus) override - {} + braid_Int Access(braid_Vector u, + BraidAccessStatus &astatus) override + { + if(static_cast(astatus).done() && + static_cast(astatus).timeIndex() == + static_cast(astatus).times()) { + gsMatrix* _u = (gsMatrix*) u; + gsInfo << "Norm of the solution" << std::endl; + gsInfo << _u->norm() << std::endl; + } + return braid_Int(0); + } // Not needed in this example - int Residual(braid_Vector u, - braid_Vector r, - BraidStepStatus &pstatus) override + braid_Int Residual(braid_Vector u, + braid_Vector r, + BraidStepStatus &pstatus) override { - + return braid_Int(0); } // Not needed in this example - int Coarsen(braid_Vector fu, - braid_Vector *cu_ptr, - BraidCoarsenRefStatus &status) override - {} + braid_Int Coarsen(braid_Vector fu, + braid_Vector *cu_ptr, + BraidCoarsenRefStatus &status) override + { + return braid_Int(0); + } // Not needed in this example - int Refine(braid_Vector cu, - braid_Vector *fu_ptr, - BraidCoarsenRefStatus &status) override - {} + braid_Int Refine(braid_Vector cu, + braid_Vector *fu_ptr, + BraidCoarsenRefStatus &status) override + { + return braid_Int(0); + } }; } // ending namespace gismo @@ -328,17 +441,21 @@ class gsXBraid_app : public gsXBraid int main(int argc, char**argv) { +#ifdef GISMO_WITH_XBRAID + // Initialize the MPI environment and obtain the world communicator gsMpiComm comm = gsMpi::init(argc, argv).worldComm(); - -#ifdef GISMO_WITH_XBRAID // Set up 
app structure gsXBraid_app app = gsXBraid_app::create(comm, argc, argv); // Perform parallel-in-time multigrid app.solve(); - + +#else + + gsInfo << "\n"; + #endif return 0; From c4558f29deddbdc78de01cfdc844fa54ad4748c1 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Tue, 19 Jan 2021 14:54:15 +0100 Subject: [PATCH 018/174] Small improvements of sequantial heat equation --- examples/heatEquation_example2.cpp | 46 +++++++++++++++--------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/examples/heatEquation_example2.cpp b/examples/heatEquation_example2.cpp index d6742dba64..9044c8e41b 100644 --- a/examples/heatEquation_example2.cpp +++ b/examples/heatEquation_example2.cpp @@ -18,10 +18,11 @@ using namespace gismo; int main(int argc, char *argv[]) { - gsCmdLine cmd("Testing the heat equation."); + gsCmdLine cmd("Testing the heat equation."); + // Source function gsConstantFunction<> f(1,2); - gsInfo<<"Source function is: "<< f << "\n"; + gsInfo << "Source function is: " << f << "\n"; // Define Geometry, must be a gsMultiPatch object gsMultiPatch<> patches(*gsNurbsCreator<>::BSplineSquareDeg(2)); @@ -36,7 +37,7 @@ int main(int argc, char *argv[]) bcInfo.addCondition(0, boundary::north, condition_type::dirichlet, &g_D); bcInfo.addCondition(0, boundary::south, condition_type::dirichlet, &g_D); - gsMultiBasis<> refine_bases( patches ); + gsMultiBasis<> bases( patches ); // Number for h-refinement of the computational (trial/test) basis. 
int numRefine = 2; @@ -49,25 +50,21 @@ int main(int argc, char *argv[]) if ( numElevate > -1 ) { // Find maximum degree with respect to all the variables - int tmp = refine_bases.maxDegree(0); + int tmp = bases.maxDegree(0); for (short_t j = 1; j < patches.parDim(); ++j ) - if ( tmp < refine_bases.maxDegree(j) ) - tmp = refine_bases.maxDegree(j); + if ( tmp < bases.maxDegree(j) ) + tmp = bases.maxDegree(j); // Elevate all degrees uniformly tmp += numElevate; - refine_bases.setDegree(tmp); + bases.setDegree(tmp); } // h-refine the basis for (int i = 0; i < numRefine; ++i) - refine_bases.uniformRefine(); - - // A Conjugate Gradient linear solver with a diagonal (Jacobi) preconditionner - gsSparseSolver<>::CGDiagonal solver; + bases.uniformRefine(); real_t theta = 0.0; - gsMatrix<> Sol; real_t endTime = 0.1; int numSteps = 40; @@ -76,10 +73,8 @@ int main(int argc, char *argv[]) const std::string baseName("heat_eq_solution"); gsParaviewCollection collection(baseName); - std::string fileName; - // Generate system matrix and load vector - gsInfo<<"Assembling mass and stiffness...\n"; + gsInfo << "Assembling mass and stiffness...\n"; gsExprAssembler<> K(1,1); gsExprAssembler<> M(1,1); @@ -89,18 +84,16 @@ int main(int argc, char *argv[]) typedef gsExprAssembler<>::space space; typedef gsExprAssembler<>::solution solution; - K.setIntegrationElements(refine_bases); - M.setIntegrationElements(refine_bases); - gsExprEvaluator<> ev_K(K); - gsExprEvaluator<> ev_M(M); + K.setIntegrationElements(bases); + M.setIntegrationElements(bases); // Set the geometry map geometryMap G_K = K.getMap(patches); geometryMap G_M = M.getMap(patches); // Set the discretization space - space u_K = K.getSpace(refine_bases); - space u_M = M.getSpace(refine_bases); + space u_K = K.getSpace(bases); + space u_M = M.getSpace(bases); u_K.setInterfaceCont(0); u_M.setInterfaceCont(0); u_K.addBc( bcInfo.get("Dirichlet") ); @@ -118,12 +111,19 @@ int main(int argc, char *argv[]) // Enforce Neumann conditions to 
right-hand side variable g_Neumann = K.getBdrFunction(); K.assembleRhsBc(u_K * g_Neumann.val() * nv(G_K).norm(), bcInfo.neumannSides() ); + + // A Conjugate Gradient linear solver with a diagonal (Jacobi) preconditionner + gsSparseSolver<>::CGDiagonal solver; + gsMatrix<> Sol(M.numDofs(), 1); for ( int i = 1; i<=numSteps; ++i) // for all timesteps { // Compute the system for the timestep i (rhs is assumed constant wrt time) - gsInfo<<"Solving timestep "<< i*Dt<<".\n"; - Sol = solver.compute(M.matrix()+Dt*theta*K.matrix()).solve(Dt*K.rhs()+(M.matrix()-Dt*(1-theta)*K.matrix())*Sol); + gsInfo << "Solving timestep " << i*Dt << ".\n"; + Sol = solver.compute(M.matrix() + + Dt*theta*K.matrix() + ).solve(Dt*K.rhs() + + (M.matrix()-Dt*(1.0-theta)*K.matrix())*Sol); } gsInfo << "Norm of the solution" << std::endl; From 55f519d41222ab784ba3f62baff5c20cf7cf2be6 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Wed, 20 Jan 2021 12:55:55 +0100 Subject: [PATCH 019/174] Improvements of gsXBraid extension --- extensions/gsXBraid/CMakeLists.txt | 2 +- extensions/gsXBraid/gsXBraid.h | 304 +++++++++++++++++++++++------ extensions/gsXBraid/gsXBraid.hpp | 28 +++ extensions/gsXBraid/gsXBraid_.cpp | 4 +- 4 files changed, 271 insertions(+), 67 deletions(-) diff --git a/extensions/gsXBraid/CMakeLists.txt b/extensions/gsXBraid/CMakeLists.txt index 57244598bc..258810e2d0 100644 --- a/extensions/gsXBraid/CMakeLists.txt +++ b/extensions/gsXBraid/CMakeLists.txt @@ -25,7 +25,7 @@ find_package(XBRAID QUIET) if (NOT XBRAID_FOUND) # Set XBraid version - set(XBRAID_VER "v3.0.0") + set(XBRAID_VER "master") # Download XBraid sources at configure time include(gsFetch) diff --git a/extensions/gsXBraid/gsXBraid.h b/extensions/gsXBraid/gsXBraid.h index 511474cab5..f8bf85afc4 100644 --- a/extensions/gsXBraid/gsXBraid.h +++ b/extensions/gsXBraid/gsXBraid.h @@ -13,8 +13,7 @@ #pragma once -#include -#include +#include #if !defined(GISMO_WITH_MPI) #define braid_SEQUENTIAL 1 @@ -29,9 +28,47 @@ namespace gismo 
{ class gsXBraidStepStatus; class gsXBraidCoarsenRefStatus; class gsXBraidBufferStatus; + class gsXBraidObjectiveStatus; /** \brief Class defining the XBraid wrapper + + The gsXBraid class wraps the BraidApp class provided by the + XBraid project and adds a set of commodity functions. + + In order to implement an XBraid application the user has to + implement a derived class + + \code{.cpp} + template + class gsXBraid_app : public gsXBraid + { ... }; + \endcode + + and implement the following application-specific functions: + + \code{.cpp} + braid_Int Access(...) + braid_Int BufPack(...) + braid_Int BufSize(...) + braid_Int BufUnpack(...) + braid_Int Clone(...) + braid_Int Free(...) + braid_Int Init(...) + braid_Int Residual(...) + braid_Int SpatialNorm(...) + braid_Int Step(...) + \endcode + + which are declared as (pure) virtual functions in BraidApp. + + The generic implementation of the gsXBraid class leaves all of + these methods unimplemented. We also provide specializations for + gsXBraid< gsMatrix > and gsXBraid< std::vector< gsMatrix > + > which assume that the data type for storing the solution + (passed as braid_Vector) is of type gsMatrix and std::vector< + gsMatrix >, respectively. The latter can be used to pass a + hierarchy of matrices/vectors in a multi-level setup. 
*/ template @@ -47,65 +84,8 @@ namespace gismo { /// Destructor virtual ~gsXBraid(); - /// Performs one time step - virtual braid_Int Step(braid_Vector u, - braid_Vector ustop, - braid_Vector fstop, - BraidStepStatus &pstatus) = 0; - - /// Clones the given vectors - virtual braid_Int Clone(braid_Vector u, - braid_Vector *v_ptr) = 0 ; - - /// Initializes the given vector - virtual braid_Int Init(braid_Real t, - braid_Vector *u_ptr) = 0; - - /// Fianlizes the given vector - virtual braid_Int Free(braid_Vector u) = 0; - - /// Computes the weighted sum of two given vectors - virtual braid_Int Sum(braid_Real alpha, - braid_Vector x, - braid_Real beta, - braid_Vector y) = 0; - - /// Computes the spatial norm of the given vector - virtual braid_Int SpatialNorm(braid_Vector u, - braid_Real *norm_ptr) = 0; - - /// Computes the buffer size - virtual braid_Int BufSize(index_t *size_ptr, - BraidBufferStatus &status) = 0; - - /// Packes the given vector into the given buffer - virtual braid_Int BufPack(braid_Vector u, - void *buffer, - BraidBufferStatus &status) = 0; - - /// Unpacks the given buffer into the given vector - virtual braid_Int BufUnpack(void *buffer, - braid_Vector *u_ptr, - BraidBufferStatus &status) = 0; - - /// Accesses the given vector - virtual braid_Int Access(braid_Vector u, - BraidAccessStatus &astatus) = 0; - - /// Calculates the residual - virtual braid_Int Residual(braid_Vector u, - braid_Vector r, - BraidStepStatus &pstatus) = 0; - - /// Performs coarsening in time - virtual braid_Int Coarsen(braid_Vector fu, - braid_Vector *cu_ptr, - BraidCoarsenRefStatus &status) = 0; - - /// Performs refinement in time - virtual braid_Int Refine(braid_Vector cu, - braid_Vector *fu_ptr, - BraidCoarsenRefStatus &status) = 0; + /// Free + virtual braid_Int Free(braid_Vector u) { return braid_Int(0); } /// Runs the parallel-in-time multigrid solver void solve() { core.Drive(); } @@ -249,10 +229,10 @@ namespace gismo { } /// Returns the residual norm - braid_Real 
rnorm(braid_Int nrequest) { - braid_Real norm; - GetRNorms(&nrequest, &norm); - return norm; + braid_Real norm(braid_Int nrequest) { + braid_Real rnorm; + GetRNorms(&nrequest, &rnorm); + return rnorm; } /// Returns the total number of levels @@ -267,6 +247,200 @@ namespace gismo { BraidCore core; }; + + /** + \brief Specializations for gsXBraid< gsMatrix > + */ + template + class gsXBraid< gsMatrix > : public gsXBraid + { + public: + /// Constructor + gsXBraid(const gsMpiComm& comm, + const braid_Real tstart, + const braid_Real tstop, + braid_Int ntime); + + /// Destructor + virtual ~gsXBraid(); + + /// Clones a given vector + virtual braid_Int Clone(braid_Vector u, + braid_Vector *v_ptr) + { + gsMatrix* _u = (gsMatrix*) u; + gsMatrix* v = new gsMatrix(); + (*v) = (*_u); + *v_ptr = (braid_Vector) v; + return braid_Int(0); + } + + /// Frees a given vector + virtual braid_Int Free(braid_Vector u) + { + gsMatrix* _u = (gsMatrix*) u; + delete _u; + return braid_Int(0); + } + + /// Computes the sum of two given vectors + virtual braid_Int Sum(braid_Real alpha, + braid_Vector x, + braid_Real beta, + braid_Vector y) + { + gsMatrix* _x = (gsMatrix*) x; + gsMatrix* _y = (gsMatrix*) y; + *_y = (T)alpha * (*_x) + (T)beta * (*_y); + return braid_Int(0); + } + + /// Computes the spatial norm of a given vector + virtual braid_Int SpatialNorm(braid_Vector u, + braid_Real *norm_ptr) + { + gsMatrix *_u = (gsMatrix*) u; + *norm_ptr = _u->norm(); + return braid_Int(0); + } + + /// Packs the given vector into the MPI communication buffer + virtual braid_Int BufPack(braid_Vector u, + void *buffer, + BraidBufferStatus &status) + { + gsMatrix *_u = (gsMatrix*) u; + T* _buffer = (T*) buffer; + T* _data = _u->data(); + index_t size = _u->rows()*_u->cols(); + + _buffer[0] = _u->rows(); + _buffer[1] = _u->cols(); + for (index_t idx = 0; idx < size; ++idx) + _buffer[idx+2] = _data[idx]; + + status.SetSize(sizeof(T)*(size+2)); + return braid_Int(0); + } + + /// Unpacks a vector from the MPI 
communication buffer + virtual braid_Int BufUnpack(void *buffer, + braid_Vector *u_ptr, + BraidBufferStatus &status) + { + T* _buffer = (T*) buffer; + index_t rows = _buffer[0]; + index_t cols = _buffer[1]; + gsMatrix* u = new gsMatrix(rows,cols); + T* _data = u->data(); + + for (index_t idx = 0; idx < rows*cols; ++idx) + _data[idx] = _buffer[idx+2]; + + *u_ptr = (braid_Vector) u; + return braid_Int(0); + } + }; + + + /** + \brief Specializations for gsXBraid< std::vector< gsMatrix > > + */ + template + class gsXBraid< std::vector< gsMatrix > > : public gsXBraid + { + public: + /// Constructor + gsXBraid(const gsMpiComm& comm, + const braid_Real tstart, + const braid_Real tstop, + braid_Int ntime); + + /// Destructor + virtual ~gsXBraid(); + + /// Clones a given vector + virtual braid_Int Clone(braid_Vector u, + braid_Vector *v_ptr) + { + std::vector< gsMatrix >* _u = (std::vector< gsMatrix >*) u; + std::vector< gsMatrix >* v = new std::vector< gsMatrix >(); + + for (typename std::vector< gsMatrix >::const_iterator it = _u->cbegin(); + it != _u->cend(); ++it) + v->push_back( *it ); + *v_ptr = (braid_Vector) v; + return braid_Int(0); + } + + /// Frees a given vector + virtual braid_Int Free(braid_Vector u) + { + gsMatrix* _u = (gsMatrix*) u; + delete _u; + return braid_Int(0); + } + + /// Computes the sum of two given vectors + virtual braid_Int Sum(braid_Real alpha, + braid_Vector x, + braid_Real beta, + braid_Vector y) + { + gsMatrix* _x = (gsMatrix*) x; + gsMatrix* _y = (gsMatrix*) y; + *_y = (T)alpha * (*_x) + (T)beta * (*_y); + return braid_Int(0); + } + + /// Computes the spatial norm of a given vector + virtual braid_Int SpatialNorm(braid_Vector u, + braid_Real *norm_ptr) + { + gsMatrix *_u = (gsMatrix*) u; + *norm_ptr = _u->norm(); + return braid_Int(0); + } + + /// Packs the given vector into the MPI communication buffer + virtual braid_Int BufPack(braid_Vector u, + void *buffer, + BraidBufferStatus &status) + { + gsMatrix *_u = (gsMatrix*) u; + T* _buffer 
= (T*) buffer; + T* _data = _u->data(); + index_t size = _u->rows()*_u->cols(); + + _buffer[0] = _u->rows(); + _buffer[1] = _u->cols(); + for (index_t idx = 0; idx < size; ++idx) + _buffer[idx+2] = _data[idx]; + + status.SetSize(sizeof(T)*(size+2)); + return braid_Int(0); + } + + /// Unpacks a vector from the MPI communication buffer + virtual braid_Int BufUnpack(void *buffer, + braid_Vector *u_ptr, + BraidBufferStatus &status) + { + T* _buffer = (T*) buffer; + index_t rows = _buffer[0]; + index_t cols = _buffer[1]; + gsMatrix* u = new gsMatrix(rows,cols); + T* _data = u->data(); + + for (index_t idx = 0; idx < rows*cols; ++idx) + _data[idx] = _buffer[idx+2]; + + *u_ptr = (braid_Vector) u; + return braid_Int(0); + } + }; + + /** \brief Class defining the XBraid access status wrapper diff --git a/extensions/gsXBraid/gsXBraid.hpp b/extensions/gsXBraid/gsXBraid.hpp index 2d0ff0dccf..f8e91a2fe0 100644 --- a/extensions/gsXBraid/gsXBraid.hpp +++ b/extensions/gsXBraid/gsXBraid.hpp @@ -31,5 +31,33 @@ namespace gismo { template gsXBraid::~gsXBraid() {} + + // Constructor + template + gsXBraid< gsMatrix >::gsXBraid(const gsMpiComm& comm, + const braid_Real tstart, + const braid_Real tstop, + braid_Int ntime) + : gsXBraid(comm, tstart, tstop, ntime) + {} + + // Destructor + template + gsXBraid< gsMatrix >::~gsXBraid() + {} + + // Constructor + template + gsXBraid< std::vector< gsMatrix > >::gsXBraid(const gsMpiComm& comm, + const braid_Real tstart, + const braid_Real tstop, + braid_Int ntime) + : gsXBraid(comm, tstart, tstop, ntime) + {} + + // Destructor + template + gsXBraid< std::vector< gsMatrix > >::~gsXBraid() + {} }// namespace gismo diff --git a/extensions/gsXBraid/gsXBraid_.cpp b/extensions/gsXBraid/gsXBraid_.cpp index 06a4edfd56..9df8a35329 100644 --- a/extensions/gsXBraid/gsXBraid_.cpp +++ b/extensions/gsXBraid/gsXBraid_.cpp @@ -6,6 +6,8 @@ namespace gismo { -CLASS_TEMPLATE_INST gsXBraid; + CLASS_TEMPLATE_INST gsXBraid; + CLASS_TEMPLATE_INST gsXBraid< gsMatrix >; + 
CLASS_TEMPLATE_INST gsXBraid< std::vector< gsMatrix > >; } From df032d6435590cb1536e2d0faff852a463acdddc Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Wed, 20 Jan 2021 12:56:12 +0100 Subject: [PATCH 020/174] Improvements of XBraid application --- examples/xbraid_heatEquation_example.cpp | 141 +++++------------------ 1 file changed, 27 insertions(+), 114 deletions(-) diff --git a/examples/xbraid_heatEquation_example.cpp b/examples/xbraid_heatEquation_example.cpp index f5f17e13c6..cb84c46f70 100644 --- a/examples/xbraid_heatEquation_example.cpp +++ b/examples/xbraid_heatEquation_example.cpp @@ -24,7 +24,7 @@ namespace gismo { \brief Derived class implementing the XBraid wrapper for the heat equation */ template -class gsXBraid_app : public gsXBraid +class gsXBraid_app : public gsXBraid > { private: // Spatial discretisation parameters @@ -62,7 +62,7 @@ class gsXBraid_app : public gsXBraid index_t numTime, index_t numRefine, index_t numElevate) - : gsXBraid::gsXBraid(comm, tstart, tstop, (int)numTime), + : gsXBraid >::gsXBraid(comm, tstart, tstop, (int)numTime), numRefine(numRefine), numElevate(numElevate), numTime(numTime), @@ -163,7 +163,7 @@ class gsXBraid_app : public gsXBraid } /// Destructor - ~gsXBraid_app() {} + virtual ~gsXBraid_app() {} /// Creates instance from command line argument static inline gsXBraid_app create(const gsMpiComm& comm, @@ -274,12 +274,30 @@ class gsXBraid_app : public gsXBraid return app; } + + /// Initializes a vector + braid_Int Init(braid_Real t, + braid_Vector *u_ptr) + { + gsMatrix* u = new gsMatrix(M.numDofs(), 1); + + if (t != tstart) { + // Intermediate solution + u->setZero(M.numDofs(), 1); + } else { + // Initial solution + u->setZero(M.numDofs(), 1); + } + + *u_ptr = (braid_Vector) u; + return braid_Int(0); + } /// Performs a single step of the parallel-in-time multigrid braid_Int Step(braid_Vector u, braid_Vector ustop, braid_Vector fstop, - BraidStepStatus &pstatus) override + BraidStepStatus &pstatus) { gsMatrix* _u 
= (gsMatrix*) u; T tstart, tstop; @@ -299,106 +317,17 @@ class gsXBraid_app : public gsXBraid return braid_Int(0); } - /// Clones a given vector - braid_Int Clone(braid_Vector u, - braid_Vector *v_ptr) override - { - gsMatrix* _u = (gsMatrix*) u; - gsMatrix* v = new gsMatrix(); - (*v) = (*_u); - *v_ptr = (braid_Vector) v; - return braid_Int(0); - } - - /// Initializes a vector - braid_Int Init(braid_Real t, - braid_Vector *u_ptr) override - { - gsMatrix* u = new gsMatrix(M.numDofs(), 1); - - if (t != tstart) { - // Intermediate solution - u->setZero(M.numDofs(), 1); - } else { - // Initial solution - u->setZero(M.numDofs(), 1); - } - - *u_ptr = (braid_Vector) u; - return braid_Int(0); - } - - /// Frees a given vector - braid_Int Free(braid_Vector u) override - { - gsMatrix* _u = (gsMatrix*) u; - delete _u; - return braid_Int(0); - } - - /// Computes the sum of two given vectors - braid_Int Sum(braid_Real alpha, - braid_Vector x, - braid_Real beta, - braid_Vector y) override - { - gsMatrix* _x = (gsMatrix*) x; - gsMatrix* _y = (gsMatrix*) y; - *_y = (T)alpha * (*_x) + (T)beta * (*_y); - return braid_Int(0); - } - - /// Computes the spatial norm of a given vector - braid_Int SpatialNorm(braid_Vector u, - braid_Real *norm_ptr) override - { - gsMatrix *_u = (gsMatrix*) u; - *norm_ptr = _u->norm(); - return braid_Int(0); - } - + /// Sets the size of the MPI communication buffer braid_Int BufSize(braid_Int *size_ptr, - BraidBufferStatus &status) override + BraidBufferStatus &status) { *size_ptr = sizeof(T)*(M.numDofs()+1); return braid_Int(0); } - - braid_Int BufPack(braid_Vector u, - void *buffer, - BraidBufferStatus &status) override - { - gsMatrix *_u = (gsMatrix*) u; - T* _buffer = (T*) buffer; - T* _data = _u->data(); - index_t size = _u->rows()*_u->cols(); - - _buffer[0] = size; - for (index_t idx = 0; idx < size; ++idx) - _buffer[idx+1] = _data[idx]; - - status.SetSize(sizeof(T)*(size+1)); - return braid_Int(0); - } - - braid_Int BufUnpack(void *buffer, - 
braid_Vector *u_ptr, - BraidBufferStatus &status) override - { - T* _buffer = (T*) buffer; - index_t size = _buffer[0]; - gsMatrix* u = new gsMatrix(size,1); - T* _data = u->data(); - for (index_t idx = 0; idx < size; ++idx) - _data[idx] = _buffer[idx+1]; - - *u_ptr = (braid_Vector) u; - return braid_Int(0); - } - + /// Handles braid_Int Access(braid_Vector u, - BraidAccessStatus &astatus) override + BraidAccessStatus &astatus) { if(static_cast(astatus).done() && static_cast(astatus).timeIndex() == @@ -413,23 +342,7 @@ class gsXBraid_app : public gsXBraid // Not needed in this example braid_Int Residual(braid_Vector u, braid_Vector r, - BraidStepStatus &pstatus) override - { - return braid_Int(0); - } - - // Not needed in this example - braid_Int Coarsen(braid_Vector fu, - braid_Vector *cu_ptr, - BraidCoarsenRefStatus &status) override - { - return braid_Int(0); - } - - // Not needed in this example - braid_Int Refine(braid_Vector cu, - braid_Vector *fu_ptr, - BraidCoarsenRefStatus &status) override + BraidStepStatus &pstatus) { return braid_Int(0); } From 6998f2b2fffc0eb29a2b1097bc6637ecac74c6ea Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Wed, 20 Jan 2021 20:08:50 +0100 Subject: [PATCH 021/174] Improved gsXBraid extension --- extensions/gsXBraid/gsXBraid.h | 417 +++++++++++++++++++++++++----- extensions/gsXBraid/gsXBraid.hpp | 16 +- extensions/gsXBraid/gsXBraid_.cpp | 2 +- 3 files changed, 359 insertions(+), 76 deletions(-) diff --git a/extensions/gsXBraid/gsXBraid.h b/extensions/gsXBraid/gsXBraid.h index f8bf85afc4..271fc705bf 100644 --- a/extensions/gsXBraid/gsXBraid.h +++ b/extensions/gsXBraid/gsXBraid.h @@ -64,11 +64,9 @@ namespace gismo { The generic implementation of the gsXBraid class leaves all of these methods unimplemented. 
We also provide specializations for - gsXBraid< gsMatrix > and gsXBraid< std::vector< gsMatrix > - > which assume that the data type for storing the solution - (passed as braid_Vector) is of type gsMatrix and std::vector< - gsMatrix >, respectively. The latter can be used to pass a - hierarchy of matrices/vectors in a multi-level setup. + gsXBraid> and gsXBraid> which assume that + the data type for storing the solution (passed as braid_Vector) + is of type gsMatrix and gsVector, respectively. */ template @@ -210,6 +208,24 @@ namespace gismo { /// Sets user-defined sync routine. void SetSync() { core.SetSync(); } + + /// Sets the default print file + void SetDefaultPrintFile() { core.SetDefaultPrintFile(); } + + /// Sets the file input/output level + void SetFileIOLevel(braid_Int io_level) { core.SetFileIOLevel(io_level); } + + /// Sets the coarse-relaxation weight + void SetCRelaxWt(braid_Int level, braid_Real Cwt) { core.SetCRelaxWt(level, Cwt); } + + /// Sets the time cutoff + void SetTPointsCutoff(braid_Int tpoints_cutoff) { core.SetTPointsCutoff(tpoints_cutoff); } + + /// Sets callback function for residual numer calculation + void SetFullRNormRes(braid_PtFcnResidual residual) { core.SetFullRNormRes(residual); } + + /// Sets callback function for time grid + void SetTimeGrid(braid_PtFcnTimeGrid tgrid) { core.SetTimeGrid(tgrid); } public: /// Gets the number of iterations (XBraid style) @@ -221,6 +237,9 @@ namespace gismo { /// Gets the total number of levels (XBraid style) void GetNLevels(braid_Int *nlevels_ptr) { core.GetNLevels(nlevels_ptr); } + /// Gets the MPI process ID + void GetMyID(braid_Int *myid_ptr) { core.GetMyID(myid_ptr); } + /// Returns the number of iterations braid_Int iterations() { braid_Int niter; @@ -241,6 +260,13 @@ namespace gismo { GetNLevels(&nlevels); return nlevels; } + + /// Returns the MPI process ID + braid_Int id() { + braid_Int myid; + GetMyID(&myid); + return myid; + } protected: /// Braid Core object @@ -249,7 +275,7 @@ 
namespace gismo { /** - \brief Specializations for gsXBraid< gsMatrix > + \brief Specializations for gsXBraid<gsMatrix<T>> */ template class gsXBraid< gsMatrix > : public gsXBraid @@ -268,9 +294,9 @@ namespace gismo { virtual braid_Int Clone(braid_Vector u, braid_Vector *v_ptr) { - gsMatrix* _u = (gsMatrix*) u; - gsMatrix* v = new gsMatrix(); - (*v) = (*_u); + gsMatrix* u_ptr = (gsMatrix*) u; + gsMatrix* v = new gsMatrix(); + (*v) = (*u_ptr); *v_ptr = (braid_Vector) v; return braid_Int(0); } @@ -278,8 +304,8 @@ namespace gismo { /// Frees a given vector virtual braid_Int Free(braid_Vector u) { - gsMatrix* _u = (gsMatrix*) u; - delete _u; + gsMatrix* u_ptr = (gsMatrix*) u; + delete u_ptr; return braid_Int(0); } @@ -289,9 +315,9 @@ namespace gismo { braid_Real beta, braid_Vector y) { - gsMatrix* _x = (gsMatrix*) x; - gsMatrix* _y = (gsMatrix*) y; - *_y = (T)alpha * (*_x) + (T)beta * (*_y); + gsMatrix* x_ptr = (gsMatrix*) x; + gsMatrix* y_ptr = (gsMatrix*) y; + *y_ptr = (T)alpha * (*x_ptr) + (T)beta * (*y_ptr); return braid_Int(0); } @@ -299,8 +325,8 @@ namespace gismo { virtual braid_Int SpatialNorm(braid_Vector u, braid_Real *norm_ptr) { - gsMatrix *_u = (gsMatrix*) u; - *norm_ptr = _u->norm(); + gsMatrix *u_ptr = (gsMatrix*) u; + *norm_ptr = u_ptr->norm(); return braid_Int(0); } @@ -309,15 +335,15 @@ namespace gismo { void *buffer, BraidBufferStatus &status) { - gsMatrix *_u = (gsMatrix*) u; - T* _buffer = (T*) buffer; - T* _data = _u->data(); - index_t size = _u->rows()*_u->cols(); + gsMatrix *u_ptr = (gsMatrix*) u; + T* buffer_ptr = (T*) buffer; + T* data_ptr = u_ptr->data(); + index_t size = u_ptr->rows()*u_ptr->cols(); - _buffer[0] = _u->rows(); - _buffer[1] = _u->cols(); + buffer_ptr[0] = u_ptr->rows(); + buffer_ptr[1] = u_ptr->cols(); for (index_t idx = 0; idx < size; ++idx) - _buffer[idx+2] = _data[idx]; + buffer_ptr[idx+2] = data_ptr[idx]; status.SetSize(sizeof(T)*(size+2)); return braid_Int(0); @@ -328,26 +354,25 @@ namespace gismo { braid_Vector *u_ptr, 
BraidBufferStatus &status) { - T* _buffer = (T*) buffer; - index_t rows = _buffer[0]; - index_t cols = _buffer[1]; + T* buffer_ptr = (T*) buffer; + index_t rows = buffer_ptr[0]; + index_t cols = buffer_ptr[1]; gsMatrix* u = new gsMatrix(rows,cols); - T* _data = u->data(); + T* data_ptr = u->data(); for (index_t idx = 0; idx < rows*cols; ++idx) - _data[idx] = _buffer[idx+2]; + data_ptr[idx] = buffer_ptr[idx+2]; *u_ptr = (braid_Vector) u; return braid_Int(0); } }; - /** - \brief Specializations for gsXBraid< std::vector< gsMatrix > > + \brief Specializations for gsXBraid<gsVector<T>> */ template - class gsXBraid< std::vector< gsMatrix > > : public gsXBraid + class gsXBraid< gsVector > : public gsXBraid { public: /// Constructor @@ -363,12 +388,9 @@ namespace gismo { virtual braid_Int Clone(braid_Vector u, braid_Vector *v_ptr) { - std::vector< gsMatrix >* _u = (std::vector< gsMatrix >*) u; - std::vector< gsMatrix >* v = new std::vector< gsMatrix >(); - - for (typename std::vector< gsMatrix >::const_iterator it = _u->cbegin(); - it != _u->cend(); ++it) - v->push_back( *it ); + gsVector* u_ptr = (gsVector*) u; + gsVector* v = new gsVector(); + (*v) = (*u_ptr); *v_ptr = (braid_Vector) v; return braid_Int(0); } @@ -376,8 +398,8 @@ namespace gismo { /// Frees a given vector virtual braid_Int Free(braid_Vector u) { - gsMatrix* _u = (gsMatrix*) u; - delete _u; + gsVector* u_ptr = (gsVector*) u; + delete u_ptr; return braid_Int(0); } @@ -387,9 +409,9 @@ namespace gismo { braid_Real beta, braid_Vector y) { - gsMatrix* _x = (gsMatrix*) x; - gsMatrix* _y = (gsMatrix*) y; - *_y = (T)alpha * (*_x) + (T)beta * (*_y); + gsVector* x_ptr = (gsVector*) x; + gsVector* y_ptr = (gsVector*) y; + *y_ptr = (T)alpha * (*x_ptr) + (T)beta * (*y_ptr); return braid_Int(0); } @@ -397,8 +419,8 @@ namespace gismo { virtual braid_Int SpatialNorm(braid_Vector u, braid_Real *norm_ptr) { - gsMatrix *_u = (gsMatrix*) u; - *norm_ptr = _u->norm(); + gsVector *u_ptr = (gsVector*) u; + *norm_ptr = u_ptr->norm(); return 
braid_Int(0); } @@ -407,17 +429,16 @@ namespace gismo { void *buffer, BraidBufferStatus &status) { - gsMatrix *_u = (gsMatrix*) u; - T* _buffer = (T*) buffer; - T* _data = _u->data(); - index_t size = _u->rows()*_u->cols(); + gsVector *u_ptr = (gsVector*) u; + T* buffer_ptr = (T*) buffer; + T* data_ptr = u_ptr->data(); + index_t size = u_ptr->size(); - _buffer[0] = _u->rows(); - _buffer[1] = _u->cols(); + buffer_ptr[0] = u_ptr->size(); for (index_t idx = 0; idx < size; ++idx) - _buffer[idx+2] = _data[idx]; + buffer_ptr[idx+1] = data_ptr[idx]; - status.SetSize(sizeof(T)*(size+2)); + status.SetSize(sizeof(T)*(size+1)); return braid_Int(0); } @@ -426,21 +447,19 @@ namespace gismo { braid_Vector *u_ptr, BraidBufferStatus &status) { - T* _buffer = (T*) buffer; - index_t rows = _buffer[0]; - index_t cols = _buffer[1]; - gsMatrix* u = new gsMatrix(rows,cols); - T* _data = u->data(); + T* buffer_ptr = (T*) buffer; + index_t size = buffer_ptr[0]; + gsVector* u = new gsVector(size); + T* data_ptr = u->data(); - for (index_t idx = 0; idx < rows*cols; ++idx) - _data[idx] = _buffer[idx+2]; + for (index_t idx = 0; idx < size; ++idx) + data_ptr[idx] = buffer_ptr[idx+1]; *u_ptr = (braid_Vector) u; return braid_Int(0); } }; - /** \brief Class defining the XBraid access status wrapper @@ -478,6 +497,13 @@ namespace gismo { return nref; } + /// Returns the current time instance + braid_Real time() { + braid_Real t; + GetT(&t); + return t; + } + /// Returns the total number of time instances braid_Int times() { braid_Int ntpoints; @@ -498,13 +524,6 @@ namespace gismo { GetCallingFunction(&callingfcn); return callingfcn; } - - /// Returns the current time instance - braid_Real time() { - braid_Real t; - GetT(&t); - return t; - } /// Returns the index of the time instance braid_Int timeIndex() { @@ -607,6 +626,270 @@ namespace gismo { return numerrorest; } }; + + /** + \brief Class defining the XBraid step status wrapper + + The wrapper provides all functionality of the BraidStepStatus 
+ class plus some functions that return the information by value + */ + class gsXBraidStepStatus : public BraidStepStatus + { + public: + /// Returns the number of iterations + braid_Int iterations() { + braid_Int iter; + GetIter(&iter); + return iter; + } + + /// Returns the current multigrid level + braid_Int level() { + braid_Int level; + GetLevel(&level); + return level; + } + + /// Returns the total number of multigrid levels + braid_Int levels() { + braid_Int nlevels; + GetNLevels(&nlevels); + return nlevels; + } + + /// Returns the total number of refinements + braid_Int refines() { + braid_Int nref; + GetNRefine(&nref); + return nref; + } + + /// Returns the current time instance + braid_Real time() { + braid_Real t; + GetT(&t); + return t; + } + + /// Returns the total number of time instances + braid_Int times() { + braid_Int ntpoints; + GetNTPoints(&ntpoints); + return ntpoints; + } + + /// Returns the end of the time interval + braid_Real timeStop() { + braid_Real t; + GetTstop(&t); + return t; + } + + /// Returns the time interval + std::pair timeInterval() { + std::pair t; + GetTstartTstop(&t.first, &t.second); + return t; + } + + /// Returns the index of the time instance + braid_Int timeIndex() { + braid_Int tindex; + GetTIndex(&tindex); + return tindex; + } + + /// Returns the tolerance + braid_Real tol() { + braid_Real t; + GetTol(&t); + return t; + } + + /// Returns the old tolerance for the fine-grid solver + braid_Real tolFine() { + braid_Real t; + GetOldFineTolx(&t); + return t; + } + + /// Returns the estimated error + braid_Real error() { + braid_Real errorest; + GetSingleErrorEstStep(&errorest); + return errorest; + } + + /// Returns the spatial accuracy + braid_Real accuracy(braid_Real loose_tol, braid_Real tight_tol) { + braid_Real tol; + GetSpatialAccuracy(loose_tol, tight_tol, &tol); + return tol; + } + }; + + /** + \brief Class defining the XBraid coarsen and refinement status wrapper + + The wrapper provides all functionality of the 
BraidCoarsenRefStatus + class plus some functions that return the information by value + */ + class gsXBraidCoarsenRefStatus : public BraidCoarsenRefStatus + { + public: + /// Returns the number of iterations + braid_Int iterations() { + braid_Int iter; + GetIter(&iter); + return iter; + } + + /// Returns the current multigrid level + braid_Int level() { + braid_Int level; + GetLevel(&level); + return level; + } + + /// Returns the total number of multigrid levels + braid_Int levels() { + braid_Int nlevels; + GetNLevels(&nlevels); + return nlevels; + } + + /// Returns the total number of refinements + braid_Int refines() { + braid_Int nref; + GetNRefine(&nref); + return nref; + } + + /// Returns the current time instance + braid_Real time() { + braid_Real t; + GetT(&t); + return t; + } + + /// Returns the total number of time instances + braid_Int times() { + braid_Int ntpoints; + GetNTPoints(&ntpoints); + return ntpoints; + } + + /// Returns the index of the time instance + braid_Int timeIndex() { + braid_Int tindex; + GetTIndex(&tindex); + return tindex; + } + + /// Returns the end of the fine time interval + braid_Real ftimeStop() { + braid_Real t; + GetFTstop(&t); + return t; + } + + /// Returns the start of the fine time interval + braid_Real ftimeStart() { + braid_Real t; + GetFTprior(&t); + return t; + } + + /// Returns the end of the coarse time interval + braid_Real ctimeStop() { + braid_Real t; + GetCTstop(&t); + return t; + } + + /// Returns the start of the coarse time interval + braid_Real ctimeStart() { + braid_Real t; + GetCTprior(&t); + return t; + } + }; + + /** + \brief Class defining the XBraid buffer status wrapper + + The wrapper provides all functionality of the BraidBufferStatus + class plus some functions that return the information by value + */ + class gsXBraidBufferStatus : public BraidBufferStatus + { + public: + /// Returns the message type + braid_Int type() { + braid_Int msg; + GetMessageType(&msg); + return msg; + } + }; + + /** + 
\brief Class defining the XBraid objective status wrapper + + The wrapper provides all functionality of the BraidObjectiveStatus + class plus some functions that return the information by value + */ + class gsXBraidObjectiveStatus : public BraidObjectiveStatus + { + public: + /// Returns the number of iterations + braid_Int iterations() { + braid_Int iter; + GetIter(&iter); + return iter; + } + + /// Returns the current multigrid level + braid_Int level() { + braid_Int level; + GetLevel(&level); + return level; + } + + /// Returns the total number of multigrid levels + braid_Int levels() { + braid_Int nlevels; + GetNLevels(&nlevels); + return nlevels; + } + + /// Returns the total number of refinements + braid_Int refines() { + braid_Int nref; + GetNRefine(&nref); + return nref; + } + + /// Returns the current time instance + braid_Real time() { + braid_Real t; + GetT(&t); + return t; + } + + /// Returns the total number of time instances + braid_Int times() { + braid_Int ntpoints; + GetNTPoints(&ntpoints); + return ntpoints; + } + + /// Returns the index of the time instance + braid_Int timeIndex() { + braid_Int tindex; + GetTIndex(&tindex); + return tindex; + } + }; }// namespace gismo diff --git a/extensions/gsXBraid/gsXBraid.hpp b/extensions/gsXBraid/gsXBraid.hpp index f8e91a2fe0..8e806b5f6f 100644 --- a/extensions/gsXBraid/gsXBraid.hpp +++ b/extensions/gsXBraid/gsXBraid.hpp @@ -45,19 +45,19 @@ namespace gismo { template gsXBraid< gsMatrix >::~gsXBraid() {} - - // Constructor + + // Constructor template - gsXBraid< std::vector< gsMatrix > >::gsXBraid(const gsMpiComm& comm, - const braid_Real tstart, - const braid_Real tstop, - braid_Int ntime) + gsXBraid< gsVector >::gsXBraid(const gsMpiComm& comm, + const braid_Real tstart, + const braid_Real tstop, + braid_Int ntime) : gsXBraid(comm, tstart, tstop, ntime) {} // Destructor template - gsXBraid< std::vector< gsMatrix > >::~gsXBraid() + gsXBraid< gsVector >::~gsXBraid() {} - + }// namespace gismo diff --git 
a/extensions/gsXBraid/gsXBraid_.cpp b/extensions/gsXBraid/gsXBraid_.cpp index 9df8a35329..97c0d37495 100644 --- a/extensions/gsXBraid/gsXBraid_.cpp +++ b/extensions/gsXBraid/gsXBraid_.cpp @@ -8,6 +8,6 @@ namespace gismo CLASS_TEMPLATE_INST gsXBraid; CLASS_TEMPLATE_INST gsXBraid< gsMatrix >; - CLASS_TEMPLATE_INST gsXBraid< std::vector< gsMatrix > >; + CLASS_TEMPLATE_INST gsXBraid< gsVector >; } From c01161f95f8089706e69d47920e87249b9cc7aa2 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Wed, 20 Jan 2021 20:09:04 +0100 Subject: [PATCH 022/174] Fixed minor bugs in XBraid example --- examples/xbraid_heatEquation_example.cpp | 71 +++++++++++------------- 1 file changed, 33 insertions(+), 38 deletions(-) diff --git a/examples/xbraid_heatEquation_example.cpp b/examples/xbraid_heatEquation_example.cpp index cb84c46f70..03ddce989d 100644 --- a/examples/xbraid_heatEquation_example.cpp +++ b/examples/xbraid_heatEquation_example.cpp @@ -24,7 +24,7 @@ namespace gismo { \brief Derived class implementing the XBraid wrapper for the heat equation */ template -class gsXBraid_app : public gsXBraid > +class gsXBraid_app : public gsXBraid< gsVector > { private: // Spatial discretisation parameters @@ -47,7 +47,7 @@ class gsXBraid_app : public gsXBraid > gsConstantFunction f; // Solution - gsMatrix sol; + gsVector sol; typedef typename gsExprAssembler::geometryMap geometryMap; typedef typename gsExprAssembler::variable variable; @@ -62,7 +62,7 @@ class gsXBraid_app : public gsXBraid > index_t numTime, index_t numRefine, index_t numElevate) - : gsXBraid >::gsXBraid(comm, tstart, tstop, (int)numTime), + : gsXBraid< gsVector >::gsXBraid(comm, tstart, tstop, (int)numTime), numRefine(numRefine), numElevate(numElevate), numTime(numTime), @@ -78,11 +78,8 @@ class gsXBraid_app : public gsXBraid > ///////////////////////////////////////////////////////////////////////////////////////////// // Code for heat equation starts here // 
///////////////////////////////////////////////////////////////////////////////////////////// - - // Source function - gsInfo << "Source function is: "<< f << "\n"; - // Define Geometry, must be a gsMultiPatch object + // Define geometry, must be a gsMultiPatch object patches.computeTopology(); // Boundary conditions @@ -110,9 +107,6 @@ class gsXBraid_app : public gsXBraid > for (int i = 0; i < numRefine; ++i) bases.uniformRefine(); - // Generate system matrix and load vector - gsInfo << "Assembling mass and stiffness...\n"; - // Set the basis K.setIntegrationElements(bases); M.setIntegrationElements(bases); @@ -145,21 +139,23 @@ class gsXBraid_app : public gsXBraid > variable g_Neumann = K.getBdrFunction(); K.assembleRhsBc(u_K * g_Neumann.val() * nv(G_K).norm(), bcInfo.neumannSides() ); - gsSparseSolver<>::CGDiagonal solver; - sol.setZero(M.numDofs(), 1); - - for ( int i = 1; i<=numTime; ++i) // for all timesteps - { + if (this->id() == 0) { + gsStopwatch clock; + clock.restart(); + + gsSparseSolver<>::CGDiagonal solver; + sol.setZero(M.numDofs()); + + for ( int i = 1; i<=numTime; ++i) // for all timesteps // Compute the system for the timestep i (rhs is assumed constant wrt time) - gsInfo << "Solving timestep " << i*tstep << ".\n"; sol = solver.compute(M.matrix() + tstep*theta*K.matrix() ).solve(tstep*K.rhs() + (M.matrix()-tstep*(1.0-theta)*K.matrix())*sol); + + gsInfo << "norm of the solution = " << sol.norm() << "\n" + << "wall time = " << clock.stop() << std::endl; } - - gsInfo << "Norm of the solution" << std::endl; - gsInfo << sol.norm() << std::endl; } /// Destructor @@ -276,17 +272,17 @@ class gsXBraid_app : public gsXBraid > } /// Initializes a vector - braid_Int Init(braid_Real t, + braid_Int Init(braid_Real t, braid_Vector *u_ptr) { - gsMatrix* u = new gsMatrix(M.numDofs(), 1); + gsVector* u = new gsVector(M.numDofs()); if (t != tstart) { // Intermediate solution - u->setZero(M.numDofs(), 1); + u->setZero(M.numDofs()); } else { // Initial solution - 
u->setZero(M.numDofs(), 1); + u->setZero(M.numDofs()); } *u_ptr = (braid_Vector) u; @@ -299,19 +295,19 @@ class gsXBraid_app : public gsXBraid > braid_Vector fstop, BraidStepStatus &pstatus) { - gsMatrix* _u = (gsMatrix*) u; - T tstart, tstop; + gsVector* u_ptr = (gsVector*) u; // Get time step information - pstatus.GetTstartTstop(&tstart, &tstop); - T tstep(tstop - tstart); - + std::pair time = + static_cast(pstatus).timeInterval(); + T tstep(time.second - time.first); + // Solve spatial problem gsSparseSolver<>::CGDiagonal solver; - *_u = solver.compute(M.matrix() + - tstep*theta*K.matrix() - ).solve(tstep*K.rhs() + - (M.matrix()-tstep*(1.0-theta)*K.matrix())*(*_u)); + *u_ptr = solver.compute(M.matrix() + + tstep*theta*K.matrix() + ).solve(tstep*K.rhs() + + (M.matrix()-tstep*(1.0-theta)*K.matrix())*(*u_ptr)); // no refinement pstatus.SetRFactor(1); return braid_Int(0); @@ -321,21 +317,20 @@ class gsXBraid_app : public gsXBraid > braid_Int BufSize(braid_Int *size_ptr, BraidBufferStatus &status) { - *size_ptr = sizeof(T)*(M.numDofs()+1); + *size_ptr = sizeof(T)*(M.numDofs()+2); return braid_Int(0); } - /// Handles + /// Handles access for input/output braid_Int Access(braid_Vector u, BraidAccessStatus &astatus) { if(static_cast(astatus).done() && static_cast(astatus).timeIndex() == static_cast(astatus).times()) { - gsMatrix* _u = (gsMatrix*) u; - gsInfo << "Norm of the solution" << std::endl; - gsInfo << _u->norm() << std::endl; - } + gsVector* u_ptr = (gsVector*) u; + gsInfo << "norm of the solution = " << u_ptr->norm() << std::endl; + } return braid_Int(0); } From 0bd46f9764bb8d5124eb56eae63c377b85581612 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 21 Jan 2021 06:46:19 +0100 Subject: [PATCH 023/174] Minor improvements of gsXBraid extension --- extensions/gsXBraid/gsXBraid.h | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/extensions/gsXBraid/gsXBraid.h b/extensions/gsXBraid/gsXBraid.h index 
271fc705bf..b3d9c3f236 100644 --- a/extensions/gsXBraid/gsXBraid.h +++ b/extensions/gsXBraid/gsXBraid.h @@ -82,8 +82,12 @@ namespace gismo { /// Destructor virtual ~gsXBraid(); - /// Free - virtual braid_Int Free(braid_Vector u) { return braid_Int(0); } + /// Frees the given vector (dummy method) + virtual braid_Int Free(braid_Vector) { return braid_Int(0); } + + /// Computes the residual (dummy method) + virtual braid_Int Residual(braid_Vector, braid_Vector, BraidStepStatus&) + { GISMO_NO_IMPLEMENTATION } /// Runs the parallel-in-time multigrid solver void solve() { core.Drive(); } @@ -215,7 +219,7 @@ namespace gismo { /// Sets the file input/output level void SetFileIOLevel(braid_Int io_level) { core.SetFileIOLevel(io_level); } - /// Sets the coarse-relaxation weight + /// Sets the C-relaxation weight void SetCRelaxWt(braid_Int level, braid_Real Cwt) { core.SetCRelaxWt(level, Cwt); } /// Sets the time cutoff @@ -290,18 +294,18 @@ namespace gismo { /// Destructor virtual ~gsXBraid(); - /// Clones a given vector + /// Clones the given vector virtual braid_Int Clone(braid_Vector u, braid_Vector *v_ptr) { gsMatrix* u_ptr = (gsMatrix*) u; gsMatrix* v = new gsMatrix(); - (*v) = (*u_ptr); + *v = *u_ptr; *v_ptr = (braid_Vector) v; return braid_Int(0); } - /// Frees a given vector + /// Frees the given vector virtual braid_Int Free(braid_Vector u) { gsMatrix* u_ptr = (gsMatrix*) u; @@ -321,7 +325,7 @@ namespace gismo { return braid_Int(0); } - /// Computes the spatial norm of a given vector + /// Computes the spatial norm of the given vector virtual braid_Int SpatialNorm(braid_Vector u, braid_Real *norm_ptr) { @@ -384,18 +388,18 @@ namespace gismo { /// Destructor virtual ~gsXBraid(); - /// Clones a given vector + /// Clones the given vector virtual braid_Int Clone(braid_Vector u, braid_Vector *v_ptr) { gsVector* u_ptr = (gsVector*) u; gsVector* v = new gsVector(); - (*v) = (*u_ptr); + *v = *u_ptr; *v_ptr = (braid_Vector) v; return braid_Int(0); } - /// Frees a given 
vector + /// Frees the given vector virtual braid_Int Free(braid_Vector u) { gsVector* u_ptr = (gsVector*) u; @@ -415,7 +419,7 @@ namespace gismo { return braid_Int(0); } - /// Computes the spatial norm of a given vector + /// Computes the spatial norm of the given vector virtual braid_Int SpatialNorm(braid_Vector u, braid_Real *norm_ptr) { From 36b5d848db547d7d02214bf8826134df02c57d26 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 21 Jan 2021 06:46:50 +0100 Subject: [PATCH 024/174] Improvements of XBraid heat equation example --- examples/heatEquation_example2.cpp | 2 +- examples/xbraid_heatEquation_example.cpp | 101 ++++++++++++++++------- 2 files changed, 72 insertions(+), 31 deletions(-) diff --git a/examples/heatEquation_example2.cpp b/examples/heatEquation_example2.cpp index 9044c8e41b..15ff32a28c 100644 --- a/examples/heatEquation_example2.cpp +++ b/examples/heatEquation_example2.cpp @@ -64,7 +64,7 @@ int main(int argc, char *argv[]) for (int i = 0; i < numRefine; ++i) bases.uniformRefine(); - real_t theta = 0.0; + real_t theta = 0.5; real_t endTime = 0.1; int numSteps = 40; diff --git a/examples/xbraid_heatEquation_example.cpp b/examples/xbraid_heatEquation_example.cpp index 03ddce989d..52e44f8571 100644 --- a/examples/xbraid_heatEquation_example.cpp +++ b/examples/xbraid_heatEquation_example.cpp @@ -59,6 +59,7 @@ class gsXBraid_app : public gsXBraid< gsVector > gsXBraid_app(const gsMpiComm& comm, const T& tstart, const T& tstop, + const T& theta, index_t numTime, index_t numRefine, index_t numElevate) @@ -68,7 +69,7 @@ class gsXBraid_app : public gsXBraid< gsVector > numTime(numTime), tstart(tstart), tstop(tstop), - theta(0.0), + theta(theta), tstep( (tstop-tstart)/numTime ), patches(*gsNurbsCreator<>::BSplineSquareDeg(2)), bases(patches), @@ -175,11 +176,12 @@ class gsXBraid_app : public gsXBraid< gsVector > // Temporal discretisation parameters index_t numTime = 40; + T theta = 0.5; T tfinal = 0.1; // Parallel-in-time multigrid parameters 
index_t CFactor = 2; - index_t info = 2; + index_t access = 1; index_t maxIter = 100; index_t maxLevel = 30; index_t minCLevel = 2; @@ -188,6 +190,7 @@ class gsXBraid_app : public gsXBraid< gsVector > index_t numMaxRef = 1; index_t numRelax = 1; index_t numStorage =-1; + index_t print = 2; index_t tnorm = 2; // 1-norm, 2-norm, inf-norm T absTol = 1e-10; @@ -213,22 +216,24 @@ class gsXBraid_app : public gsXBraid< gsVector > // Temporal diescretisation parameters cmd.addInt( "n", "timeSteps", "Number of parallel-in-time steps", numTime ); cmd.addReal( "t", "time", "Final time", tfinal ); + cmd.addReal( "T", "theta", "Implicitness parameter of the two-level theta scheme", theta); // Parallel-in-time multigrid parameters + cmd.addInt( "", "numStorage", "Number of storage of the parallel-in-time multigrid solver", numStorage ); + cmd.addInt( "A", "access", "Access level (neve [=0], =after finished [=1(default)], each iteration [=2]", access ); cmd.addInt( "C", "CFactor", "Coarsening factor of the parallel-in-time multigrid solver", CFactor ); - cmd.addInt( "I", "info", "Print level (no output [=0], =runtime inforation [=1], run statistics [=2(default)], debug [=3])", info ); - cmd.addInt( "M", "maxIter", "Maximum iteration numbers of the parallel-in-time multigrid solver", maxIter ); - cmd.addInt( "L", "maxLevel", "Maximum numbers of parallel-in-time multigrid levels", maxLevel ); - cmd.addInt( "l", "minCLevel", "Minimum level of the parallel-in-time multigrid solver", minCLevel ); cmd.addInt( "F", "numFMG", "Number of full multigrid steps of the parallel-in-time multigrid solver", numFMG ); - cmd.addInt( "V", "numFMGVcyc", "Number of full multigrid V-cycles of the parallel-in-time multigrid solver", numFMGVcyc ); + cmd.addInt( "L", "maxLevel", "Maximum numbers of parallel-in-time multigrid levels", maxLevel ); + cmd.addInt( "M", "maxIter", "Maximum iteration numbers of the parallel-in-time multigrid solver", maxIter ); + cmd.addInt( "N", "norm", "Temporal norm of the 
parallel-in-time multigrid solver (1-norm [=1], 2-norm [=2(default)], inf-norm [=3])", tnorm ); + cmd.addInt( "P", "print", "Print level (no output [=0], =runtime inforation [=1], run statistics [=2(default)], debug [=3])", print ); cmd.addInt( "R", "numMaxRef", "Maximum number of refinements of the parallel-in-time multigrid solver", numMaxRef ); + cmd.addInt( "V", "numFMGVcyc", "Number of full multigrid V-cycles of the parallel-in-time multigrid solver", numFMGVcyc ); cmd.addInt( "X", "numRelax", "Number of relaxation steps of the parallel-in-time multigrid solver", numRelax ); - cmd.addInt( "", "numStorage", "Number of storage of the parallel-in-time multigrid solver", numStorage ); - cmd.addInt( "T", "tnorm", "Temporal norm of the parallel-in-time multigrid solver (1-norm [=1], 2-norm [=2(default)], inf-norm [=3])", tnorm ); + cmd.addInt( "l", "minCLevel", "Minimum level of the parallel-in-time multigrid solver", minCLevel ); - cmd.addReal( "", "absTol", "Absolute tolerance of the parallel-in-time multigrid solver", absTol ); - cmd.addReal( "", "relTol", "Relative tolerance of the parallel-in-time multigrid solver", relTol ); + cmd.addReal( "", "absTol", "Absolute tolerance of the parallel-in-time multigrid solver", absTol ); + cmd.addReal( "", "relTol", "Relative tolerance of the parallel-in-time multigrid solver", relTol ); cmd.addSwitch( "fmg" , "Perform full multigrid (default is off)", fmg); cmd.addSwitch( "incrMaxLevels" , "Increase the maximum number of parallel-in-time multigrid levels after performing a refinement (default is off)", incrMaxLevels); @@ -240,7 +245,7 @@ class gsXBraid_app : public gsXBraid< gsVector > cmd.getValues(argc,argv); // Create instance - gsXBraid_app app(comm, 0.0, tfinal, numTime, numRefine, numElevate); + gsXBraid_app app(comm, 0.0, tfinal, theta, numTime, numRefine, numElevate); if (absTol != 1e-10) app.SetAbsTol(absTol); @@ -257,17 +262,20 @@ class gsXBraid_app : public gsXBraid< gsVector > app.SetNFMG(numFMG); 
app.SetNFMGVcyc(numFMGVcyc); app.SetNRelax(numRelax); - app.SetPrintLevel(info); + app.SetAccessLevel(access); + app.SetPrintLevel(print); app.SetStorage(numStorage); app.SetTemporalNorm(tnorm); - if (fmg) app.SetFMG(); + if (fmg) app.SetFMG(); if (incrMaxLevels) app.SetIncrMaxLevels(); - if (periodic) app.SetPeriodic(1); else app.SetPeriodic(0); - if (refine) app.SetRefine(1); else app.SetRefine(0); - if (sequential) app.SetSeqSoln(1); else app.SetSeqSoln(0); - if (skip) app.SetSkip(1); else app.SetSkip(0); - + if (periodic) app.SetPeriodic(1); else app.SetPeriodic(0); + if (refine) app.SetRefine(1); else app.SetRefine(0); + if (sequential) app.SetSeqSoln(1); else app.SetSeqSoln(0); + if (skip) app.SetSkip(1); else app.SetSkip(0); + + //app.SetSpatialCoarsenAndRefine(); + return app; } @@ -296,6 +304,13 @@ class gsXBraid_app : public gsXBraid< gsVector > BraidStepStatus &pstatus) { gsVector* u_ptr = (gsVector*) u; + gsVector* ustop_ptr = (gsVector*) ustop; + + // XBraid forcing + if (fstop != NULL) { + gsVector* fstop_ptr = (gsVector*) fstop; + *u_ptr += *fstop_ptr; + } // Get time step information std::pair time = @@ -306,10 +321,20 @@ class gsXBraid_app : public gsXBraid< gsVector > gsSparseSolver<>::CGDiagonal solver; *u_ptr = solver.compute(M.matrix() + tstep*theta*K.matrix() - ).solve(tstep*K.rhs() + - (M.matrix()-tstep*(1.0-theta)*K.matrix())*(*u_ptr)); - // no refinement - pstatus.SetRFactor(1); + ).solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*(1.0-theta)*K.matrix())*(*u_ptr), + *ustop_ptr); + + // Carry out adaptive refinement in time + if (static_cast(pstatus).level() == 0) { + braid_Real error = static_cast(pstatus).error(); + if (error != braid_Real(-1.0)) { + braid_Int rfactor = (braid_Int) std::ceil( std::sqrt( error / 1e-3) ); + pstatus.SetRFactor(rfactor); + } else + pstatus.SetRFactor(1); + } + return braid_Int(0); } @@ -333,16 +358,32 @@ class gsXBraid_app : public gsXBraid< gsVector > } return braid_Int(0); } + + /// Performs spatial 
coarsening + virtual int Coarsen(braid_Vector fu, + braid_Vector *cu_ptr, + BraidCoarsenRefStatus &status) { + gsInfo << "Coarsen\n"; + gsVector *fu_ptr = (gsVector*) fu; + gsVector* cu = new gsVector(); + *cu = *fu_ptr; + *cu_ptr = (braid_Vector) cu; + return 0; + } - // Not needed in this example - braid_Int Residual(braid_Vector u, - braid_Vector r, - BraidStepStatus &pstatus) - { - return braid_Int(0); + // Performs spatial refinement + virtual int Refine(braid_Vector cu, + braid_Vector *fu_ptr, + BraidCoarsenRefStatus &status) { + gsInfo << "Refine\n"; + gsVector *cu_ptr = (gsVector*) cu; + gsVector* fu = new gsVector(); + *fu = *cu_ptr; + *fu_ptr = (braid_Vector) fu; + return 0; } }; - + } // ending namespace gismo #endif From 7d804ba9eb66b2bfbbc5082880c342ec2181f19b Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Mon, 22 Mar 2021 09:53:35 +0100 Subject: [PATCH 025/174] Improvements of XBraid heat equation example --- examples/xbraid_heatEquation_example.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/examples/xbraid_heatEquation_example.cpp b/examples/xbraid_heatEquation_example.cpp index 52e44f8571..802fa545dd 100644 --- a/examples/xbraid_heatEquation_example.cpp +++ b/examples/xbraid_heatEquation_example.cpp @@ -28,7 +28,7 @@ class gsXBraid_app : public gsXBraid< gsVector > { private: // Spatial discretisation parameters - index_t numRefine, numElevate; + index_t numRefine, numElevate, numIncrease; // Temporal discretisation parameters index_t numTime; @@ -62,10 +62,12 @@ class gsXBraid_app : public gsXBraid< gsVector > const T& theta, index_t numTime, index_t numRefine, - index_t numElevate) + index_t numElevate, + index_t numIncrease) : gsXBraid< gsVector >::gsXBraid(comm, tstart, tstop, (int)numTime), numRefine(numRefine), numElevate(numElevate), + numIncrease(numIncrease), numTime(numTime), tstart(tstart), tstop(tstop), @@ -104,6 +106,10 @@ class gsXBraid_app : public gsXBraid< gsVector > 
bases.setDegree(tmp); } + // Increase and p-refine the basis + if (numIncrease >0) + bases.degreeIncrease(numIncrease); + // h-refine the basis for (int i = 0; i < numRefine; ++i) bases.uniformRefine(); @@ -171,8 +177,9 @@ class gsXBraid_app : public gsXBraid< gsVector > std::string fn("pde/poisson2d_bvp.xml"); // Spatial discretisation parameters - index_t numRefine = 2; - index_t numElevate = 0; + index_t numRefine = 2; + index_t numElevate = 0; + index_t numIncrease = 0; // Temporal discretisation parameters index_t numTime = 40; @@ -211,6 +218,8 @@ class gsXBraid_app : public gsXBraid< gsVector > // Spatial discretisation parameters cmd.addInt( "e", "degreeElevation", "Number of degree elevation steps to perform before solving (0: equalize degree in all directions)", numElevate ); + cmd.addInt( "i", "degreeIncrease", + "Number of degree increase steps to perform before solving (0: equalize degree in all directions)", numIncrease ); cmd.addInt( "r", "uniformRefine", "Number of uniform h-refinement steps to perform before solving", numRefine ); // Temporal diescretisation parameters @@ -245,7 +254,7 @@ class gsXBraid_app : public gsXBraid< gsVector > cmd.getValues(argc,argv); // Create instance - gsXBraid_app app(comm, 0.0, tfinal, theta, numTime, numRefine, numElevate); + gsXBraid_app app(comm, 0.0, tfinal, theta, numTime, numRefine, numElevate, numIncrease); if (absTol != 1e-10) app.SetAbsTol(absTol); From 29bf530b4556305df45d5559910f1dcaf05cb631 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Wed, 21 Apr 2021 22:13:22 +0200 Subject: [PATCH 026/174] Updated XBraid example --- extensions/gsXBraid/CMakeLists.txt | 17 ++ .../examples}/xbraid_heatEquation_example.cpp | 203 +++++++++++++----- 2 files changed, 172 insertions(+), 48 deletions(-) rename {examples => extensions/gsXBraid/examples}/xbraid_heatEquation_example.cpp (66%) diff --git a/extensions/gsXBraid/CMakeLists.txt b/extensions/gsXBraid/CMakeLists.txt index 258810e2d0..02d8b285bf 100644 --- 
a/extensions/gsXBraid/CMakeLists.txt +++ b/extensions/gsXBraid/CMakeLists.txt @@ -128,3 +128,20 @@ set (GISMO_INCLUDE_DIRS ${GISMO_INCLUDE_DIRS} ${XBRAID_INCLUDE_DIR} install(DIRECTORY ${PROJECT_SOURCE_DIR} DESTINATION include/gismo/gsXBraid FILES_MATCHING PATTERN "*.h") + +# Add example files +include_directories(${CODIPACK_INCLUDE_DIR}) +aux_cpp_directory(${CMAKE_CURRENT_SOURCE_DIR}/examples FILES) +foreach(file ${FILES}) + add_gismo_executable(${file}) + get_filename_component(tarname ${file} NAME_WE) # name without extension + set_property(TEST ${tarname} PROPERTY LABELS "${PROJECT_NAME}") + set_target_properties(${tarname} PROPERTIES FOLDER "${PROJECT_NAME}") + if( GISMO_WITH_MPI ) + target_include_directories(${tarname} PRIVATE ${MPI_INCLUDE_PATH}) + endif() + # Install the example executables (optionally) + install(TARGETS ${tarname} DESTINATION "${BIN_INSTALL_DIR}" COMPONENT exe OPTIONAL) +endforeach(file ${FILES}) + +set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}/bin/) diff --git a/examples/xbraid_heatEquation_example.cpp b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp similarity index 66% rename from examples/xbraid_heatEquation_example.cpp rename to extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp index 802fa545dd..9df302d1be 100644 --- a/examples/xbraid_heatEquation_example.cpp +++ b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp @@ -20,6 +20,15 @@ using namespace gismo; namespace gismo { + enum class gsXBraid_tmethod + { + FE_FE = 1, // forward Euler (all grids) + BE_BE = 2, // backward Euler (all grids) + CN_CN = 3, // Crank-Nicholson (all grids) + FE_BE = 4, // forward Euler (fine grid), backward Euler (coarser grids) + CN_BE = 5 // Crank-Nicholson (fine grid), backward Euler (coarser grids) + }; + /** \brief Derived class implementing the XBraid wrapper for the heat equation */ @@ -31,8 +40,8 @@ class gsXBraid_app : public gsXBraid< gsVector > index_t numRefine, numElevate, numIncrease; // Temporal 
discretisation parameters - index_t numTime; - T tstart, tstop, theta, tstep; + index_t numSteps, tmethod; + T tstart, tstop, tstep; // Spatial discretization gsMultiPatch patches; @@ -59,20 +68,20 @@ class gsXBraid_app : public gsXBraid< gsVector > gsXBraid_app(const gsMpiComm& comm, const T& tstart, const T& tstop, - const T& theta, - index_t numTime, + index_t tmethod, + index_t numSteps, index_t numRefine, index_t numElevate, index_t numIncrease) - : gsXBraid< gsVector >::gsXBraid(comm, tstart, tstop, (int)numTime), + : gsXBraid< gsVector >::gsXBraid(comm, tstart, tstop, (int)numSteps), numRefine(numRefine), numElevate(numElevate), numIncrease(numIncrease), - numTime(numTime), + numSteps(numSteps), + tmethod(tmethod), tstart(tstart), tstop(tstop), - theta(theta), - tstep( (tstop-tstart)/numTime ), + tstep( (tstop-tstart)/numSteps ), patches(*gsNurbsCreator<>::BSplineSquareDeg(2)), bases(patches), g_D(0,2), g_N(1,2), @@ -152,16 +161,54 @@ class gsXBraid_app : public gsXBraid< gsVector > gsSparseSolver<>::CGDiagonal solver; sol.setZero(M.numDofs()); - - for ( int i = 1; i<=numTime; ++i) // for all timesteps - // Compute the system for the timestep i (rhs is assumed constant wrt time) - sol = solver.compute(M.matrix() + - tstep*theta*K.matrix() - ).solve(tstep*K.rhs() + - (M.matrix()-tstep*(1.0-theta)*K.matrix())*sol); - - gsInfo << "norm of the solution = " << sol.norm() << "\n" - << "wall time = " << clock.stop() << std::endl; + + switch((gsXBraid_tmethod)tmethod) { + case gsXBraid_tmethod::FE_FE: + case gsXBraid_tmethod::FE_BE: + // Forward Euler method + + for ( int i = 1; i<=numSteps; ++i) // for all timesteps + // Compute the system for the timestep i (rhs is assumed constant wrt time) + sol = solver.compute(M.matrix() + ).solve(tstep*K.rhs() + + (M.matrix()-tstep*K.matrix())*sol); + + gsInfo << "norm of the solution = " << sol.norm() << "\n" + << "wall time = " << clock.stop() << std::endl; + break; + + case gsXBraid_tmethod::BE_BE: + // Backward Euler 
method + + for ( int i = 1; i<=numSteps; ++i) // for all timesteps + // Compute the system for the timestep i (rhs is assumed constant wrt time) + sol = solver.compute(M.matrix() + + tstep*K.matrix() + ).solve(tstep*K.rhs() + + (M.matrix())*sol); + + gsInfo << "norm of the solution = " << sol.norm() << "\n" + << "wall time = " << clock.stop() << std::endl; + break; + + case gsXBraid_tmethod::CN_CN: + case gsXBraid_tmethod::CN_BE: + // Crank-Nicholson method + + for ( int i = 1; i<=numSteps; ++i) // for all timesteps + // Compute the system for the timestep i (rhs is assumed constant wrt time) + sol = solver.compute(M.matrix() + + tstep*0.5*K.matrix() + ).solve(tstep*K.rhs() + + (M.matrix()-tstep*0.5*K.matrix())*sol); + + gsInfo << "norm of the solution = " << sol.norm() << "\n" + << "wall time = " << clock.stop() << std::endl; + break; + + default: + throw std::runtime_error("Unsupported time-stepping method"); + } } } @@ -177,31 +224,31 @@ class gsXBraid_app : public gsXBraid< gsVector > std::string fn("pde/poisson2d_bvp.xml"); // Spatial discretisation parameters - index_t numRefine = 2; - index_t numElevate = 0; - index_t numIncrease = 0; + index_t numRefine = 2; + index_t numElevate = 0; + index_t numIncrease = 0; // Temporal discretisation parameters - index_t numTime = 40; - T theta = 0.5; - T tfinal = 0.1; + index_t numSteps = 40; + index_t tmethod = (index_t)gsXBraid_tmethod::CN_CN; + T tfinal = 0.1; // Parallel-in-time multigrid parameters - index_t CFactor = 2; - index_t access = 1; - index_t maxIter = 100; - index_t maxLevel = 30; - index_t minCLevel = 2; - index_t numFMG = 1; - index_t numFMGVcyc = 1; - index_t numMaxRef = 1; - index_t numRelax = 1; - index_t numStorage =-1; - index_t print = 2; - index_t tnorm = 2; // 1-norm, 2-norm, inf-norm + index_t CFactor = 2; + index_t access = 1; + index_t maxIter = 100; + index_t maxLevel = 30; + index_t minCLevel = 2; + index_t numFMG = 1; + index_t numFMGVcyc = 1; + index_t numMaxRef = 1; + index_t numRelax = 
1; + index_t numStorage =-1; + index_t print = 2; + index_t tnorm = 2; // 1-norm, 2-norm, inf-norm - T absTol = 1e-10; - T relTol = 1e-3; + T absTol = 1e-10; + T relTol = 1e-3; bool fmg = false; bool incrMaxLevels = false; @@ -223,9 +270,9 @@ class gsXBraid_app : public gsXBraid< gsVector > cmd.addInt( "r", "uniformRefine", "Number of uniform h-refinement steps to perform before solving", numRefine ); // Temporal diescretisation parameters - cmd.addInt( "n", "timeSteps", "Number of parallel-in-time steps", numTime ); - cmd.addReal( "t", "time", "Final time", tfinal ); - cmd.addReal( "T", "theta", "Implicitness parameter of the two-level theta scheme", theta); + cmd.addInt( "n", "numSteps", "Number of parallel-in-time steps", numSteps ); + cmd.addInt( "T", "tmethod", "Time-stepping scheme", tmethod); + cmd.addReal( "t", "tfinal", "Final time", tfinal ); // Parallel-in-time multigrid parameters cmd.addInt( "", "numStorage", "Number of storage of the parallel-in-time multigrid solver", numStorage ); @@ -254,7 +301,7 @@ class gsXBraid_app : public gsXBraid< gsVector > cmd.getValues(argc,argv); // Create instance - gsXBraid_app app(comm, 0.0, tfinal, theta, numTime, numRefine, numElevate, numIncrease); + gsXBraid_app app(comm, 0.0, tfinal, tmethod, numSteps, numRefine, numElevate, numIncrease); if (absTol != 1e-10) app.SetAbsTol(absTol); @@ -328,12 +375,72 @@ class gsXBraid_app : public gsXBraid< gsVector > // Solve spatial problem gsSparseSolver<>::CGDiagonal solver; - *u_ptr = solver.compute(M.matrix() + - tstep*theta*K.matrix() - ).solveWithGuess(tstep*K.rhs() + - (M.matrix()-tstep*(1.0-theta)*K.matrix())*(*u_ptr), - *ustop_ptr); - + + switch((gsXBraid_tmethod)tmethod) { + case gsXBraid_tmethod::FE_FE: + // Forward Euler method (all grids) + *u_ptr = solver.compute(M.matrix() + ).solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*K.matrix())*(*u_ptr), + *ustop_ptr); + break; + + case gsXBraid_tmethod::FE_BE: + if (static_cast(pstatus).level() == 0) { + // Forward 
Euler method (fine grid) + *u_ptr = solver.compute(M.matrix() + ).solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*K.matrix())*(*u_ptr), + *ustop_ptr); + } else { + // Backward Euler method (coarse grids) + *u_ptr = solver.compute(M.matrix() + + tstep*K.matrix() + ).solveWithGuess(tstep*K.rhs() + + (M.matrix())*(*u_ptr), + *ustop_ptr); + } + break; + + case gsXBraid_tmethod::BE_BE: + // Backward Euler method (all grids) + *u_ptr = solver.compute(M.matrix() + + tstep*K.matrix() + ).solveWithGuess(tstep*K.rhs() + + (M.matrix())*(*u_ptr), + *ustop_ptr); + break; + + case gsXBraid_tmethod::CN_CN: + // Crank-Nicholson method (all grids) + *u_ptr = solver.compute(M.matrix() + + tstep*0.5*K.matrix() + ).solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*0.5*K.matrix())*(*u_ptr), + *ustop_ptr); + break; + + case gsXBraid_tmethod::CN_BE: + if (static_cast(pstatus).level() == 0) { + *u_ptr = solver.compute(M.matrix() + + tstep*0.5*K.matrix() + ).solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*0.5*K.matrix())*(*u_ptr), + *ustop_ptr); + } else { + // Backward Euler method (coarse grids) + *u_ptr = solver.compute(M.matrix() + + tstep*K.matrix() + ).solveWithGuess(tstep*K.rhs() + + (M.matrix())*(*u_ptr), + *ustop_ptr); + } + break; + + default: + throw std::runtime_error("Unsupported time-stepping method"); + } + // Carry out adaptive refinement in time if (static_cast(pstatus).level() == 0) { braid_Real error = static_cast(pstatus).error(); From ecf94b004a8c3963ed3861e73d339a18064c2f13 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Wed, 5 May 2021 15:34:11 +0200 Subject: [PATCH 027/174] Updated README file --- extensions/gsXBraid/README.md | 78 +++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 extensions/gsXBraid/README.md diff --git a/extensions/gsXBraid/README.md b/extensions/gsXBraid/README.md new file mode 100644 index 0000000000..50aeda1c1e --- /dev/null +++ b/extensions/gsXBraid/README.md @@ -0,0 +1,78 @@ +# XBraid 
extension + +G+Smo extension for the [XBraid - Parallel-in-time Solver Package](https://github.com/XBraid/xbraid). + +|CMake flags|```-DGISMO_WITH_XBRAID=ON``` (default ```OFF```)| +|--:|---| +|Required additional CMake flags|| +|License|[MPL 2.0](https://www.mozilla.org/en-US/MPL/2.0/)| +|OS support|Linux, Windows, macOS| +|Status|completed| +|Developer|Matthias Möller| +|Maintainer|M.Moller@tudelft.nl| +|Last checked|05-05-2021| + +*** +__Table of content__ +1. [Introduction](#introduction) +2. [Usage example](#usage_example) +*** + +__Introdution__ + +The gsXBraid extension builds on the open-source +[XBraid](https://github.com/XBraid/xbraid) package developed at [ +Lawrence Livermore National +Laboratory](https://computation.llnl.gov/projects/parallel-time-integration-multigrid/), +and at collaborating [academic +institutions](https://github.com/XBraid/xbraid/wiki/Team). XBraid is a +non-intrusive, optimal-scaling parallel-in-time solver that builds on +multigrid reduction techniques (multigrid-reduction-in-time or MGRIT). 
+ +The gsXBraid extension provides a generic wrapper to XBraid's C++ +interface that can be easily customized by deriving an application +from the class `gsXBraid` and overriding certain virtual methods: + +```cpp +virtual braid_Int Access(braid_Vector, BraidAccessStatus&); +virtual braid_Int BufPack(braid_Vector, void*, BraidBufferStatus&); +virtual braid_Int BufSize(braid_Int*, BraidBufferStatus&); +virtual braid_Int BufUnpack(void*, braid_Vector*, BraidBufferStatus&); +virtual braid_Int Clone(braid_Vector, braid_Vector*); +virtual braid_Int Coarsen(braid_Vector, braid_Vector*, BraidCoarsenRefStatus&); +virtual braid_Int Free(braid_Vector); +virtual braid_Int Init(braid_Real, braid_Vector*); +virtual braid_Int Refine(braid_Vector, braid_Vector*, BraidCoarsenRefStatus&); +virtual braid_Int Residual(braid_Vector, braid_Vector, BraidStepStatus&); +virtual braid_Int SpatialNorm(braid_Vector, braid_Real*); +virtual braid_Int Step(braid_Vector, braid_Vector, braid_Vector, BraidStepStatus&); +virtual braid_Int Sum(braid_Real, braid_Vector, braid_Real, braid_Vector); +``` + +__Usage example__ + +The file ```xbraid_heatEquation_example.cpp``` illustrates the basic usage of the gsXBraid extension. + +1. Configuration and compilation + ```bash + mkdir build + cd build + cmake .. -DGISMO_WITH_XBRAID=ON -DGISMO_WITH_MPI=ON + make xbraid_heatEquation_example -j4 + ``` +2. Execution + ```bash + mpirun -np ./bin/xbraid_heatEquation_example -n 250 -r 6 -i 3 + ``` + + This will solve the two-dimensional heat equation on a unit square + with 250 time steps in the time interval [0, 0.1] using + MPI processes. The spatial domain is 6 times regularly refined in + space (h-refinement) and the approximation order is increased 3 + times (p-refinement). Order elevation instead of order increase + can be achieved by replacing `-i` by `-e`. 
+ + For a complete list of command-line argument run + ```bash + ./bin/xbraid_heatEquation_example -h + ``` From c5bde47cfd0c1378ec22cf67efdf43c201b5ed03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Wed, 5 May 2021 22:01:33 +0200 Subject: [PATCH 028/174] Update README.md --- extensions/gsXBraid/README.md | 38 +++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/extensions/gsXBraid/README.md b/extensions/gsXBraid/README.md index 50aeda1c1e..a31f71147f 100644 --- a/extensions/gsXBraid/README.md +++ b/extensions/gsXBraid/README.md @@ -4,7 +4,7 @@ G+Smo extension for the [XBraid - Parallel-in-time Solver Package](https://githu |CMake flags|```-DGISMO_WITH_XBRAID=ON``` (default ```OFF```)| |--:|---| -|Required additional CMake flags|| +|Required additional CMake flags|```-DGISMO_WITH_MPI=ON``` (recommended)
```-DGISMO_WITH_OPENMP=ON``` (optionally)| |License|[MPL 2.0](https://www.mozilla.org/en-US/MPL/2.0/)| |OS support|Linux, Windows, macOS| |Status|completed| @@ -20,7 +20,7 @@ __Table of content__ __Introdution__ -The gsXBraid extension builds on the open-source +The XBraid extension builds on the open-source [XBraid](https://github.com/XBraid/xbraid) package developed at [ Lawrence Livermore National Laboratory](https://computation.llnl.gov/projects/parallel-time-integration-multigrid/), @@ -29,9 +29,9 @@ institutions](https://github.com/XBraid/xbraid/wiki/Team). XBraid is a non-intrusive, optimal-scaling parallel-in-time solver that builds on multigrid reduction techniques (multigrid-reduction-in-time or MGRIT). -The gsXBraid extension provides a generic wrapper to XBraid's C++ +The XBraid extension provides a generic wrapper to XBraid's C++ interface that can be easily customized by deriving an application -from the class `gsXBraid` and overriding certain virtual methods: +from the class `gsXBraid` and overriding some or all virtual methods: ```cpp virtual braid_Int Access(braid_Vector, BraidAccessStatus&); @@ -53,14 +53,14 @@ __Usage example__ The file ```xbraid_heatEquation_example.cpp``` illustrates the basic usage of the gsXBraid extension. -1. Configuration and compilation +1. Configuration and compilation (MPI-only mode) ```bash mkdir build cd build cmake .. -DGISMO_WITH_XBRAID=ON -DGISMO_WITH_MPI=ON make xbraid_heatEquation_example -j4 ``` -2. Execution +2. Execution (MPI-only mode) ```bash mpirun -np ./bin/xbraid_heatEquation_example -n 250 -r 6 -i 3 ``` @@ -76,3 +76,29 @@ The file ```xbraid_heatEquation_example.cpp``` illustrates the basic usage of th ```bash ./bin/xbraid_heatEquation_example -h ``` + +3. Configuration and compilation (MPI-OpenMP mode) + ```bash + mkdir build + cd build + cmake .. -DGISMO_WITH_XBRAID=ON -DGISMO_WITH_MPI=ON -DGISMO_WITH_OPENMP=ON + make xbraid_heatEquation_example -j4 + ``` + +4. 
Execution (MPI-OpenMP mode) + ```bash + OMP_NUM_THREADS= mpirun -np ./bin/xbraid_heatEquation_example -n 250 -r 6 -i 3 + ``` + + This will solve the two-dimensional heat equation on a unit square + with 250 time steps in the time interval [0, 0.1] using + MPI processes and OpenMP threads per MPI process. + As before, the spatial domain is 6 times regularly refined in + space (h-refinement) and the approximation order is increased 3 + times (p-refinement). Order elevation instead of order increase + can be achieved by replacing `-i` by `-e`. + + For a complete list of command-line argument run + ```bash + ./bin/xbraid_heatEquation_example -h + ``` From 006e9bd23544c33d5cc23264c6a3b02afa593452 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Wed, 5 May 2021 23:15:32 +0200 Subject: [PATCH 029/174] Updated XBraid example to use XML file for problem configuration --- extensions/gsXBraid/CMakeLists.txt | 3 + .../examples/xbraid_heatEquation_example.cpp | 265 ++++++++++-------- .../filedata/pde/heat2d_square_ibvp.xml | 69 +++++ 3 files changed, 225 insertions(+), 112 deletions(-) create mode 100644 extensions/gsXBraid/filedata/pde/heat2d_square_ibvp.xml diff --git a/extensions/gsXBraid/CMakeLists.txt b/extensions/gsXBraid/CMakeLists.txt index 02d8b285bf..146ace445d 100644 --- a/extensions/gsXBraid/CMakeLists.txt +++ b/extensions/gsXBraid/CMakeLists.txt @@ -129,6 +129,9 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR} DESTINATION include/gismo/gsXBraid FILES_MATCHING PATTERN "*.h") +# add filedata folder +add_definitions(-DXBRAID_DATA_DIR="${CMAKE_CURRENT_SOURCE_DIR}/filedata/") + # Add example files include_directories(${CODIPACK_INCLUDE_DIR}) aux_cpp_directory(${CMAKE_CURRENT_SOURCE_DIR}/examples FILES) diff --git a/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp index 9df302d1be..b8afba7b80 100644 --- a/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp +++ 
b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp @@ -20,7 +20,7 @@ using namespace gismo; namespace gismo { - enum class gsXBraid_tmethod + enum class gsXBraid_typeMethod { FE_FE = 1, // forward Euler (all grids) BE_BE = 2, // backward Euler (all grids) @@ -40,23 +40,29 @@ class gsXBraid_app : public gsXBraid< gsVector > index_t numRefine, numElevate, numIncrease; // Temporal discretisation parameters - index_t numSteps, tmethod; + index_t numSteps, typeMethod; T tstart, tstop, tstep; // Spatial discretization - gsMultiPatch patches; + gsMultiPatch mp; gsMultiBasis bases; // Boundary conditions - gsBoundaryConditions bcInfo; - gsConstantFunction g_D, g_N; + gsBoundaryConditions bc; + // Assembler options + gsOptionList Aopt; + // Expression assembler gsExprAssembler K, M; - gsConstantFunction f; + gsFunctionExpr f, u0, ms; // Solution gsVector sol; + + // Solver + typedef typename gsSparseSolver<>::CGDiagonal solver; + solver* m_solver; typedef typename gsExprAssembler::geometryMap geometryMap; typedef typename gsExprAssembler::variable variable; @@ -68,37 +74,49 @@ class gsXBraid_app : public gsXBraid< gsVector > gsXBraid_app(const gsMpiComm& comm, const T& tstart, const T& tstop, - index_t tmethod, + index_t typeMethod, index_t numSteps, index_t numRefine, index_t numElevate, - index_t numIncrease) + index_t numIncrease, + std::string& fn) : gsXBraid< gsVector >::gsXBraid(comm, tstart, tstop, (int)numSteps), numRefine(numRefine), numElevate(numElevate), numIncrease(numIncrease), numSteps(numSteps), - tmethod(tmethod), + typeMethod(typeMethod), tstart(tstart), tstop(tstop), tstep( (tstop-tstart)/numSteps ), - patches(*gsNurbsCreator<>::BSplineSquareDeg(2)), - bases(patches), - g_D(0,2), g_N(1,2), - K(1,1), M(1,1), f(1,2) + K(1,1), M(1,1), + m_solver(new solver) { ///////////////////////////////////////////////////////////////////////////////////////////// // Code for heat equation starts here // 
///////////////////////////////////////////////////////////////////////////////////////////// + + gsFileData fd(fn); + if (this->id() == 0) gsInfo << "Loaded file " << fd.lastPath() << "\n"; + + fd.getId(0, mp); // id=0: Multipatch domain + bases = gsMultiBasis(mp); - // Define geometry, must be a gsMultiPatch object - patches.computeTopology(); + fd.getId(1, f); // id=1: right-hand side function + if (this->id() == 0) gsInfo << "Source function " << f << "\n"; + + fd.getId(2, bc); // id=2: boundary conditions + if (this->id() == 0) gsInfo << "Boundary conditions:\n" << bc << "\n"; - // Boundary conditions - bcInfo.addCondition(0, boundary::west, condition_type::neumann , &g_N); - bcInfo.addCondition(0, boundary::east, condition_type::dirichlet, &g_D); - bcInfo.addCondition(0, boundary::north, condition_type::dirichlet, &g_D); - bcInfo.addCondition(0, boundary::south, condition_type::dirichlet, &g_D); + fd.getId(3, u0); // id=3: initial conditions + if (this->id() == 0) gsInfo << "Initial conditions:\n" << u0 << "\n"; + + fd.getId(4, ms); // id=4: manufactured solution + if (this->id() == 0) gsInfo << "Manufactured solution:\n" << ms << "\n"; + + fd.getId(5, Aopt); // id=5: assembler options + K.setOptions(Aopt); + M.setOptions(Aopt); // Elevate and p-refine the basis to order k + numElevate // where k is the highest degree in the bases @@ -106,7 +124,7 @@ class gsXBraid_app : public gsXBraid< gsVector > { // Find maximum degree with respect to all the variables int tmp = bases.maxDegree(0); - for (short_t j = 1; j < patches.parDim(); ++j ) + for (short_t j = 1; j < mp.parDim(); ++j ) if ( tmp < bases.maxDegree(j) ) tmp = bases.maxDegree(j); @@ -116,7 +134,7 @@ class gsXBraid_app : public gsXBraid< gsVector > } // Increase and p-refine the basis - if (numIncrease >0) + if (numIncrease > 0) bases.degreeIncrease(numIncrease); // h-refine the basis @@ -128,16 +146,16 @@ class gsXBraid_app : public gsXBraid< gsVector > M.setIntegrationElements(bases); // Set the 
geometry map - geometryMap G_K = K.getMap(patches); - geometryMap G_M = M.getMap(patches); + geometryMap G_K = K.getMap(mp); + geometryMap G_M = M.getMap(mp); // Set the discretization space space u_K = K.getSpace(bases); space u_M = M.getSpace(bases); u_K.setInterfaceCont(0); u_M.setInterfaceCont(0); - u_K.addBc( bcInfo.get("Dirichlet") ); - u_M.addBc( bcInfo.get("Dirichlet") ); + u_K.addBc( bc.get("Dirichlet") ); + u_M.addBc( bc.get("Dirichlet") ); // Set the source term variable ff_K = K.getCoeff(f, G_K); @@ -153,67 +171,73 @@ class gsXBraid_app : public gsXBraid< gsVector > // Enforce Neumann conditions to right-hand side variable g_Neumann = K.getBdrFunction(); - K.assembleRhsBc(u_K * g_Neumann.val() * nv(G_K).norm(), bcInfo.neumannSides() ); + K.assembleRhsBc(u_K * g_Neumann.val() * nv(G_K).norm(), bc.neumannSides() ); if (this->id() == 0) { gsStopwatch clock; clock.restart(); - gsSparseSolver<>::CGDiagonal solver; sol.setZero(M.numDofs()); - switch((gsXBraid_tmethod)tmethod) { - case gsXBraid_tmethod::FE_FE: - case gsXBraid_tmethod::FE_BE: + switch((gsXBraid_typeMethod)typeMethod) { + case gsXBraid_typeMethod::FE_FE: + case gsXBraid_typeMethod::FE_BE: // Forward Euler method for ( int i = 1; i<=numSteps; ++i) // for all timesteps // Compute the system for the timestep i (rhs is assumed constant wrt time) - sol = solver.compute(M.matrix() - ).solve(tstep*K.rhs() + - (M.matrix()-tstep*K.matrix())*sol); - - gsInfo << "norm of the solution = " << sol.norm() << "\n" - << "wall time = " << clock.stop() << std::endl; + sol = m_solver->compute(M.matrix() + ).solve(tstep*K.rhs() + + (M.matrix()-tstep*K.matrix())*sol); break; - - case gsXBraid_tmethod::BE_BE: + + case gsXBraid_typeMethod::BE_BE: // Backward Euler method for ( int i = 1; i<=numSteps; ++i) // for all timesteps // Compute the system for the timestep i (rhs is assumed constant wrt time) - sol = solver.compute(M.matrix() + - tstep*K.matrix() - ).solve(tstep*K.rhs() + - (M.matrix())*sol); - - gsInfo << 
"norm of the solution = " << sol.norm() << "\n" - << "wall time = " << clock.stop() << std::endl; + sol = m_solver->compute(M.matrix() + + tstep*K.matrix() + ).solve(tstep*K.rhs() + + (M.matrix())*sol); break; - - case gsXBraid_tmethod::CN_CN: - case gsXBraid_tmethod::CN_BE: + + case gsXBraid_typeMethod::CN_CN: + case gsXBraid_typeMethod::CN_BE: // Crank-Nicholson method for ( int i = 1; i<=numSteps; ++i) // for all timesteps // Compute the system for the timestep i (rhs is assumed constant wrt time) - sol = solver.compute(M.matrix() + - tstep*0.5*K.matrix() - ).solve(tstep*K.rhs() + - (M.matrix()-tstep*0.5*K.matrix())*sol); - - gsInfo << "norm of the solution = " << sol.norm() << "\n" - << "wall time = " << clock.stop() << std::endl; + sol = m_solver->compute(M.matrix() + + tstep*0.5*K.matrix() + ).solve(tstep*K.rhs() + + (M.matrix()-tstep*0.5*K.matrix())*sol); break; - + default: throw std::runtime_error("Unsupported time-stepping method"); } + + gsInfo << "wall time = " << clock.stop() << "\n" + << "L2 norm of the solution = " << sol.norm() << "\n"; + + // gsExprEvaluator ev(M); + // solution u_sol = M.getSolution(u_M, sol); + // variable u_ex = ev.getVariable(ms, G_M); + // T l2err = math::sqrt( ev.integral( (u_ex - u_sol).sqNorm() * meas(G_M) ) ); + // T h1err = l2err + + // math::sqrt(ev.integral( ( igrad(u_ex) - grad(u_sol)*jac(G_M).inv() ).sqNorm() * meas(G_M) )); + + // gsInfo << "L2 error of the solution = " << l2err << "\n" + // << "H1 error of the solution = " << h1err << std::flush; } } /// Destructor - virtual ~gsXBraid_app() {} + virtual ~gsXBraid_app() + { + delete m_solver; + } /// Creates instance from command line argument static inline gsXBraid_app create(const gsMpiComm& comm, @@ -221,7 +245,7 @@ class gsXBraid_app : public gsXBraid< gsVector > char** argv) { // Problem parameters - std::string fn("pde/poisson2d_bvp.xml"); + std::string fn(XBRAID_DATA_DIR"pde/heat2d_square_ibvp.xml"); // Spatial discretisation parameters index_t numRefine = 2; 
@@ -230,7 +254,7 @@ class gsXBraid_app : public gsXBraid< gsVector > // Temporal discretisation parameters index_t numSteps = 40; - index_t tmethod = (index_t)gsXBraid_tmethod::CN_CN; + index_t typeMethod = (index_t)gsXBraid_typeMethod::CN_CN; T tfinal = 0.1; // Parallel-in-time multigrid parameters @@ -271,7 +295,7 @@ class gsXBraid_app : public gsXBraid< gsVector > // Temporal diescretisation parameters cmd.addInt( "n", "numSteps", "Number of parallel-in-time steps", numSteps ); - cmd.addInt( "T", "tmethod", "Time-stepping scheme", tmethod); + cmd.addInt( "T", "typeMethod", "Time-stepping scheme", typeMethod); cmd.addReal( "t", "tfinal", "Final time", tfinal ); // Parallel-in-time multigrid parameters @@ -301,7 +325,7 @@ class gsXBraid_app : public gsXBraid< gsVector > cmd.getValues(argc,argv); // Create instance - gsXBraid_app app(comm, 0.0, tfinal, tmethod, numSteps, numRefine, numElevate, numIncrease); + gsXBraid_app app(comm, 0.0, tfinal, typeMethod, numSteps, numRefine, numElevate, numIncrease, fn); if (absTol != 1e-10) app.SetAbsTol(absTol); @@ -338,6 +362,9 @@ class gsXBraid_app : public gsXBraid< gsVector > /// Initializes a vector braid_Int Init(braid_Real t, braid_Vector *u_ptr) +#if __cplusplus >= 201103L || _MSC_VER >= 1600 + override +#endif { gsVector* u = new gsVector(M.numDofs()); @@ -358,6 +385,9 @@ class gsXBraid_app : public gsXBraid< gsVector > braid_Vector ustop, braid_Vector fstop, BraidStepStatus &pstatus) +#if __cplusplus >= 201103L || _MSC_VER >= 1600 + override +#endif { gsVector* u_ptr = (gsVector*) u; gsVector* ustop_ptr = (gsVector*) ustop; @@ -373,67 +403,64 @@ class gsXBraid_app : public gsXBraid< gsVector > static_cast(pstatus).timeInterval(); T tstep(time.second - time.first); - // Solve spatial problem - gsSparseSolver<>::CGDiagonal solver; - - switch((gsXBraid_tmethod)tmethod) { - case gsXBraid_tmethod::FE_FE: + switch((gsXBraid_typeMethod)typeMethod) { + case gsXBraid_typeMethod::FE_FE: // Forward Euler method (all grids) - 
*u_ptr = solver.compute(M.matrix() - ).solveWithGuess(tstep*K.rhs() + - (M.matrix()-tstep*K.matrix())*(*u_ptr), - *ustop_ptr); + *u_ptr = m_solver->compute(M.matrix() + ).solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*K.matrix())*(*u_ptr), + *ustop_ptr); break; - case gsXBraid_tmethod::FE_BE: + case gsXBraid_typeMethod::FE_BE: if (static_cast(pstatus).level() == 0) { // Forward Euler method (fine grid) - *u_ptr = solver.compute(M.matrix() - ).solveWithGuess(tstep*K.rhs() + - (M.matrix()-tstep*K.matrix())*(*u_ptr), - *ustop_ptr); + *u_ptr = m_solver->compute(M.matrix() + ).solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*K.matrix())*(*u_ptr), + *ustop_ptr); } else { // Backward Euler method (coarse grids) - *u_ptr = solver.compute(M.matrix() + - tstep*K.matrix() - ).solveWithGuess(tstep*K.rhs() + - (M.matrix())*(*u_ptr), - *ustop_ptr); + *u_ptr = m_solver->compute(M.matrix() + + tstep*K.matrix() + ).solveWithGuess(tstep*K.rhs() + + (M.matrix())*(*u_ptr), + *ustop_ptr); } break; - case gsXBraid_tmethod::BE_BE: + case gsXBraid_typeMethod::BE_BE: // Backward Euler method (all grids) - *u_ptr = solver.compute(M.matrix() + - tstep*K.matrix() - ).solveWithGuess(tstep*K.rhs() + - (M.matrix())*(*u_ptr), - *ustop_ptr); + *u_ptr = m_solver->compute(M.matrix() + + tstep*K.matrix() + ).solveWithGuess(tstep*K.rhs() + + (M.matrix())*(*u_ptr), + *ustop_ptr); break; - case gsXBraid_tmethod::CN_CN: + case gsXBraid_typeMethod::CN_CN: // Crank-Nicholson method (all grids) - *u_ptr = solver.compute(M.matrix() + - tstep*0.5*K.matrix() - ).solveWithGuess(tstep*K.rhs() + - (M.matrix()-tstep*0.5*K.matrix())*(*u_ptr), - *ustop_ptr); + *u_ptr = m_solver->compute(M.matrix() + + tstep*0.5*K.matrix() + ).solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*0.5*K.matrix())*(*u_ptr), + *ustop_ptr); break; - case gsXBraid_tmethod::CN_BE: + case gsXBraid_typeMethod::CN_BE: if (static_cast(pstatus).level() == 0) { - *u_ptr = solver.compute(M.matrix() + - tstep*0.5*K.matrix() - 
).solveWithGuess(tstep*K.rhs() + - (M.matrix()-tstep*0.5*K.matrix())*(*u_ptr), - *ustop_ptr); + *u_ptr = m_solver->compute(M.matrix() + + tstep*0.5*K.matrix() + ).solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*0.5*K.matrix())*(*u_ptr), + *ustop_ptr); } else { // Backward Euler method (coarse grids) - *u_ptr = solver.compute(M.matrix() + - tstep*K.matrix() - ).solveWithGuess(tstep*K.rhs() + - (M.matrix())*(*u_ptr), - *ustop_ptr); + *u_ptr = m_solver->compute(M.matrix() + + tstep*K.matrix() + ).solveWithGuess(tstep*K.rhs() + + (M.matrix())*(*u_ptr), + *ustop_ptr); } break; @@ -457,6 +484,9 @@ class gsXBraid_app : public gsXBraid< gsVector > /// Sets the size of the MPI communication buffer braid_Int BufSize(braid_Int *size_ptr, BraidBufferStatus &status) +#if __cplusplus >= 201103L || _MSC_VER >= 1600 + override +#endif { *size_ptr = sizeof(T)*(M.numDofs()+2); return braid_Int(0); @@ -465,6 +495,9 @@ class gsXBraid_app : public gsXBraid< gsVector > /// Handles access for input/output braid_Int Access(braid_Vector u, BraidAccessStatus &astatus) +#if __cplusplus >= 201103L || _MSC_VER >= 1600 + override +#endif { if(static_cast(astatus).done() && static_cast(astatus).timeIndex() == @@ -476,27 +509,35 @@ class gsXBraid_app : public gsXBraid< gsVector > } /// Performs spatial coarsening - virtual int Coarsen(braid_Vector fu, - braid_Vector *cu_ptr, - BraidCoarsenRefStatus &status) { + braid_Int Coarsen(braid_Vector fu, + braid_Vector *cu_ptr, + BraidCoarsenRefStatus &status) +#if __cplusplus >= 201103L || _MSC_VER >= 1600 + override +#endif + { gsInfo << "Coarsen\n"; gsVector *fu_ptr = (gsVector*) fu; gsVector* cu = new gsVector(); *cu = *fu_ptr; *cu_ptr = (braid_Vector) cu; - return 0; + return braid_Int(0); } // Performs spatial refinement - virtual int Refine(braid_Vector cu, - braid_Vector *fu_ptr, - BraidCoarsenRefStatus &status) { + braid_Int Refine(braid_Vector cu, + braid_Vector *fu_ptr, + BraidCoarsenRefStatus &status) +#if __cplusplus >= 201103L || _MSC_VER 
>= 1600 + override +#endif + { gsInfo << "Refine\n"; gsVector *cu_ptr = (gsVector*) cu; gsVector* fu = new gsVector(); *fu = *cu_ptr; *fu_ptr = (braid_Vector) fu; - return 0; + return braid_Int(0); } }; diff --git a/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp.xml b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp.xml new file mode 100644 index 0000000000..f6d97ead52 --- /dev/null +++ b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp.xml @@ -0,0 +1,69 @@ + + + + + + 100 100 + + 100 1 + 100 2 + 100 3 + 100 4 + + + + + 1 + + + + 0 + 1 + + + + 0 1 0 2 0 3 + + + + + 0 4 + + + + + 0 + + + 0 + + + + + + + + + + + + + + + + + + + + + + 0.00000 0.00000 1.00000 1.00000 + + + 0.00000 0.00000 1.00000 1.00000 + + + 0 0 1 0 0 1 1 1 + + + From d1343b5665c0bd3c5d8b8013cef9d555fd2830c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Thu, 6 May 2021 00:10:21 +0200 Subject: [PATCH 030/174] Update README.md --- extensions/gsXBraid/README.md | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/extensions/gsXBraid/README.md b/extensions/gsXBraid/README.md index a31f71147f..cc0c8ea876 100644 --- a/extensions/gsXBraid/README.md +++ b/extensions/gsXBraid/README.md @@ -62,15 +62,23 @@ The file ```xbraid_heatEquation_example.cpp``` illustrates the basic usage of th ``` 2. Execution (MPI-only mode) ```bash - mpirun -np ./bin/xbraid_heatEquation_example -n 250 -r 6 -i 3 + mpirun -np --hostfile ./bin/xbraid_heatEquation_example -n 250 -r 6 -i 3 ``` This will solve the two-dimensional heat equation on a unit square with 250 time steps in the time interval [0, 0.1] using - MPI processes. The spatial domain is 6 times regularly refined in - space (h-refinement) and the approximation order is increased 3 - times (p-refinement). Order elevation instead of order increase - can be achieved by replacing `-i` by `-e`. + MPI processes. 
The `hostfile` should have the following structure + + ```text + node0 slots=#slots max_slots=#maximum slots + node1 slots=#slots max_slots=#maximum slots + ... + ``` + + The spatial domain is 6 times regularly refined in space (h-refinement) + and the approximation order is increased 3 times (p-refinement). + Order elevation instead of order increase can be achieved by replacing + the switch`-i` by `-e`. For a complete list of command-line argument run ```bash @@ -87,18 +95,8 @@ The file ```xbraid_heatEquation_example.cpp``` illustrates the basic usage of th 4. Execution (MPI-OpenMP mode) ```bash - OMP_NUM_THREADS= mpirun -np ./bin/xbraid_heatEquation_example -n 250 -r 6 -i 3 + mpirun -np --hostfile -x OMP_NUM_THREADS= ./bin/xbraid_heatEquation_example -n 250 -r 6 -i 3 ``` - This will solve the two-dimensional heat equation on a unit square - with 250 time steps in the time interval [0, 0.1] using - MPI processes and OpenMP threads per MPI process. - As before, the spatial domain is 6 times regularly refined in - space (h-refinement) and the approximation order is increased 3 - times (p-refinement). Order elevation instead of order increase - can be achieved by replacing `-i` by `-e`. - - For a complete list of command-line argument run - ```bash - ./bin/xbraid_heatEquation_example -h - ``` + The additional parameter `-x OMP_NUM_THREADS=` ensures that + each MPI process executes `NTHREAD` OpenMP threads in parallel. 
From 83c610f792988f20ab9108dbee4f8f8ec6a49612 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Thu, 6 May 2021 14:56:41 +0200 Subject: [PATCH 031/174] Update README.md --- extensions/gsXBraid/README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/extensions/gsXBraid/README.md b/extensions/gsXBraid/README.md index cc0c8ea876..b64311843c 100644 --- a/extensions/gsXBraid/README.md +++ b/extensions/gsXBraid/README.md @@ -54,13 +54,16 @@ __Usage example__ The file ```xbraid_heatEquation_example.cpp``` illustrates the basic usage of the gsXBraid extension. 1. Configuration and compilation (MPI-only mode) + ```bash mkdir build cd build cmake .. -DGISMO_WITH_XBRAID=ON -DGISMO_WITH_MPI=ON make xbraid_heatEquation_example -j4 ``` + 2. Execution (MPI-only mode) + ```bash mpirun -np <NPROC> --hostfile <hostfile> ./bin/xbraid_heatEquation_example -n 250 -r 6 -i 3 ``` @@ -86,6 +89,7 @@ The file ```xbraid_heatEquation_example.cpp``` illustrates the basic usage of th ``` 3. Configuration and compilation (MPI-OpenMP mode) + ```bash mkdir build cd build @@ -94,9 +98,16 @@ The file ```xbraid_heatEquation_example.cpp``` illustrates the basic usage of th ``` 4. Execution (MPI-OpenMP mode) + ```bash mpirun -np <NPROC> --hostfile <hostfile> -x OMP_NUM_THREADS=<NTHREAD> ./bin/xbraid_heatEquation_example -n 250 -r 6 -i 3 ``` The additional parameter `-x OMP_NUM_THREADS=<NTHREAD>` ensures that - each MPI process executes `NTHREAD` OpenMP threads in parallel. + each MPI process executes `NTHREAD` OpenMP threads in parallel. The `-x` + flag is not supported by all MPI implementations. 
If it does not work + try + + ```bash + mpirun -np --hostfile -env OMP_NUM_THREADS ./bin/xbraid_heatEquation_example -n 250 -r 6 -i 3 + ``` From 385053a7928a8ed283a1dbd59fa841e4224d0eb5 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 6 May 2021 16:15:43 +0200 Subject: [PATCH 032/174] Updated XBraid example to use XML file for solver configuration --- .../gsXBraid/examples/gsXBraidMultigrid.h | 1129 +++++++++++++++++ .../examples/xbraid_heatEquation_example.cpp | 174 ++- .../filedata/pde/heat2d_square_ibvp.xml | 69 - .../filedata/pde/heat2d_square_ibvp1.xml | 110 ++ .../filedata/pde/heat2d_square_ibvp2.xml | 108 ++ .../IterativeLinearSolvers/IncompleteLUT.h | 7 +- 6 files changed, 1425 insertions(+), 172 deletions(-) create mode 100644 extensions/gsXBraid/examples/gsXBraidMultigrid.h delete mode 100644 extensions/gsXBraid/filedata/pde/heat2d_square_ibvp.xml create mode 100644 extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml create mode 100644 extensions/gsXBraid/filedata/pde/heat2d_square_ibvp2.xml diff --git a/extensions/gsXBraid/examples/gsXBraidMultigrid.h b/extensions/gsXBraid/examples/gsXBraidMultigrid.h new file mode 100644 index 0000000000..f4b006a3ae --- /dev/null +++ b/extensions/gsXBraid/examples/gsXBraidMultigrid.h @@ -0,0 +1,1129 @@ +#include +#include + +namespace gismo { + +/** @brief The p-multigrid base class provides the basic + * methods (smoothing, prolongation, restriction) for + * implementing p-multigrid methods + */ + +template +struct gsXBraidMultigridBase +{ + +public: + + /// @brief Apply p-multigrid solver to given right-hand side on level l + virtual void solve(const gsMatrix & rhs, + std::vector > > m_basis, + gsMatrix& x, + const int& numLevels, + const int& numCoarsening, + const int& numRefine, + const int& numSmoothing, + int& numCoarseCycles, + const int& typeCycle_p, + int& typeCycle_h, + const int& typeSolver, + const int& typeBCHandling, + gsBoundaryConditions bcInfo, + gsMultiPatch<> mp, + gsGeometry<>::Ptr 
geo, + const int& typeLumping, + const int& typeProjection, + const int& typeSmoother, + std::vector >& m_prolongation_P, + std::vector >& m_restriction_P, + std::vector >& m_prolongation_M, + std::vector >& m_restriction_M, + std::vector >& m_prolongation_H, + std::vector >& m_restriction_H, + const gsMatrix<>& hp) + { + if( numLevels == 1) + { + solvecoarse(rhs, x, numLevels); + return; + } + + + if(hp(std::max(numLevels-2,0),0) == 0 ) + { + gsMatrix fineRes, coarseRes, fineCorr, coarseCorr, postRes; + presmoothing(rhs, x, numLevels, numSmoothing, fineRes, numRefine, typeSmoother,hp); + restriction(fineRes, coarseRes, numLevels, numCoarsening, m_basis, typeLumping, + typeBCHandling, bcInfo, mp, geo, typeProjection, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + //coarseRes.setZero(coarseRes.rows(),1); + coarseCorr.setZero(coarseRes.rows(),1); + for( int j = 0 ; j < (typeCycle_p == 2 ? 2 : 1) ; j++) + { + solve(coarseRes, m_basis, coarseCorr, numLevels-1, numCoarsening, numRefine, + numSmoothing, numCoarseCycles, typeCycle_p, typeCycle_h, typeSolver, + typeBCHandling, bcInfo, mp, geo, typeLumping, typeProjection, typeSmoother, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + } + prolongation(coarseCorr, fineCorr, numLevels, numCoarsening, m_basis, typeLumping, + typeBCHandling, bcInfo, mp, geo, typeProjection, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + postsmoothing(rhs, x, numLevels, numSmoothing, fineCorr, postRes, typeSolver, + numRefine, typeSmoother, hp); + } + + if(hp(std::max(numLevels-2,0),0) == 1 ) + { + gsMatrix fineRes, coarseRes, fineCorr, coarseCorr, postRes; + presmoothing(rhs, x, numLevels, numSmoothing, fineRes, numRefine, typeSmoother, hp); + restriction(fineRes, coarseRes, numLevels, numCoarsening, m_basis, typeLumping, + 
typeBCHandling, bcInfo, mp, geo, typeProjection, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + //coarseRes.setZero(coarseRes.rows(),1); + coarseCorr.setZero(coarseRes.rows(),1); + for( int i = 0 ; i < (typeCycle_h == 2 ? 2 : 1) ; i++) + { + solve(coarseRes, m_basis, coarseCorr, numLevels-1, numCoarsening, numRefine, + numSmoothing, numCoarseCycles, typeCycle_p, typeCycle_h, typeSolver, + typeBCHandling, bcInfo, mp, geo, typeLumping, typeProjection, typeSmoother, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + } + prolongation(coarseCorr, fineCorr, numLevels, numCoarsening, m_basis, typeLumping, + typeBCHandling, bcInfo, mp, geo, typeProjection, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + postsmoothing(rhs,x, numLevels, numSmoothing, fineCorr, postRes, typeSolver, + numRefine, typeSmoother, hp); + } + } + + /// @brief Setup p-multigrid to given linear system + virtual void setup(const gsMatrix & rhs, + std::vector > > m_basis, + gsMatrix& x, + const int& numLevels, + const int& numCoarsening, + const int& numRefine, + const int& numSmoothing, + int& numCoarseCycles, + const int& typeCycle_p, + const int& typeCycle_h, + const int& typeSolver, + const int& typeBCHandling, + gsBoundaryConditions bcInfo, + gsMultiPatch<> mp, + gsGeometry<>::Ptr geo, + const int& typeLumping, + const int& typeProjection, + const int& typeSmoother, + std::vector >& m_prolongation_P, + std::vector >& m_restriction_P, + std::vector >& m_prolongation_M, + std::vector >& m_restriction_M, + std::vector >& m_prolongation_H, + std::vector >& m_restriction_H, + const gsMatrix<>& hp) {} + + /// @brief Apply fixed number of smoothing steps (pure virtual method) + virtual void presmoothing(const gsMatrix& rhs, + gsMatrix& x, + const int& numLevels, + const int& numSmoothing, + gsMatrix & 
fineRes , + const int& numRefine, + const int& typeSmoother, + const gsMatrix<>& hp) = 0; + + /// @brief Apply fixed number of smoothing steps (pure virtual method) + virtual void postsmoothing(const gsMatrix& rhs, + gsMatrix& x, + const int& numLevels, + const int& numSmoothing, + gsMatrix & fineCorr, + gsMatrix & postRes, + const int& typeSolver, + const int& numRefine, + const int& typeSmoother, + const gsMatrix<>& hp) = 0; + + /// @brief Apply coarse solver (pure virtual method) + virtual void solvecoarse(const gsMatrix& rhs, + gsMatrix& x, + const int& numLevels) = 0; + + /// @brief Prolongate coarse space function to fine space + virtual gsSparseMatrix prolongation_P(const int& numLevels, + std::vector > > m_basis, + const int& typeLumping, + const int& typeBCHandling, + gsGeometry<>::Ptr geo, + const int& typeProjection) = 0; + + /// @brief Prolongate coarse space function to fine space + virtual gsSparseMatrix restriction_P(const int& numLevels, + std::vector > > m_basis, + const int& typeLumping, + const int& typeBCHandling, + gsGeometry<>::Ptr geo, + const int& typeProjection) = 0; + + /// @brief Prolongate coarse space function to fine space + virtual gsMatrix prolongation_M(const int& numLevels, + std::vector > > m_basis, + const int& typeLumping, + const int& typeBCHandling, + gsGeometry<>::Ptr geo, + const int& typeProjection) = 0; + + /// @brief Prolongate coarse space function to fine space + virtual gsMatrix restriction_M(const int& numLevels, + std::vector > > m_basis, + const int& typeLumping, + const int& typeBCHandling, + gsGeometry<>::Ptr geo, + const int& typeProjection) = 0; + + /// @brief Prolongate coarse space function to fine space + virtual void prolongation(const gsMatrix& Xcoarse, + gsMatrix& Xfine, + const int& numLevels, + const int& numCoarsening, + std::vector > > m_basis, + const int& typeLumping, + const int& typeBCHandling, + gsBoundaryConditions bcInfo, + gsMultiPatch<> mp, + gsGeometry<>::Ptr geo, + const int& typeProjection, 
+ std::vector >& m_prolongation_P, + std::vector >& m_restriction_P, + std::vector >& m_prolongation_M, + std::vector >& m_restriction_M, + std::vector >& m_prolongation_H, + std::vector >& m_restriction_H, + const gsMatrix<>& hp) + { + if(hp(numLevels-2,0) == 1) + { + Xfine = m_prolongation_H[numLevels-2]*Xcoarse; + } + else + { + if(typeLumping == 1) + { + gsVector<> temp = m_prolongation_P[numLevels-2]*Xcoarse; + gsMatrix<> M_L_inv = (m_prolongation_M[numLevels-2]).array().inverse(); + Xfine = (M_L_inv).cwiseProduct(temp); + } + else + { + // Define the low and high order basis + gsMultiBasis<> basisL = *m_basis[numLevels-2]; + gsMultiBasis<> basisH = *m_basis[numLevels-1]; + typedef gsExprAssembler::geometryMap geometryMap; + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; + + // Determine matrix M (high_order * high_order) + gsExprAssembler ex2(1,1); + geometryMap G2 = ex2.getMap(mp); + space w_n = ex2.getSpace(basisH ,1, 0); + w_n.setInterfaceCont(0); + if(typeBCHandling == 1) + { + w_n.addBc(bcInfo.get("Dirichlet")); + } + ex2.setIntegrationElements(basisH); + ex2.initSystem(); + ex2.assemble(w_n * meas(G2) * w_n.tr()); + + // Prolongate Xcoarse to Xfine + gsVector<> temp = m_prolongation_P[numLevels-2]*Xcoarse; + gsSparseMatrix<> M = ex2.matrix(); + gsConjugateGradient<> CGSolver(M); + CGSolver.setTolerance(1e-12); + CGSolver.solve(temp,Xfine); + } + } + } + + /// @brief Restrict fine space function to coarse space + virtual void restriction(const gsMatrix& Xfine, + gsMatrix& Xcoarse, + const int& numLevels, + const int& numCoarsening, + std::vector > > m_basis, + const int& typeLumping, + const int& typeBCHandling, + gsBoundaryConditions bcInfo, + gsMultiPatch<> mp, + gsGeometry<>::Ptr geo, + const int& typeProjection, + std::vector >& m_prolongation_P, + std::vector >& m_restriction_P, + std::vector >& m_prolongation_M, + std::vector >& m_restriction_M, + std::vector >& m_prolongation_H, + std::vector >& 
m_restriction_H, + const gsMatrix<>& hp) + { + if(hp(numLevels-2,0) == 1) + { + Xcoarse = m_restriction_H[numLevels-2]*Xfine; + } + else + { + if(typeLumping == 1) + { + // Standard way + gsVector<> temp = m_restriction_P[numLevels-2]*Xfine; + gsMatrix<> M_L_inv = (m_restriction_M[numLevels-2]).array().inverse(); + Xcoarse = (M_L_inv).cwiseProduct(temp); + } + else + { + // Define the low and high order basis + gsMultiBasis<> basisL = *m_basis[numLevels-2]; + gsMultiBasis<> basisH = *m_basis[numLevels-1]; + typedef gsExprAssembler::geometryMap geometryMap; + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; + + // Determine matrix M (low_order * low_order) + gsExprAssembler ex2(1,1); + geometryMap G2 = ex2.getMap(mp); + space w_n = ex2.getSpace(basisL, 1, 0); + w_n.setInterfaceCont(0); + if(typeBCHandling == 1) + { + w_n.addBc(bcInfo.get("Dirichlet")); + } + ex2.setIntegrationElements(basisL); + ex2.initSystem(); + ex2.assemble(w_n * meas(G2) * w_n.tr()); + + // Restrict Xfine to Xcoarse + gsMatrix<> temp = m_restriction_P[numLevels-2]*Xfine; + gsSparseMatrix<> M = ex2.matrix(); + gsConjugateGradient<> CGSolver(M); + CGSolver.setTolerance(1e-12); + CGSolver.solve(temp, Xcoarse); + } + } + } +}; + +/** @brief The p-multigrid class implements a generic p-multigrid solver + * that can be customized by passing assembler and coarse + * solver as template arguments. + * + * @note: This implementation assumes that all required prolongation/ + * restriction operators are generated internally. Therefore, a + * problem-specific assembler has to be passed as template argument. 
+ */ +template +struct gsXBraidMultigrid : public gsXBraidMultigridBase +{ +private: + + /// Base class type + typedef gsXBraidMultigridBase Base; + + /// Shared pointer to multi-patch geometry + memory::shared_ptr > m_mp_ptr; + + /// Shared pointer to boundary conditions + memory::shared_ptr > m_bcInfo_ptr; + + /// std::vector of multi-basis objects + std::vector > > m_basis; + + /// std::vector of prolongation operators + std::vector< gsSparseMatrix > m_prolongation_P; + + /// std::vector of restriction operators + std::vector< gsSparseMatrix > m_restriction_P; + + /// std::vector of prolongation operators + std::vector< gsMatrix > m_prolongation_M; + + /// std::vector of restriction operators + std::vector< gsMatrix > m_restriction_M; + + /// std::vector of prolongation operators + std::vector< gsSparseMatrix > m_prolongation_H; + + /// std::vector of restriction operators + std::vector< gsSparseMatrix > m_restriction_H; + + /// std::vector of factorized operators + std::vector< std::vector< gsSparseMatrix > > m_ILUT; + + /// std::vector of factorized operators + std::vector< std::vector < Eigen::PermutationMatrix > > m_P; + + /// std::vector of factorized operators + std::vector < std::vector < Eigen::PermutationMatrix > > m_Pinv; + + /// std::vector of SCM smoother object + std::vector< gsPreconditionerOp<>::Ptr > m_SCMS; + + /// std::vector of operator objects + std::vector< gsSparseMatrix > m_operator; + + /// std::vector of std::vector of block operator objects + std::vector < std::vector< gsSparseMatrix > > m_block_operator; + + /// std::vector of std::vector of block operator objects + std::vector < std::vector < gsSparseMatrix > > m_ddB; + + /// std::vector of std::vector of block operator objects + std::vector < std::vector < gsSparseMatrix > > m_ddC; + + /// std::vector of std::vector of block operator objects + std::vector < std::vector < gsMatrix > > m_ddBtilde; + + /// std::vector of std::vector of block operator objects + std::vector < std::vector 
< gsMatrix > > m_ddCtilde; + + /// std::vector of std::vector of block operator objects + std::vector < gsMatrix > m_A_aprox; + + /// std::vector of std::vector of block operator objects + std::vector < gsSparseMatrix > m_S; + + /// std::vector of std::vector of shift objects + std::vector < std::vector< int > > m_shift; + + /// std::vector of assembler objects + std::vector m_assembler; + +public: + + // Constructor + gsXBraidMultigrid(const gsMultiPatch & mp, + const gsMultiBasis & basis, + const gsBoundaryConditions & bcInfo) + { + m_mp_ptr = memory::make_shared_not_owned(&mp); + m_bcInfo_ptr = memory::make_shared_not_owned(&bcInfo); + m_basis.push_back(memory::make_shared_not_owned(&basis)); + } + +public: + + /// @brief Set-up p-multigrid solver + void setup(const gsFunctionExpr & rhs, + const gsFunctionExpr & sol_exact, + gsMatrix& x, + const int& numSmoothing, + gsMatrix f, + const int& typeSolver, + int& iterTot, + int& typeCycle_p, + int& typeCycle_h, + int numLevels, + const int& numCoarsening, + const int& numDegree, + const int& numRefine, + const int& numBenchmark, + const int& typeMultigrid, + const int& typeBCHandling, + gsGeometry<>::Ptr geo, + const int& typeLumping, + const gsMatrix<>& hp, + const int& typeProjection, + const int& typeSmoother, + const int& typeCoarseOperator, + const gsFunctionExpr<> coeff_diff, + const gsFunctionExpr<> coeff_conv, + const gsFunctionExpr<> coeff_reac) + { + for (int i = 1; i < numLevels; i++) + { + m_basis.push_back(give(m_basis.back()->clone())); + switch((int) hp(i-1,0) ) + { + case 0 : (typeProjection == 1 ? 
+ m_basis.back()->degreeIncrease(numDegree-1) : + m_basis.back()->degreeIncrease()); break; + + case 1 : m_basis.back()->uniformRefine(); break; + + case 2: m_basis.back()->uniformRefine(); + m_basis.back()->degreeIncrease(); break; + } + } + + // Generate sequence of assembler objects and assemble + for (typename std::vector > >::iterator it = m_basis.begin(); + it != m_basis.end(); ++it) + { + m_assembler.push_back(Assembler(*m_mp_ptr, + *(*it).get(), + *m_bcInfo_ptr, + rhs, + coeff_diff, + coeff_conv, + coeff_reac, + (typeBCHandling == 1 ? + dirichlet::elimination : + dirichlet::nitsche), + iFace::glue)); + } + + // Resize vector of operators + m_operator.resize(numLevels); + m_prolongation_P.resize(numLevels-1); + m_prolongation_M.resize(numLevels-1); + m_prolongation_H.resize(numLevels-1); + m_restriction_P.resize(numLevels-1); + m_restriction_M.resize(numLevels-1); + m_restriction_H.resize(numLevels-1); + + // Assemble operators at finest level + gsStopwatch clock; + gsInfo << "|| Multigrid hierarchy ||" <degree() << ", Ndof: " << m_basis[i]->totalSize() <degree() <totalSize() < transferMatrix; + gsOptionList options; + typeBCHandling == 1 ? 
options.addInt("DirichletStrategy","",dirichlet::elimination) : options.addInt("DirichletStrategy","",dirichlet::nitsche); + for(int i = 1; i < numLevels; i++) + { + if(hp(i-1,0) == 1) + { + gsMultiBasis m_basis_copy = *m_basis[i]; + m_basis_copy.uniformCoarsen_withTransfer(transferMatrix,*m_bcInfo_ptr,options); + m_prolongation_H[i-1] = transferMatrix; + m_restriction_H[i-1] = m_prolongation_H[i-1].transpose(); + } + } + real_t Time_Transfer = clock.stop(); + + // Obtain operators with Galerkin projection + clock.restart(); + if(typeCoarseOperator == 2) + { + for (int i = numLevels-1; i > -1; i--) + { + if(hp(hp.rows()-1,0) == 0) + { + if(hp(std::min(i,hp.rows()-1),0) == 1) + { + m_operator[i] = m_restriction_H[i]*m_operator[i+1]*m_prolongation_H[i]; + } + } + else + { + if(hp(std::min(i,hp.rows()-1),0) == 1 && i > 0) + { + m_operator[i-1] = m_restriction_H[i-1]*m_operator[i]*m_prolongation_H[i-1]; + } + } + } + } + real_t Time_Assembly_Galerkin = clock.stop(); + + + // Setting up the subspace corrected mass smoother + clock.restart(); + if(typeSmoother == 3) + { + // Generate sequence of SCM smoothers + m_SCMS.resize(numLevels); + gsOptionList opt; + opt.addReal("Scaling","",0.12); + for(int i = 0 ; i < numLevels ; i++) + { + m_SCMS[i] = setupSubspaceCorrectedMassSmoother(m_operator[i], *m_basis[i], *m_bcInfo_ptr, opt, typeBCHandling); + } + } + real_t Time_SCMS = clock.stop(); + + // Determine ILUT factorizations at each level + clock.restart(); + int numPatch = m_mp_ptr->nPatches(); + + if(typeSmoother == 1) + { + // Generate factorizations (ILUT) + m_ILUT.resize(numLevels); + m_P.resize(numLevels); + m_Pinv.resize(numLevels); + for(int i = 0; i < numLevels; i++) + { + m_ILUT[i].resize(1); + m_P[i].resize(1); + m_Pinv[i].resize(1); + if(typeProjection == 2) + { + Eigen::IncompleteLUT ilu; + ilu.setFillfactor(1); + ilu.compute(m_operator[i]); + m_ILUT[i][0] = ilu.m_lu; + m_P[i][0] = ilu.m_P; + m_Pinv[i][0] = ilu.m_Pinv; + } + else + { + if(i == numLevels-1) // 
Only at finest level + { + Eigen::IncompleteLUT ilu; + ilu.setFillfactor(1); + ilu.compute(m_operator[i]); + m_ILUT[i][0] = ilu.m_lu; + m_P[i][0] = ilu.m_P; + m_Pinv[i][0] = ilu.m_Pinv; + } + } + } + } + real_t Time_ILUT_Factorization = clock.stop(); + clock.restart(); + if(typeSmoother == 5) + { + int shift0 = 0; + m_ddB.resize(numLevels); + m_ddC.resize(numLevels); + m_ddBtilde.resize(numLevels); + m_ddCtilde.resize(numLevels); + + m_ILUT.resize(numLevels); + m_P.resize(numLevels); + m_Pinv.resize(numLevels); + m_shift.resize(numLevels); + m_S.resize(numLevels); + + for(int i = 0 ; i < numLevels ; i++) + { + m_shift[i].resize(numPatch+1); + m_ILUT[i].resize(numPatch+1); + m_P[i].resize(numPatch+1); + m_Pinv[i].resize(numPatch+1); + + // Use of partition functions + std::vector > interior, boundary; + std::vector > > interface; + std::vector > global_interior, global_boundary; + std::vector > > global_interface; + //m_basis[i]->partition(interior,boundary,interface,global_interior,global_boundary,global_interface); + for(int l=0; l< numPatch; l++) + { + m_shift[i][l] = global_interior[l].rows(); + } + m_shift[i][numPatch] = 0; + m_shift[i][numPatch] = m_operator[i].rows() - accumulate(m_shift[i].begin(),m_shift[i].end(),0); + + // Put shift on zero + shift0 = 0; + for(int j = 0 ; j < numPatch ; j++) + { + const gsSparseMatrix<> block = m_operator[i].block(shift0,shift0,m_shift[i][j],m_shift[i][j]); + Eigen::IncompleteLUT ilu; + ilu.setFillfactor(1); + ilu.compute(block); + m_ILUT[i][j] = ilu.m_lu; + + m_P[i][j] = ilu.m_P; + m_Pinv[i][j] = ilu.m_Pinv; + shift0 = shift0 + m_shift[i][j]; + + } + + shift0 = 0; + // Obtain the blocks of the matrix + m_ddB[i].resize(numPatch+1); + m_ddC[i].resize(numPatch+1); + + for(int j = 0 ; j < numPatch+1 ; j++) + { + m_ddB[i][j] = m_operator[i].block(m_operator[i].rows()-m_shift[i][numPatch],shift0,m_shift[i][numPatch],m_shift[i][j]); + m_ddC[i][j] = 
m_operator[i].block(shift0,m_operator[i].cols()-m_shift[i][numPatch],m_shift[i][j],m_shift[i][numPatch]); + shift0 = shift0 + m_shift[i][j]; + } + shift0 = 0; + } + + m_A_aprox.resize(numLevels); + for(int i = 0 ; i < numLevels ; i++) + { + // Define the A_aprox matrix + m_A_aprox[i] = gsSparseMatrix<>(m_operator[i].rows(),m_operator[i].cols()); + + // Retrieve a block of each patch + for(int k=0; k< numPatch; k++) + { + m_A_aprox[i].block(shift0,shift0,m_shift[i][k],m_shift[i][k]) = m_ILUT[i][k]; + shift0 = shift0 + m_shift[i][k]; + } + shift0 = 0; + m_ddBtilde[i].resize(numPatch); + m_ddCtilde[i].resize(numPatch); + + for(int j=0 ; j < numPatch ; j ++) + { + m_ddBtilde[i][j] = gsSparseMatrix<>(m_shift[i][j],m_shift[i][numPatch]); + m_ddCtilde[i][j] = gsSparseMatrix<>(m_shift[i][j],m_shift[i][numPatch]); + for(int k=0 ; k < m_shift[i][numPatch]; k++) + { + gsMatrix<> Brhs = m_ddC[i][j].col(k); + gsMatrix<> Crhs = m_ddC[i][j].col(k); + m_ddBtilde[i][j].col(k) = m_ILUT[i][j].template triangularView().transpose().solve(Brhs); + m_ddCtilde[i][j].col(k) = m_ILUT[i][j].template triangularView().solve(Crhs); + } + } + + // Define matrix S + m_S[i] = m_ddC[i][numPatch]; + for(int l = 0 ; l < numPatch ; l++) + { + m_S[i] = m_S[i] - m_ddBtilde[i][l].transpose()*m_ddCtilde[i][l]; + } + + // Fill matrix A_aprox + for(int m = 0 ; m < numPatch ; m++) + { + m_A_aprox[i].block(shift0,m_A_aprox[i].rows() - m_shift[i][numPatch],m_shift[i][m],m_shift[i][numPatch]) = m_ddCtilde[i][m]; + m_A_aprox[i].block(m_A_aprox[i].rows() - m_shift[i][numPatch],shift0,m_shift[i][numPatch],m_shift[i][m]) = m_ddBtilde[i][m].transpose(); + shift0 = shift0 + m_shift[i][m]; + } + shift0 = 0; + + // Preform ILUT on the S-matrix! 
+ Eigen::IncompleteLUT ilu; + ilu.setFillfactor(1); + gsSparseMatrix<> II = m_S[i]; + ilu.compute(II); + m_A_aprox[i].block(m_A_aprox[i].rows() - m_shift[i][numPatch],m_A_aprox[i].rows() - m_shift[i][numPatch],m_shift[i][numPatch],m_shift[i][numPatch]) = ilu.m_lu; + } + } + + real_t Time_Block_ILUT_Factorization = clock.stop(); + gsInfo << "\n|| Setup Timings || " < & rhs, + const gsFunctionExpr & sol_exact, + gsMatrix& x, + const int& numSmoothing, + gsMatrix f, + const int& typeSolver, + int& iterTot, + int& typeCycle_p, + int& typeCycle_h, + int numLevels, + const int& numCoarsening, + const int& numDegree, + const int& numRefine, + const int& numBenchmark, + const int& typeMultigrid, + const int& typeBCHandling, + gsGeometry<>::Ptr geo, + const int& typeLumping, + const gsMatrix<>& hp, + const int& typeProjection, + const int& typeSmoother, + const int& typeCoarseOperator) + { + gsStopwatch clock; + + if(typeSolver == 1) + { + x = gsMatrix<>::Random(m_operator[numLevels-1].rows(),1); + } + + gsMatrix<> b; + typeSolver == 1 ? b = m_assembler.back().rhs() : b = f; + + + // Determine residual and L2 error + real_t r0 = (m_operator[numLevels-1]*x - b).norm(); + real_t r = r0; + real_t tol = 1e-8; + int iter = 1; + int numCoarseCycles = 0; + + // Solve with p-multigrid method + real_t r_old = r0; + clock.restart(); + while( (typeSolver == 1 || typeSolver == 5) ? r/r0 > tol && iter < 100000 : iter < 2) + { + // Call solver from base class + Base::solve(b, m_basis, x, numLevels, numCoarsening, numRefine, numSmoothing, numCoarseCycles, + typeCycle_p, typeCycle_h, typeSolver, typeBCHandling, *m_bcInfo_ptr, *m_mp_ptr, geo, + typeLumping, typeProjection, typeSmoother, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + numCoarseCycles = 0; + r = (m_operator[numLevels-1]*x - b).norm(); + if( r_old < r) + { + gsInfo << "Residual increased during solving!!! 
" < solMG = m_assembler.back().constructSolution(x); + // gsNormL2 L2Norm(solMG,sol_exact); + // real_t errorL2 = L2Norm.compute(); + // gsInfo << "Residual after solving: " << r <(solMG, "Multigrid_solution", 100*x.rows()); + // gsField<> Exact( *m_mp_ptr, sol_exact, false ); + // gsWriteParaview<>( Exact, "Exact_solution", 100*x.rows()); + } + } + +private: + + /// @brief Apply coarse solver + virtual void solvecoarse(const gsMatrix& rhs, + gsMatrix& x, + const int& numLevels) + { + gsInfo << "Coarse solver is applied! " < prolongation_M(const int& numLevels, + std::vector > > m_basis, + const int& typeLumping, + const int& typeBCHandling, + gsGeometry<>::Ptr geo, + const int& typeProjection) + { + // Define the low and high order basis + gsMultiBasis<> basisL = *m_basis[numLevels-2]; + gsMultiBasis<> basisH = *m_basis[numLevels-1]; + + // Determine matrix M (high_order * high_order) + typedef gsExprAssembler::geometryMap geometryMap; + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; + gsExprAssembler ex2(1,1); + geometryMap G2 = ex2.getMap(*m_mp_ptr); + space w_n = ex2.getSpace(basisH ,1, 0); + w_n.setInterfaceCont(0); + if(typeBCHandling == 1) + { + w_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + } + ex2.setIntegrationElements(basisH); + ex2.initSystem(); + ex2.assemble(w_n * meas(G2) ); + return ex2.rhs(); + } + + /// @brief Construct prolongation operator at level numLevels + virtual gsSparseMatrix prolongation_P(const int& numLevels, + std::vector > > m_basis, + const int& typeLumping, + const int& typeBCHandling, + gsGeometry<>::Ptr geo, + const int& typeProjection) + { + // Define the low and high order basis + gsMultiBasis<> basisL = *m_basis[numLevels-2]; + gsMultiBasis<> basisH = *m_basis[numLevels-1]; + + // Determine matrix P (high_order * low_order) + typedef gsExprAssembler::geometryMap geometryMap; + gsExprAssembler ex(1,1); + geometryMap G = ex.getMap(*m_mp_ptr); + typedef gsExprAssembler::variable variable; + 
typedef gsExprAssembler::space space; + space v_n = ex.getSpace(basisH ,1, 0); + v_n.setInterfaceCont(0); + space u_n = ex.getTestSpace(v_n , basisL); + u_n.setInterfaceCont(0); + if(typeBCHandling == 1) + { + v_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + u_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + } + ex.setIntegrationElements(basisH); + ex.initSystem(); + ex.assemble(u_n*meas(G) * v_n.tr()); + gsSparseMatrix<> P = ex.matrix().transpose(); + return P; + } + + /// @brief Construct restriction operator at level numLevels + virtual gsMatrix restriction_M(const int& numLevels, + std::vector > > m_basis, + const int& typeLumping, + const int& typeBCHandling, + gsGeometry<>::Ptr geo, + const int& typeProjection) + { + // Define the low and high order basis + gsMultiBasis<> basisL = *m_basis[numLevels-2]; + gsMultiBasis<> basisH = *m_basis[numLevels-1]; + + // Determine matrix M (low_order * low_order) + typedef gsExprAssembler::geometryMap geometryMap; + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; + gsExprAssembler ex2(1,1); + geometryMap G2 = ex2.getMap(*m_mp_ptr); + space w_n = ex2.getSpace(basisL ,1, 0); + w_n.setInterfaceCont(0); + if(typeBCHandling == 1) + { + w_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + } + ex2.setIntegrationElements(basisL); + ex2.initSystem(); + ex2.assemble(w_n * meas(G2) ); + return ex2.rhs(); + } + + /// @brief Construct restriction operator at level numLevels + virtual gsSparseMatrix restriction_P(const int& numLevels, + std::vector > > m_basis, + const int& typeLumping, + const int& typeBCHandling, + gsGeometry<>::Ptr geo, + const int& typeProjection) + { + // Define the low and high order basis + gsMultiBasis<> basisL = *m_basis[numLevels-2]; + gsMultiBasis<> basisH = *m_basis[numLevels-1]; + + // Determine matrix P (high_order * low_order) + gsExprAssembler ex(1,1); + typedef gsExprAssembler::geometryMap geometryMap; + geometryMap G = ex.getMap(*m_mp_ptr); + + typedef gsExprAssembler::variable 
variable; + typedef gsExprAssembler::space space; + space v_n = ex.getSpace(basisH ,1, 0); + v_n.setInterfaceCont(0); + space u_n = ex.getTestSpace(v_n , basisL); + u_n.setInterfaceCont(0); + if( typeBCHandling == 1) + { + u_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + v_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + } + ex.setIntegrationElements(basisH); + ex.initSystem(); + ex.assemble(u_n * meas(G)* v_n.tr()); + gsSparseMatrix<> P = ex.matrix(); + return P; + } + + /// @brief Apply fixed number of presmoothing steps + virtual void presmoothing(const gsMatrix& rhs, + gsMatrix& x, + const int& numLevels, + const int& numSmoothing, + gsMatrix & fineRes, + const int& numRefine, + const int& typeSmoother, + const gsMatrix<>& hp) + { + gsInfo << "Residual before presmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels < e; + gsMatrix<> d = rhs-m_operator[numLevels-1]*x; + e = m_Pinv[numLevels-1][0]*d; + e = m_ILUT[numLevels-1][0].template triangularView().solve(e); + e = m_ILUT[numLevels-1][0].template triangularView().solve(e); + e = m_P[numLevels-1][0]*e; + x = x + e; + } + } + if(typeSmoother == 2) + { + internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs); + } + if(typeSmoother == 3) + { + m_SCMS[numLevels-1]->step(rhs,x); + } + if(typeSmoother == 5) + { + if(hp(numLevels-2,0) == 1 && hp(hp.rows()-1,0) == 0) + { + internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs); + } + else + { + gsMatrix<> e; + gsMatrix<> d = rhs-m_operator[numLevels-1]*x; + e = m_A_aprox[numLevels-1].template triangularView().solve(d); + e = m_A_aprox[numLevels-1].template triangularView().solve(e); + x = x + e; + } + } + } + // gsInfo << "Residual after presmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels <& rhs, + gsMatrix& x, + const int& numLevels, + const int& numSmoothing, + gsMatrix & fineCorr, + gsMatrix & postRes, + const int& typeSolver, + const int& numRefine, + const int& typeSmoother, + const gsMatrix<>& hp) + { 
+ real_t alpha = 1; + x = x - alpha*fineCorr; + gsInfo << "Residual before postsmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels < e; + gsMatrix<> d = rhs-m_operator[numLevels-1]*x; + e = m_Pinv[numLevels-1][0]*d; + e = m_ILUT[numLevels-1][0].template triangularView().solve(e); + e = m_ILUT[numLevels-1][0].template triangularView().solve(e); + e = m_P[numLevels-1][0]*e; + x = x + e; + } + } + if(typeSmoother == 2) + { + ( typeSolver == 3 ? internal::reverseGaussSeidelSweep(m_operator[numLevels-1],x,rhs) : internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs)); + } + if(typeSmoother == 3) + { + m_SCMS[numLevels-1]->step(rhs,x); + } + if(typeSmoother == 5) + { + if(hp(numLevels-2,0) == 1 && hp(hp.rows()-1,0) == 0) + { + ( typeSolver == 3 ? internal::reverseGaussSeidelSweep(m_operator[numLevels-1],x,rhs) : internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs)); + } + else + { + gsMatrix<> e; + gsMatrix<> d = rhs-m_operator[numLevels-1]*x; + e = m_A_aprox[numLevels-1].template triangularView().solve(d); + e = m_A_aprox[numLevels-1].template triangularView().solve(e); + x = x + e; + } + } + postRes = rhs - m_operator[numLevels-1]*x; + // gsInfo << "Residual after postsmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels < +struct gsXBraidMultigrid : public gsXBraidMultigridBase +{ + // Default constructor + gsXBraidMultigrid() + { + gsInfo << "The specific case"; + } +}; + +} // namespace gismo diff --git a/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp index b8afba7b80..7fe1d3108f 100644 --- a/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp +++ b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp @@ -13,6 +13,7 @@ #include #include +#include "gsXBraidMultigrid.h" using namespace gismo; @@ -22,11 +23,11 @@ namespace gismo { enum class gsXBraid_typeMethod { - FE_FE = 1, // forward Euler (all grids) - 
BE_BE = 2, // backward Euler (all grids) - CN_CN = 3, // Crank-Nicholson (all grids) - FE_BE = 4, // forward Euler (fine grid), backward Euler (coarser grids) - CN_BE = 5 // Crank-Nicholson (fine grid), backward Euler (coarser grids) + FE_FE = 0, // forward Euler (all grids) + BE_BE = 1, // backward Euler (all grids) + CN_CN = 2, // Crank-Nicholson (all grids) + FE_BE = 3, // forward Euler (fine grid), backward Euler (coarser grids) + CN_BE = 4 // Crank-Nicholson (fine grid), backward Euler (coarser grids) }; /** @@ -43,15 +44,15 @@ class gsXBraid_app : public gsXBraid< gsVector > index_t numSteps, typeMethod; T tstart, tstop, tstep; - // Spatial discretization + // Spatial discretizations gsMultiPatch mp; - gsMultiBasis bases; + gsMultiBasis basisH, basisL; // Boundary conditions gsBoundaryConditions bc; // Assembler options - gsOptionList Aopt; + gsOptionList Aopt, Sopt, Topt; // Expression assembler gsExprAssembler K, M; @@ -60,9 +61,14 @@ class gsXBraid_app : public gsXBraid< gsVector > // Solution gsVector sol; - // Solver - typedef typename gsSparseSolver<>::CGDiagonal solver; + // Single-grid solver + typedef typename gsSparseSolver::CGDiagonal solver; solver* m_solver; + + // Multigrid solver + typedef typename gsSparseSolver::LU lu; + gsXBraidMultigrid >* m_mgsolver; + gsMatrix hp; typedef typename gsExprAssembler::geometryMap geometryMap; typedef typename gsExprAssembler::variable variable; @@ -100,7 +106,8 @@ class gsXBraid_app : public gsXBraid< gsVector > if (this->id() == 0) gsInfo << "Loaded file " << fd.lastPath() << "\n"; fd.getId(0, mp); // id=0: Multipatch domain - bases = gsMultiBasis(mp); + basisH = gsMultiBasis(mp); + basisL = gsMultiBasis(mp); fd.getId(1, f); // id=1: right-hand side function if (this->id() == 0) gsInfo << "Source function " << f << "\n"; @@ -115,43 +122,83 @@ class gsXBraid_app : public gsXBraid< gsVector > if (this->id() == 0) gsInfo << "Manufactured solution:\n" << ms << "\n"; fd.getId(5, Aopt); // id=5: assembler options 
+ if (this->id() == 0) gsInfo << "Assembler options:\n" << Aopt << "\n"; K.setOptions(Aopt); M.setOptions(Aopt); + fd.getId(6, Topt); // id=6: multigrid-in-time options + if (this->id() == 0) gsInfo << "Multigrid-in-time options:\n" << Topt << "\n"; + + this->SetCFactor(Topt.getInt("CFactor")); + this->SetMaxIter(Topt.getInt("maxIter")); + this->SetMaxLevels(Topt.getInt("maxLevel")); + this->SetMaxRefinements(Topt.getInt("numMaxRef")); + this->SetMinCoarse(Topt.getInt("minCLevel")); + this->SetNFMG(Topt.getInt("numFMG")); + this->SetNFMGVcyc(Topt.getInt("numFMGVcyc")); + this->SetNRelax(Topt.getInt("numRelax")); + this->SetAccessLevel(Topt.getInt("access")); + this->SetPrintLevel(Topt.getInt("print")); + this->SetStorage(Topt.getInt("numStorage")); + this->SetTemporalNorm(Topt.getInt("norm")); + + if (Topt.getInt("tol") == 1) + this->SetAbsTol(Topt.getReal("absTol")); + else + this->SetRelTol(Topt.getReal("relTol")); + + if (Topt.getSwitch("fmg")) this->SetFMG(); + if (Topt.getSwitch("incrMaxLevels")) this->SetIncrMaxLevels(); + if (Topt.getSwitch("periodic")) this->SetPeriodic(1); else this->SetPeriodic(0); + if (Topt.getSwitch("refine")) this->SetRefine(1); else this->SetRefine(0); + if (Topt.getSwitch("sequential")) this->SetSeqSoln(1); else this->SetSeqSoln(0); + if (Topt.getSwitch("skip")) this->SetSkip(1); else this->SetSkip(0); + if (Topt.getSwitch("spatial")) this->SetSpatialCoarsenAndRefine(); + + fd.getId(7, Sopt); // id=6: spatial solver options + if (this->id() == 0) gsInfo << "Spatial solver options:\n" << Sopt << "\n"; + // Elevate and p-refine the basis to order k + numElevate - // where k is the highest degree in the bases + // where k is the highest degree in the basisH if ( numElevate > -1 ) { // Find maximum degree with respect to all the variables - int tmp = bases.maxDegree(0); + int tmp = basisH.maxDegree(0); for (short_t j = 1; j < mp.parDim(); ++j ) - if ( tmp < bases.maxDegree(j) ) - tmp = bases.maxDegree(j); + if ( tmp < 
basisH.maxDegree(j) ) + tmp = basisH.maxDegree(j); // Elevate all degrees uniformly tmp += numElevate; - bases.setDegree(tmp); + basisH.setDegree(tmp); + basisL.setDegree(tmp); } // Increase and p-refine the basis if (numIncrease > 0) - bases.degreeIncrease(numIncrease); - + { + basisH.degreeIncrease(numIncrease); + basisL.degreeIncrease(numIncrease); + } + // h-refine the basis for (int i = 0; i < numRefine; ++i) - bases.uniformRefine(); + { + basisH.uniformRefine(); + basisL.uniformRefine(); + } // Set the basis - K.setIntegrationElements(bases); - M.setIntegrationElements(bases); + K.setIntegrationElements(basisH); + M.setIntegrationElements(basisH); // Set the geometry map geometryMap G_K = K.getMap(mp); geometryMap G_M = M.getMap(mp); // Set the discretization space - space u_K = K.getSpace(bases); - space u_M = M.getSpace(bases); + space u_K = K.getSpace(basisH); + space u_M = M.getSpace(basisH); u_K.setInterfaceCont(0); u_M.setInterfaceCont(0); u_K.addBc( bc.get("Dirichlet") ); @@ -245,7 +292,7 @@ class gsXBraid_app : public gsXBraid< gsVector > char** argv) { // Problem parameters - std::string fn(XBRAID_DATA_DIR"pde/heat2d_square_ibvp.xml"); + std::string fn(XBRAID_DATA_DIR"pde/heat2d_square_ibvp1.xml"); // Spatial discretisation parameters index_t numRefine = 2; @@ -254,33 +301,9 @@ class gsXBraid_app : public gsXBraid< gsVector > // Temporal discretisation parameters index_t numSteps = 40; - index_t typeMethod = (index_t)gsXBraid_typeMethod::CN_CN; + index_t typeMethod = (index_t)gsXBraid_typeMethod::CN_BE; T tfinal = 0.1; - // Parallel-in-time multigrid parameters - index_t CFactor = 2; - index_t access = 1; - index_t maxIter = 100; - index_t maxLevel = 30; - index_t minCLevel = 2; - index_t numFMG = 1; - index_t numFMGVcyc = 1; - index_t numMaxRef = 1; - index_t numRelax = 1; - index_t numStorage =-1; - index_t print = 2; - index_t tnorm = 2; // 1-norm, 2-norm, inf-norm - - T absTol = 1e-10; - T relTol = 1e-3; - - bool fmg = false; - bool incrMaxLevels 
= false; - bool periodic = false; - bool refine = false; - bool sequential = false; - bool skip = true; - gsCmdLine cmd("Tutorial on solving a Heat equation problem using parallel-in-time multigrid."); // Problem parameters @@ -297,64 +320,11 @@ class gsXBraid_app : public gsXBraid< gsVector > cmd.addInt( "n", "numSteps", "Number of parallel-in-time steps", numSteps ); cmd.addInt( "T", "typeMethod", "Time-stepping scheme", typeMethod); cmd.addReal( "t", "tfinal", "Final time", tfinal ); - - // Parallel-in-time multigrid parameters - cmd.addInt( "", "numStorage", "Number of storage of the parallel-in-time multigrid solver", numStorage ); - cmd.addInt( "A", "access", "Access level (neve [=0], =after finished [=1(default)], each iteration [=2]", access ); - cmd.addInt( "C", "CFactor", "Coarsening factor of the parallel-in-time multigrid solver", CFactor ); - cmd.addInt( "F", "numFMG", "Number of full multigrid steps of the parallel-in-time multigrid solver", numFMG ); - cmd.addInt( "L", "maxLevel", "Maximum numbers of parallel-in-time multigrid levels", maxLevel ); - cmd.addInt( "M", "maxIter", "Maximum iteration numbers of the parallel-in-time multigrid solver", maxIter ); - cmd.addInt( "N", "norm", "Temporal norm of the parallel-in-time multigrid solver (1-norm [=1], 2-norm [=2(default)], inf-norm [=3])", tnorm ); - cmd.addInt( "P", "print", "Print level (no output [=0], =runtime inforation [=1], run statistics [=2(default)], debug [=3])", print ); - cmd.addInt( "R", "numMaxRef", "Maximum number of refinements of the parallel-in-time multigrid solver", numMaxRef ); - cmd.addInt( "V", "numFMGVcyc", "Number of full multigrid V-cycles of the parallel-in-time multigrid solver", numFMGVcyc ); - cmd.addInt( "X", "numRelax", "Number of relaxation steps of the parallel-in-time multigrid solver", numRelax ); - cmd.addInt( "l", "minCLevel", "Minimum level of the parallel-in-time multigrid solver", minCLevel ); - - cmd.addReal( "", "absTol", "Absolute tolerance of the 
parallel-in-time multigrid solver", absTol ); - cmd.addReal( "", "relTol", "Relative tolerance of the parallel-in-time multigrid solver", relTol ); - - cmd.addSwitch( "fmg" , "Perform full multigrid (default is off)", fmg); - cmd.addSwitch( "incrMaxLevels" , "Increase the maximum number of parallel-in-time multigrid levels after performing a refinement (default is off)", incrMaxLevels); - cmd.addSwitch( "periodic" , "Periodic time grid (default is off)", periodic); - cmd.addSwitch( "refine" , "Perform refinement in time (default off)", refine); - cmd.addSwitch( "sequential", "Set the initial guess of the parallel-in-time multigrid solver as the sequential time stepping solution (default is off)", sequential); - cmd.addSwitch( "skip" , "Skip all work on the first down cycle of the parallel-in-time multigrid solver (default on)", skip); cmd.getValues(argc,argv); // Create instance gsXBraid_app app(comm, 0.0, tfinal, typeMethod, numSteps, numRefine, numElevate, numIncrease, fn); - - if (absTol != 1e-10) - app.SetAbsTol(absTol); - else if (relTol != 1e-3) - app.SetRelTol(relTol); - else - app.SetAbsTol(absTol); - - app.SetCFactor(CFactor); - app.SetMaxIter(maxIter); - app.SetMaxLevels(maxLevel); - app.SetMaxRefinements(numMaxRef); - app.SetMinCoarse(minCLevel); - app.SetNFMG(numFMG); - app.SetNFMGVcyc(numFMGVcyc); - app.SetNRelax(numRelax); - app.SetAccessLevel(access); - app.SetPrintLevel(print); - app.SetStorage(numStorage); - app.SetTemporalNorm(tnorm); - - if (fmg) app.SetFMG(); - if (incrMaxLevels) app.SetIncrMaxLevels(); - if (periodic) app.SetPeriodic(1); else app.SetPeriodic(0); - if (refine) app.SetRefine(1); else app.SetRefine(0); - if (sequential) app.SetSeqSoln(1); else app.SetSeqSoln(0); - if (skip) app.SetSkip(1); else app.SetSkip(0); - - //app.SetSpatialCoarsenAndRefine(); return app; } diff --git a/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp.xml b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp.xml deleted file mode 100644 index 
f6d97ead52..0000000000 --- a/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp.xml +++ /dev/null @@ -1,69 +0,0 @@ - - - - - - 100 100 - - 100 1 - 100 2 - 100 3 - 100 4 - - - - - 1 - - - - 0 - 1 - - - - 0 1 0 2 0 3 - - - - - 0 4 - - - - - 0 - - - 0 - - - - - - - - - - - - - - - - - - - - - - 0.00000 0.00000 1.00000 1.00000 - - - 0.00000 0.00000 1.00000 1.00000 - - - 0 0 1 0 0 1 1 1 - - - diff --git a/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml new file mode 100644 index 0000000000..f6868aa6da --- /dev/null +++ b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml @@ -0,0 +1,110 @@ + + + + + + 100 100 + + 100 1 + 100 2 + 100 3 + 100 4 + + + + + 1 + + + + 0 + 1 + + + + 0 1 0 2 0 3 + + + + + 0 4 + + + + + 0 + + + 0 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0.00000 0.00000 1.00000 1.00000 + + + 0.00000 0.00000 1.00000 1.00000 + + + 0 0 1 0 0 1 1 1 + + + diff --git a/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp2.xml b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp2.xml new file mode 100644 index 0000000000..4fd10a4898 --- /dev/null +++ b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp2.xml @@ -0,0 +1,108 @@ + + + + + + 100 100 + + 100 1 + 100 2 + 100 3 + 100 4 + + + + + 1 + + + + 0 + 1 + + + + 0 1 0 2 0 3 + + + + + 0 4 + + + + + 0 + + + 0 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0.00000 0.00000 1.00000 1.00000 + + + 0.00000 0.00000 1.00000 1.00000 + + + 0 0 1 0 0 1 1 1 + + + diff --git a/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h index 338e6f10a8..6ad8512eaf 100644 --- a/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +++ b/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h @@ -186,14 +186,19 @@ class IncompleteLUT : public 
SparseSolverBase m_P; // Fill-reducing permutation PermutationMatrix m_Pinv; // Inverse permutation }; From aa88c40b3f7435106be29a1f511cc3fab87635f4 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 13 May 2021 15:00:46 +0200 Subject: [PATCH 033/174] [WIP] XBraid with p-multigrid --- .../gsXBraid/examples/gsXBraidMultigrid.h | 2063 ++++++++--------- .../examples/xbraid_heatEquation_example.cpp | 175 +- .../filedata/pde/heat2d_square_ibvp1.xml | 28 +- 3 files changed, 1125 insertions(+), 1141 deletions(-) diff --git a/extensions/gsXBraid/examples/gsXBraidMultigrid.h b/extensions/gsXBraid/examples/gsXBraidMultigrid.h index f4b006a3ae..931f27a5b4 100644 --- a/extensions/gsXBraid/examples/gsXBraidMultigrid.h +++ b/extensions/gsXBraid/examples/gsXBraidMultigrid.h @@ -3,1127 +3,1068 @@ namespace gismo { -/** @brief The p-multigrid base class provides the basic - * methods (smoothing, prolongation, restriction) for - * implementing p-multigrid methods - */ - -template -struct gsXBraidMultigridBase -{ - -public: - - /// @brief Apply p-multigrid solver to given right-hand side on level l - virtual void solve(const gsMatrix & rhs, - std::vector > > m_basis, - gsMatrix& x, - const int& numLevels, - const int& numCoarsening, - const int& numRefine, - const int& numSmoothing, - int& numCoarseCycles, - const int& typeCycle_p, - int& typeCycle_h, - const int& typeSolver, - const int& typeBCHandling, - gsBoundaryConditions bcInfo, - gsMultiPatch<> mp, - gsGeometry<>::Ptr geo, - const int& typeLumping, - const int& typeProjection, - const int& typeSmoother, - std::vector >& m_prolongation_P, - std::vector >& m_restriction_P, - std::vector >& m_prolongation_M, - std::vector >& m_restriction_M, - std::vector >& m_prolongation_H, - std::vector >& m_restriction_H, - const gsMatrix<>& hp) + /** @brief The p-multigrid base class provides the basic + * methods (smoothing, prolongation, restriction) for + * implementing p-multigrid methods + */ + + template + struct 
gsXBraidMultigridBase { - if( numLevels == 1) - { - solvecoarse(rhs, x, numLevels); - return; - } - - - if(hp(std::max(numLevels-2,0),0) == 0 ) - { - gsMatrix fineRes, coarseRes, fineCorr, coarseCorr, postRes; - presmoothing(rhs, x, numLevels, numSmoothing, fineRes, numRefine, typeSmoother,hp); - restriction(fineRes, coarseRes, numLevels, numCoarsening, m_basis, typeLumping, - typeBCHandling, bcInfo, mp, geo, typeProjection, + protected: + int maxIter; + int numLevels; + int numSmoothing; + int typeBCHandling; + int typeCycle_h; + int typeCycle_p; + int typeLumping; + int typeProjection; + int typeSmoother; + int typeSolver; + T tol; + + public: + /// @brief Constructor + gsXBraidMultigridBase() + : maxIter(100000), + numLevels(1), + numSmoothing(1), + typeBCHandling(1), + typeCycle_h(2), + typeCycle_p(1), + typeLumping(1), + typeProjection(1), + typeSmoother(1), + typeSolver(1), + tol(1e-8) + {} + + void setMaxIter(int maxIter) + { this->maxIter = maxIter; } + + void setTolerance(T tol) + { this->tol = tol; } + + void setNumLevels(int numLevels) + { this->numLevels = numLevels; } + + void setNumSmoothing(int numSmoothing) + { this->numSmoothing = numSmoothing; } + + void setTypeBCHandling(int typeBCHandling) + { this->typeBCHandling = typeBCHandling; } + + void setTypeCycle_h(int typeCycle_h) + { this->typeCycle_h = typeCycle_h; } + + void setTypeCycle_p(int typeCycle_p) + { this->typeCycle_p = typeCycle_p; } + + void setTypeLumping(int typeLumping) + { this->typeLumping = typeLumping; } + + void setTypeProjection(int typeProjection) + { this->typeProjection = typeProjection; } + + void setTypeSmoother(int typeSmoother) + { this->typeSmoother = typeSmoother; } + + void setTypeSolver(int typeSolver) + { this->typeSolver = typeSolver; } + + virtual gsXBraidMultigridBase& compute(const gsSparseMatrix&) + { return *this; } + + virtual gsMatrix solveWithGuess(const gsMatrix& b, + const gsMatrix& x0) + { + gsMatrix x(x0); + solvecoarse(b, x, 1); + return x; + } + + /// 
@brief Apply p-multigrid solver to given right-hand side on level l + virtual void solve(const gsMatrix & rhs, + std::vector > > m_bases, + gsMatrix& x, + const int& numLevels, + gsBoundaryConditions bcInfo, + gsMultiPatch mp, + std::vector >& m_prolongation_P, + std::vector >& m_restriction_P, + std::vector >& m_prolongation_M, + std::vector >& m_restriction_M, + std::vector >& m_prolongation_H, + std::vector >& m_restriction_H, + const gsMatrix& hp) + { + if ( numLevels == 1) + { + solvecoarse(rhs, x, numLevels); + return; + } + + + if (hp(std::max(numLevels-2,0),0) == 0 ) + { + gsMatrix fineRes, coarseRes, fineCorr, coarseCorr, postRes; + presmoothing(rhs, x, numLevels, fineRes, hp); + restriction(fineRes, coarseRes, numLevels, m_bases, + bcInfo, mp, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + //coarseRes.setZero(coarseRes.rows(),1); + coarseCorr.setZero(coarseRes.rows(),1); + for( int j = 0 ; j < (typeCycle_p == 2 ? 2 : 1) ; j++) + { + solve(coarseRes, m_bases, coarseCorr, numLevels-1, + bcInfo, mp, m_prolongation_P, m_restriction_P, m_prolongation_M, m_restriction_M, - m_prolongation_H, m_restriction_H, hp); - //coarseRes.setZero(coarseRes.rows(),1); - coarseCorr.setZero(coarseRes.rows(),1); - for( int j = 0 ; j < (typeCycle_p == 2 ? 
2 : 1) ; j++) - { - solve(coarseRes, m_basis, coarseCorr, numLevels-1, numCoarsening, numRefine, - numSmoothing, numCoarseCycles, typeCycle_p, typeCycle_h, typeSolver, - typeBCHandling, bcInfo, mp, geo, typeLumping, typeProjection, typeSmoother, - m_prolongation_P, m_restriction_P, - m_prolongation_M, m_restriction_M, - m_prolongation_H, m_restriction_H, hp); - } - prolongation(coarseCorr, fineCorr, numLevels, numCoarsening, m_basis, typeLumping, - typeBCHandling, bcInfo, mp, geo, typeProjection, - m_prolongation_P, m_restriction_P, - m_prolongation_M, m_restriction_M, - m_prolongation_H, m_restriction_H, hp); - postsmoothing(rhs, x, numLevels, numSmoothing, fineCorr, postRes, typeSolver, - numRefine, typeSmoother, hp); - } + m_prolongation_H, m_restriction_H, hp); + } + prolongation(coarseCorr, fineCorr, numLevels, m_bases, + bcInfo, mp, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + postsmoothing(rhs, x, numLevels, fineCorr, postRes, + hp); + } - if(hp(std::max(numLevels-2,0),0) == 1 ) - { - gsMatrix fineRes, coarseRes, fineCorr, coarseCorr, postRes; - presmoothing(rhs, x, numLevels, numSmoothing, fineRes, numRefine, typeSmoother, hp); - restriction(fineRes, coarseRes, numLevels, numCoarsening, m_basis, typeLumping, - typeBCHandling, bcInfo, mp, geo, typeProjection, + if (hp(std::max(numLevels-2,0),0) == 1 ) + { + gsMatrix fineRes, coarseRes, fineCorr, coarseCorr, postRes; + presmoothing(rhs, x, numLevels, fineRes, hp); + restriction(fineRes, coarseRes, numLevels, m_bases, + bcInfo, mp, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + //coarseRes.setZero(coarseRes.rows(),1); + coarseCorr.setZero(coarseRes.rows(),1); + for( int i = 0 ; i < (typeCycle_h == 2 ? 
2 : 1) ; i++) + { + solve(coarseRes, m_bases, coarseCorr, numLevels-1, + bcInfo, mp, m_prolongation_P, m_restriction_P, m_prolongation_M, m_restriction_M, - m_prolongation_H, m_restriction_H, hp); - //coarseRes.setZero(coarseRes.rows(),1); - coarseCorr.setZero(coarseRes.rows(),1); - for( int i = 0 ; i < (typeCycle_h == 2 ? 2 : 1) ; i++) - { - solve(coarseRes, m_basis, coarseCorr, numLevels-1, numCoarsening, numRefine, - numSmoothing, numCoarseCycles, typeCycle_p, typeCycle_h, typeSolver, - typeBCHandling, bcInfo, mp, geo, typeLumping, typeProjection, typeSmoother, - m_prolongation_P, m_restriction_P, - m_prolongation_M, m_restriction_M, - m_prolongation_H, m_restriction_H, hp); - } - prolongation(coarseCorr, fineCorr, numLevels, numCoarsening, m_basis, typeLumping, - typeBCHandling, bcInfo, mp, geo, typeProjection, - m_prolongation_P, m_restriction_P, - m_prolongation_M, m_restriction_M, - m_prolongation_H, m_restriction_H, hp); - postsmoothing(rhs,x, numLevels, numSmoothing, fineCorr, postRes, typeSolver, - numRefine, typeSmoother, hp); - } - } - - /// @brief Setup p-multigrid to given linear system - virtual void setup(const gsMatrix & rhs, - std::vector > > m_basis, - gsMatrix& x, - const int& numLevels, - const int& numCoarsening, - const int& numRefine, - const int& numSmoothing, - int& numCoarseCycles, - const int& typeCycle_p, - const int& typeCycle_h, - const int& typeSolver, - const int& typeBCHandling, - gsBoundaryConditions bcInfo, - gsMultiPatch<> mp, - gsGeometry<>::Ptr geo, - const int& typeLumping, - const int& typeProjection, - const int& typeSmoother, - std::vector >& m_prolongation_P, - std::vector >& m_restriction_P, - std::vector >& m_prolongation_M, - std::vector >& m_restriction_M, - std::vector >& m_prolongation_H, - std::vector >& m_restriction_H, - const gsMatrix<>& hp) {} + m_prolongation_H, m_restriction_H, hp); + } + prolongation(coarseCorr, fineCorr, numLevels, m_bases, + bcInfo, mp, + m_prolongation_P, m_restriction_P, + 
m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + postsmoothing(rhs,x, numLevels, fineCorr, postRes, + hp); + } + } + + /// @brief Setup p-multigrid to given linear system + virtual void setup(const gsMatrix & rhs, + std::vector > > m_bases, + gsMatrix& x, + const int& numLevels, + gsBoundaryConditions bcInfo, + gsMultiPatch mp, + std::vector >& m_prolongation_P, + std::vector >& m_restriction_P, + std::vector >& m_prolongation_M, + std::vector >& m_restriction_M, + std::vector >& m_prolongation_H, + std::vector >& m_restriction_H, + const gsMatrix& hp) {} - /// @brief Apply fixed number of smoothing steps (pure virtual method) - virtual void presmoothing(const gsMatrix& rhs, - gsMatrix& x, - const int& numLevels, - const int& numSmoothing, - gsMatrix & fineRes , - const int& numRefine, - const int& typeSmoother, - const gsMatrix<>& hp) = 0; - - /// @brief Apply fixed number of smoothing steps (pure virtual method) - virtual void postsmoothing(const gsMatrix& rhs, + /// @brief Apply fixed number of smoothing steps (pure virtual method) + virtual void presmoothing(const gsMatrix& rhs, + gsMatrix& x, + const int& numLevels, + gsMatrix & fineRes , + const gsMatrix& hp) = 0; + + /// @brief Apply fixed number of smoothing steps (pure virtual method) + virtual void postsmoothing(const gsMatrix& rhs, + gsMatrix& x, + const int& numLevels, + gsMatrix & fineCorr, + gsMatrix & postRes, + const gsMatrix& hp) = 0; + + /// @brief Apply coarse solver (pure virtual method) + virtual void solvecoarse(const gsMatrix& rhs, gsMatrix& x, - const int& numLevels, - const int& numSmoothing, - gsMatrix & fineCorr, - gsMatrix & postRes, - const int& typeSolver, - const int& numRefine, - const int& typeSmoother, - const gsMatrix<>& hp) = 0; - - /// @brief Apply coarse solver (pure virtual method) - virtual void solvecoarse(const gsMatrix& rhs, - gsMatrix& x, - const int& numLevels) = 0; + const int& numLevels) = 0; - /// @brief Prolongate coarse space function 
to fine space - virtual gsSparseMatrix prolongation_P(const int& numLevels, - std::vector > > m_basis, - const int& typeLumping, - const int& typeBCHandling, - gsGeometry<>::Ptr geo, - const int& typeProjection) = 0; + /// @brief Prolongate coarse space function to fine space + virtual gsSparseMatrix prolongation_P(const int& numLevels, + std::vector > > m_bases) = 0; - /// @brief Prolongate coarse space function to fine space - virtual gsSparseMatrix restriction_P(const int& numLevels, - std::vector > > m_basis, - const int& typeLumping, - const int& typeBCHandling, - gsGeometry<>::Ptr geo, - const int& typeProjection) = 0; + /// @brief Prolongate coarse space function to fine space + virtual gsSparseMatrix restriction_P(const int& numLevels, + std::vector > > m_bases) = 0; - /// @brief Prolongate coarse space function to fine space - virtual gsMatrix prolongation_M(const int& numLevels, - std::vector > > m_basis, - const int& typeLumping, - const int& typeBCHandling, - gsGeometry<>::Ptr geo, - const int& typeProjection) = 0; + /// @brief Prolongate coarse space function to fine space + virtual gsMatrix prolongation_M(const int& numLevels, + std::vector > > m_bases) = 0; - /// @brief Prolongate coarse space function to fine space - virtual gsMatrix restriction_M(const int& numLevels, - std::vector > > m_basis, - const int& typeLumping, - const int& typeBCHandling, - gsGeometry<>::Ptr geo, - const int& typeProjection) = 0; + /// @brief Prolongate coarse space function to fine space + virtual gsMatrix restriction_M(const int& numLevels, + std::vector > > m_bases) = 0; - /// @brief Prolongate coarse space function to fine space - virtual void prolongation(const gsMatrix& Xcoarse, - gsMatrix& Xfine, - const int& numLevels, - const int& numCoarsening, - std::vector > > m_basis, - const int& typeLumping, - const int& typeBCHandling, - gsBoundaryConditions bcInfo, - gsMultiPatch<> mp, - gsGeometry<>::Ptr geo, - const int& typeProjection, - std::vector >& 
m_prolongation_P, - std::vector >& m_restriction_P, - std::vector >& m_prolongation_M, - std::vector >& m_restriction_M, - std::vector >& m_prolongation_H, - std::vector >& m_restriction_H, - const gsMatrix<>& hp) - { - if(hp(numLevels-2,0) == 1) - { - Xfine = m_prolongation_H[numLevels-2]*Xcoarse; - } - else - { - if(typeLumping == 1) - { - gsVector<> temp = m_prolongation_P[numLevels-2]*Xcoarse; - gsMatrix<> M_L_inv = (m_prolongation_M[numLevels-2]).array().inverse(); - Xfine = (M_L_inv).cwiseProduct(temp); - } - else - { - // Define the low and high order basis - gsMultiBasis<> basisL = *m_basis[numLevels-2]; - gsMultiBasis<> basisH = *m_basis[numLevels-1]; - typedef gsExprAssembler::geometryMap geometryMap; - typedef gsExprAssembler::variable variable; - typedef gsExprAssembler::space space; + /// @brief Prolongate coarse space function to fine space + virtual void prolongation(const gsMatrix& Xcoarse, + gsMatrix& Xfine, + const int& numLevels, + std::vector > > m_bases, + gsBoundaryConditions bcInfo, + gsMultiPatch mp, + std::vector >& m_prolongation_P, + std::vector >& m_restriction_P, + std::vector >& m_prolongation_M, + std::vector >& m_restriction_M, + std::vector >& m_prolongation_H, + std::vector >& m_restriction_H, + const gsMatrix& hp) + { + if (hp(numLevels-2,0) == 1) + { + Xfine = m_prolongation_H[numLevels-2]*Xcoarse; + } + else + { + if (typeLumping == 1) + { + gsMatrix temp = m_prolongation_P[numLevels-2]*Xcoarse; + gsMatrix M_L_inv = (m_prolongation_M[numLevels-2]).array().inverse(); + Xfine = (M_L_inv).cwiseProduct(temp); + } + else + { + // Define the low and high order bases + gsMultiBasis basesL = *m_bases[numLevels-2]; + gsMultiBasis basesH = *m_bases[numLevels-1]; + typedef gsExprAssembler::geometryMap geometryMap; + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; - // Determine matrix M (high_order * high_order) - gsExprAssembler ex2(1,1); - geometryMap G2 = ex2.getMap(mp); - space w_n = 
ex2.getSpace(basisH ,1, 0); - w_n.setInterfaceCont(0); - if(typeBCHandling == 1) - { - w_n.addBc(bcInfo.get("Dirichlet")); - } - ex2.setIntegrationElements(basisH); - ex2.initSystem(); - ex2.assemble(w_n * meas(G2) * w_n.tr()); + // Determine matrix M (high_order * high_order) + gsExprAssembler ex2(1,1); + geometryMap G2 = ex2.getMap(mp); + space w_n = ex2.getSpace(basesH ,1, 0); + w_n.setInterfaceCont(0); + if (typeBCHandling == 1) + { + w_n.addBc(bcInfo.get("Dirichlet")); + } + ex2.setIntegrationElements(basesH); + ex2.initSystem(); + ex2.assemble(w_n * meas(G2) * w_n.tr()); - // Prolongate Xcoarse to Xfine - gsVector<> temp = m_prolongation_P[numLevels-2]*Xcoarse; - gsSparseMatrix<> M = ex2.matrix(); - gsConjugateGradient<> CGSolver(M); - CGSolver.setTolerance(1e-12); - CGSolver.solve(temp,Xfine); - } - } - } - - /// @brief Restrict fine space function to coarse space - virtual void restriction(const gsMatrix& Xfine, - gsMatrix& Xcoarse, - const int& numLevels, - const int& numCoarsening, - std::vector > > m_basis, - const int& typeLumping, - const int& typeBCHandling, - gsBoundaryConditions bcInfo, - gsMultiPatch<> mp, - gsGeometry<>::Ptr geo, - const int& typeProjection, - std::vector >& m_prolongation_P, - std::vector >& m_restriction_P, - std::vector >& m_prolongation_M, - std::vector >& m_restriction_M, - std::vector >& m_prolongation_H, - std::vector >& m_restriction_H, - const gsMatrix<>& hp) - { - if(hp(numLevels-2,0) == 1) - { - Xcoarse = m_restriction_H[numLevels-2]*Xfine; - } - else - { - if(typeLumping == 1) - { - // Standard way - gsVector<> temp = m_restriction_P[numLevels-2]*Xfine; - gsMatrix<> M_L_inv = (m_restriction_M[numLevels-2]).array().inverse(); - Xcoarse = (M_L_inv).cwiseProduct(temp); - } - else - { - // Define the low and high order basis - gsMultiBasis<> basisL = *m_basis[numLevels-2]; - gsMultiBasis<> basisH = *m_basis[numLevels-1]; - typedef gsExprAssembler::geometryMap geometryMap; - typedef gsExprAssembler::variable variable; - 
typedef gsExprAssembler::space space; + // Prolongate Xcoarse to Xfine + gsMatrix temp = m_prolongation_P[numLevels-2]*Xcoarse; + gsSparseMatrix M = ex2.matrix(); + gsConjugateGradient CGSolver(M); + CGSolver.setTolerance(1e-12); + CGSolver.solve(temp,Xfine); + } + } + } + + /// @brief Restrict fine space function to coarse space + virtual void restriction(const gsMatrix& Xfine, + gsMatrix& Xcoarse, + const int& numLevels, + std::vector > > m_bases, + gsBoundaryConditions bcInfo, + gsMultiPatch mp, + std::vector >& m_prolongation_P, + std::vector >& m_restriction_P, + std::vector >& m_prolongation_M, + std::vector >& m_restriction_M, + std::vector >& m_prolongation_H, + std::vector >& m_restriction_H, + const gsMatrix& hp) + { + if (hp(numLevels-2,0) == 1) + { + Xcoarse = m_restriction_H[numLevels-2]*Xfine; + } + else + { + if (typeLumping == 1) + { + // Standard way + gsMatrix temp = m_restriction_P[numLevels-2]*Xfine; + gsMatrix M_L_inv = (m_restriction_M[numLevels-2]).array().inverse(); + Xcoarse = (M_L_inv).cwiseProduct(temp); + } + else + { + // Define the low and high order bases + gsMultiBasis basesL = *m_bases[numLevels-2]; + gsMultiBasis basesH = *m_bases[numLevels-1]; + typedef gsExprAssembler::geometryMap geometryMap; + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; - // Determine matrix M (low_order * low_order) - gsExprAssembler ex2(1,1); - geometryMap G2 = ex2.getMap(mp); - space w_n = ex2.getSpace(basisL, 1, 0); - w_n.setInterfaceCont(0); - if(typeBCHandling == 1) - { - w_n.addBc(bcInfo.get("Dirichlet")); - } - ex2.setIntegrationElements(basisL); - ex2.initSystem(); - ex2.assemble(w_n * meas(G2) * w_n.tr()); + // Determine matrix M (low_order * low_order) + gsExprAssembler ex2(1,1); + geometryMap G2 = ex2.getMap(mp); + space w_n = ex2.getSpace(basesL, 1, 0); + w_n.setInterfaceCont(0); + if (typeBCHandling == 1) + { + w_n.addBc(bcInfo.get("Dirichlet")); + } + ex2.setIntegrationElements(basesL); + ex2.initSystem(); 
+ ex2.assemble(w_n * meas(G2) * w_n.tr()); - // Restrict Xfine to Xcoarse - gsMatrix<> temp = m_restriction_P[numLevels-2]*Xfine; - gsSparseMatrix<> M = ex2.matrix(); - gsConjugateGradient<> CGSolver(M); - CGSolver.setTolerance(1e-12); - CGSolver.solve(temp, Xcoarse); - } - } - } -}; - -/** @brief The p-multigrid class implements a generic p-multigrid solver - * that can be customized by passing assembler and coarse - * solver as template arguments. - * - * @note: This implementation assumes that all required prolongation/ - * restriction operators are generated internally. Therefore, a - * problem-specific assembler has to be passed as template argument. - */ -template -struct gsXBraidMultigrid : public gsXBraidMultigridBase -{ -private: - - /// Base class type - typedef gsXBraidMultigridBase Base; - - /// Shared pointer to multi-patch geometry - memory::shared_ptr > m_mp_ptr; - - /// Shared pointer to boundary conditions - memory::shared_ptr > m_bcInfo_ptr; + // Restrict Xfine to Xcoarse + gsMatrix temp = m_restriction_P[numLevels-2]*Xfine; + gsSparseMatrix M = ex2.matrix(); + gsConjugateGradient CGSolver(M); + CGSolver.setTolerance(1e-12); + CGSolver.solve(temp, Xcoarse); + } + } + } + }; + + /** @brief The p-multigrid class implements a generic p-multigrid solver + * that can be customized by passing assembler and coarse + * solver as template arguments. + * + * @note: This implementation assumes that all required prolongation/ + * restriction operators are generated internally. Therefore, a + * problem-specific assembler has to be passed as template argument. 
+ */ + template + struct gsXBraidMultigrid : public gsXBraidMultigridBase + { + private: + + /// Base class type + typedef gsXBraidMultigridBase Base; + + /// Shared pointer to multi-patch geometry + memory::shared_ptr > m_mp_ptr; + + /// Shared pointer to boundary conditions + memory::shared_ptr > m_bcInfo_ptr; - /// std::vector of multi-basis objects - std::vector > > m_basis; + /// std::vector of multi-basis objects + std::vector > > m_bases; - /// std::vector of prolongation operators - std::vector< gsSparseMatrix > m_prolongation_P; + /// std::vector of prolongation operators + std::vector< gsSparseMatrix > m_prolongation_P; - /// std::vector of restriction operators - std::vector< gsSparseMatrix > m_restriction_P; + /// std::vector of restriction operators + std::vector< gsSparseMatrix > m_restriction_P; - /// std::vector of prolongation operators - std::vector< gsMatrix > m_prolongation_M; + /// std::vector of prolongation operators + std::vector< gsMatrix > m_prolongation_M; - /// std::vector of restriction operators - std::vector< gsMatrix > m_restriction_M; + /// std::vector of restriction operators + std::vector< gsMatrix > m_restriction_M; - /// std::vector of prolongation operators - std::vector< gsSparseMatrix > m_prolongation_H; + /// std::vector of prolongation operators + std::vector< gsSparseMatrix > m_prolongation_H; - /// std::vector of restriction operators - std::vector< gsSparseMatrix > m_restriction_H; + /// std::vector of restriction operators + std::vector< gsSparseMatrix > m_restriction_H; - /// std::vector of factorized operators - std::vector< std::vector< gsSparseMatrix > > m_ILUT; + /// std::vector of factorized operators + std::vector< std::vector< gsSparseMatrix > > m_ILUT; - /// std::vector of factorized operators - std::vector< std::vector < Eigen::PermutationMatrix > > m_P; + /// std::vector of factorized operators + std::vector< std::vector < Eigen::PermutationMatrix > > m_P; - /// std::vector of factorized operators - 
std::vector < std::vector < Eigen::PermutationMatrix > > m_Pinv; + /// std::vector of factorized operators + std::vector < std::vector < Eigen::PermutationMatrix > > m_Pinv; - /// std::vector of SCM smoother object - std::vector< gsPreconditionerOp<>::Ptr > m_SCMS; + /// std::vector of SCM smoother object + std::vector< typename gsPreconditionerOp::Ptr > m_SCMS; - /// std::vector of operator objects - std::vector< gsSparseMatrix > m_operator; + /// std::vector of operator objects + std::vector< gsSparseMatrix > m_operator; - /// std::vector of std::vector of block operator objects - std::vector < std::vector< gsSparseMatrix > > m_block_operator; + /// std::vector of std::vector of block operator objects + std::vector < std::vector< gsSparseMatrix > > m_block_operator; - /// std::vector of std::vector of block operator objects - std::vector < std::vector < gsSparseMatrix > > m_ddB; + /// std::vector of std::vector of block operator objects + std::vector < std::vector < gsSparseMatrix > > m_ddB; - /// std::vector of std::vector of block operator objects - std::vector < std::vector < gsSparseMatrix > > m_ddC; + /// std::vector of std::vector of block operator objects + std::vector < std::vector < gsSparseMatrix > > m_ddC; - /// std::vector of std::vector of block operator objects - std::vector < std::vector < gsMatrix > > m_ddBtilde; + /// std::vector of std::vector of block operator objects + std::vector < std::vector < gsMatrix > > m_ddBtilde; - /// std::vector of std::vector of block operator objects - std::vector < std::vector < gsMatrix > > m_ddCtilde; + /// std::vector of std::vector of block operator objects + std::vector < std::vector < gsMatrix > > m_ddCtilde; - /// std::vector of std::vector of block operator objects - std::vector < gsMatrix > m_A_aprox; + /// std::vector of std::vector of block operator objects + std::vector < gsMatrix > m_A_aprox; - /// std::vector of std::vector of block operator objects - std::vector < gsSparseMatrix > m_S; + /// 
std::vector of std::vector of block operator objects + std::vector < gsSparseMatrix > m_S; - /// std::vector of std::vector of shift objects - std::vector < std::vector< int > > m_shift; + /// std::vector of std::vector of shift objects + std::vector < std::vector< int > > m_shift; - /// std::vector of assembler objects - std::vector m_assembler; + /// std::vector of assembler objects + std::vector m_assembler; -public: + public: - // Constructor - gsXBraidMultigrid(const gsMultiPatch & mp, - const gsMultiBasis & basis, - const gsBoundaryConditions & bcInfo) - { - m_mp_ptr = memory::make_shared_not_owned(&mp); - m_bcInfo_ptr = memory::make_shared_not_owned(&bcInfo); - m_basis.push_back(memory::make_shared_not_owned(&basis)); - } - -public: - - /// @brief Set-up p-multigrid solver - void setup(const gsFunctionExpr & rhs, - const gsFunctionExpr & sol_exact, - gsMatrix& x, - const int& numSmoothing, - gsMatrix f, - const int& typeSolver, - int& iterTot, - int& typeCycle_p, - int& typeCycle_h, - int numLevels, - const int& numCoarsening, - const int& numDegree, - const int& numRefine, - const int& numBenchmark, - const int& typeMultigrid, - const int& typeBCHandling, - gsGeometry<>::Ptr geo, - const int& typeLumping, - const gsMatrix<>& hp, - const int& typeProjection, - const int& typeSmoother, - const int& typeCoarseOperator, - const gsFunctionExpr<> coeff_diff, - const gsFunctionExpr<> coeff_conv, - const gsFunctionExpr<> coeff_reac) - { - for (int i = 1; i < numLevels; i++) - { - m_basis.push_back(give(m_basis.back()->clone())); - switch((int) hp(i-1,0) ) - { - case 0 : (typeProjection == 1 ? 
- m_basis.back()->degreeIncrease(numDegree-1) : - m_basis.back()->degreeIncrease()); break; - - case 1 : m_basis.back()->uniformRefine(); break; - - case 2: m_basis.back()->uniformRefine(); - m_basis.back()->degreeIncrease(); break; - } - } + // Constructor + gsXBraidMultigrid(const gsMultiPatch & mp, + const gsMultiBasis & bases, + const gsBoundaryConditions & bcInfo) + { + m_mp_ptr = memory::make_shared_not_owned(&mp); + m_bcInfo_ptr = memory::make_shared_not_owned(&bcInfo); + m_bases.push_back(memory::make_shared_not_owned(&bases)); + } + + virtual ~gsXBraidMultigrid() {} + + public: + + /// @brief Set-up p-multigrid solver + void setup(const gsFunctionExpr & rhs, + gsMatrix& x, + gsMatrix f, + const int& iterTot, + const int& numLevels, + const int& numDegree, + const int& typeMultigrid, + const gsMatrix& hp, + const int& typeCoarseOperator, + const gsFunctionExpr coeff_diff, + const gsFunctionExpr coeff_conv, + const gsFunctionExpr coeff_reac) + { + for (int i = 1; i < numLevels; i++) + { + m_bases.push_back(give(m_bases.back()->clone())); + switch((int) hp(i-1,0) ) + { + case 0 : (Base::typeProjection == 1 ? + m_bases.back()->degreeIncrease(numDegree-1) : + m_bases.back()->degreeIncrease()); break; + + case 1 : m_bases.back()->uniformRefine(); break; + + case 2: m_bases.back()->uniformRefine(); + m_bases.back()->degreeIncrease(); break; + } + } - // Generate sequence of assembler objects and assemble - for (typename std::vector > >::iterator it = m_basis.begin(); - it != m_basis.end(); ++it) - { - m_assembler.push_back(Assembler(*m_mp_ptr, - *(*it).get(), - *m_bcInfo_ptr, - rhs, - coeff_diff, - coeff_conv, - coeff_reac, - (typeBCHandling == 1 ? 
- dirichlet::elimination : - dirichlet::nitsche), - iFace::glue)); - } + // Generate sequence of assembler objects and assemble + for (typename std::vector > >::iterator it = m_bases.begin(); + it != m_bases.end(); ++it) + { + m_assembler.push_back(Assembler(*m_mp_ptr, + *(*it).get(), + *m_bcInfo_ptr, + rhs, + coeff_diff, + coeff_conv, + coeff_reac, + (Base::typeBCHandling == 1 ? + dirichlet::elimination : + dirichlet::nitsche), + iFace::glue)); + } - // Resize vector of operators - m_operator.resize(numLevels); - m_prolongation_P.resize(numLevels-1); - m_prolongation_M.resize(numLevels-1); - m_prolongation_H.resize(numLevels-1); - m_restriction_P.resize(numLevels-1); - m_restriction_M.resize(numLevels-1); - m_restriction_H.resize(numLevels-1); - - // Assemble operators at finest level - gsStopwatch clock; - gsInfo << "|| Multigrid hierarchy ||" <degree() << ", Ndof: " << m_basis[i]->totalSize() <degree() <totalSize() <degree() << ", Ndof: " << m_bases[i]->totalSize() <degree() <totalSize() < transferMatrix; - gsOptionList options; - typeBCHandling == 1 ? 
options.addInt("DirichletStrategy","",dirichlet::elimination) : options.addInt("DirichletStrategy","",dirichlet::nitsche); - for(int i = 1; i < numLevels; i++) - { - if(hp(i-1,0) == 1) - { - gsMultiBasis m_basis_copy = *m_basis[i]; - m_basis_copy.uniformCoarsen_withTransfer(transferMatrix,*m_bcInfo_ptr,options); - m_prolongation_H[i-1] = transferMatrix; - m_restriction_H[i-1] = m_prolongation_H[i-1].transpose(); - } - } - real_t Time_Transfer = clock.stop(); + // Determine prolongation/restriction operators in p + clock.restart(); + for (int i = 1; i < numLevels; i++) + { + if (hp(i-1,0) == 0) + { + m_prolongation_P[i-1] = prolongation_P(i+1, m_bases); + m_restriction_P[i-1] = m_prolongation_P[i-1].transpose(); //restriction_P(i+1, m_bases); + m_prolongation_M[i-1] = prolongation_M(i+1, m_bases); + m_restriction_M[i-1] = restriction_M(i+1, m_bases); + } + } + + // Determine prolongation/restriction operators in h + gsSparseMatrix transferMatrix; + gsOptionList options; + Base::typeBCHandling == 1 ? 
options.addInt("DirichletStrategy","",dirichlet::elimination) : options.addInt("DirichletStrategy","",dirichlet::nitsche); + for(int i = 1; i < numLevels; i++) + { + if (hp(i-1,0) == 1) + { + gsMultiBasis m_bases_copy = *m_bases[i]; + m_bases_copy.uniformCoarsen_withTransfer(transferMatrix,*m_bcInfo_ptr,options); + m_prolongation_H[i-1] = transferMatrix; + m_restriction_H[i-1] = m_prolongation_H[i-1].transpose(); + } + } + real_t Time_Transfer = clock.stop(); - // Obtain operators with Galerkin projection - clock.restart(); - if(typeCoarseOperator == 2) - { - for (int i = numLevels-1; i > -1; i--) - { - if(hp(hp.rows()-1,0) == 0) - { - if(hp(std::min(i,hp.rows()-1),0) == 1) - { - m_operator[i] = m_restriction_H[i]*m_operator[i+1]*m_prolongation_H[i]; - } - } - else - { - if(hp(std::min(i,hp.rows()-1),0) == 1 && i > 0) - { - m_operator[i-1] = m_restriction_H[i-1]*m_operator[i]*m_prolongation_H[i-1]; - } - } - } - } - real_t Time_Assembly_Galerkin = clock.stop(); - - - // Setting up the subspace corrected mass smoother - clock.restart(); - if(typeSmoother == 3) - { - // Generate sequence of SCM smoothers - m_SCMS.resize(numLevels); - gsOptionList opt; - opt.addReal("Scaling","",0.12); - for(int i = 0 ; i < numLevels ; i++) - { - m_SCMS[i] = setupSubspaceCorrectedMassSmoother(m_operator[i], *m_basis[i], *m_bcInfo_ptr, opt, typeBCHandling); - } - } - real_t Time_SCMS = clock.stop(); - - // Determine ILUT factorizations at each level - clock.restart(); - int numPatch = m_mp_ptr->nPatches(); + // Obtain operators with Galerkin projection + clock.restart(); + if (typeCoarseOperator == 2) + { + for (int i = numLevels-1; i > -1; i--) + { + if (hp(hp.rows()-1,0) == 0) + { + if (hp(std::min(i,hp.rows()-1),0) == 1) + { + m_operator[i] = m_restriction_H[i]*m_operator[i+1]*m_prolongation_H[i]; + } + } + else + { + if (hp(std::min(i,hp.rows()-1),0) == 1 && i > 0) + { + m_operator[i-1] = m_restriction_H[i-1]*m_operator[i]*m_prolongation_H[i-1]; + } + } + } + } + real_t 
Time_Assembly_Galerkin = clock.stop(); + + + // Setting up the subspace corrected mass smoother + clock.restart(); + if (Base::typeSmoother == 3) + { + // Generate sequence of SCM smoothers + m_SCMS.resize(numLevels); + gsOptionList opt; + opt.addReal("Scaling","",0.12); + for(int i = 0 ; i < numLevels ; i++) + { + m_SCMS[i] = setupSubspaceCorrectedMassSmoother(m_operator[i], *m_bases[i], *m_bcInfo_ptr, opt, Base::typeBCHandling); + } + } + real_t Time_SCMS = clock.stop(); + + // Determine ILUT factorizations at each level + clock.restart(); + int numPatch = m_mp_ptr->nPatches(); - if(typeSmoother == 1) - { - // Generate factorizations (ILUT) - m_ILUT.resize(numLevels); - m_P.resize(numLevels); - m_Pinv.resize(numLevels); - for(int i = 0; i < numLevels; i++) - { - m_ILUT[i].resize(1); - m_P[i].resize(1); - m_Pinv[i].resize(1); - if(typeProjection == 2) - { - Eigen::IncompleteLUT ilu; - ilu.setFillfactor(1); - ilu.compute(m_operator[i]); - m_ILUT[i][0] = ilu.m_lu; - m_P[i][0] = ilu.m_P; - m_Pinv[i][0] = ilu.m_Pinv; - } - else - { - if(i == numLevels-1) // Only at finest level - { - Eigen::IncompleteLUT ilu; - ilu.setFillfactor(1); - ilu.compute(m_operator[i]); - m_ILUT[i][0] = ilu.m_lu; - m_P[i][0] = ilu.m_P; - m_Pinv[i][0] = ilu.m_Pinv; - } - } - } - } - real_t Time_ILUT_Factorization = clock.stop(); - clock.restart(); - if(typeSmoother == 5) - { - int shift0 = 0; - m_ddB.resize(numLevels); - m_ddC.resize(numLevels); - m_ddBtilde.resize(numLevels); - m_ddCtilde.resize(numLevels); - - m_ILUT.resize(numLevels); - m_P.resize(numLevels); - m_Pinv.resize(numLevels); - m_shift.resize(numLevels); - m_S.resize(numLevels); + if (Base::typeSmoother == 1) + { + // Generate factorizations (ILUT) + m_ILUT.resize(numLevels); + m_P.resize(numLevels); + m_Pinv.resize(numLevels); + for(int i = 0; i < numLevels; i++) + { + m_ILUT[i].resize(1); + m_P[i].resize(1); + m_Pinv[i].resize(1); + if (Base::typeProjection == 2) + { + Eigen::IncompleteLUT ilu; + ilu.setFillfactor(1); + 
ilu.compute(m_operator[i]); + m_ILUT[i][0] = ilu.m_lu; + m_P[i][0] = ilu.m_P; + m_Pinv[i][0] = ilu.m_Pinv; + } + else + { + if (i == numLevels-1) // Only at finest level + { + Eigen::IncompleteLUT ilu; + ilu.setFillfactor(1); + ilu.compute(m_operator[i]); + m_ILUT[i][0] = ilu.m_lu; + m_P[i][0] = ilu.m_P; + m_Pinv[i][0] = ilu.m_Pinv; + } + } + } + } + real_t Time_ILUT_Factorization = clock.stop(); + clock.restart(); + if (Base::typeSmoother == 5) + { + int shift0 = 0; + m_ddB.resize(numLevels); + m_ddC.resize(numLevels); + m_ddBtilde.resize(numLevels); + m_ddCtilde.resize(numLevels); + + m_ILUT.resize(numLevels); + m_P.resize(numLevels); + m_Pinv.resize(numLevels); + m_shift.resize(numLevels); + m_S.resize(numLevels); - for(int i = 0 ; i < numLevels ; i++) - { - m_shift[i].resize(numPatch+1); - m_ILUT[i].resize(numPatch+1); - m_P[i].resize(numPatch+1); - m_Pinv[i].resize(numPatch+1); + for(int i = 0 ; i < numLevels ; i++) + { + m_shift[i].resize(numPatch+1); + m_ILUT[i].resize(numPatch+1); + m_P[i].resize(numPatch+1); + m_Pinv[i].resize(numPatch+1); - // Use of partition functions - std::vector > interior, boundary; - std::vector > > interface; - std::vector > global_interior, global_boundary; - std::vector > > global_interface; - //m_basis[i]->partition(interior,boundary,interface,global_interior,global_boundary,global_interface); - for(int l=0; l< numPatch; l++) - { - m_shift[i][l] = global_interior[l].rows(); - } - m_shift[i][numPatch] = 0; - m_shift[i][numPatch] = m_operator[i].rows() - accumulate(m_shift[i].begin(),m_shift[i].end(),0); - - // Put shift on zero - shift0 = 0; - for(int j = 0 ; j < numPatch ; j++) - { - const gsSparseMatrix<> block = m_operator[i].block(shift0,shift0,m_shift[i][j],m_shift[i][j]); - Eigen::IncompleteLUT ilu; - ilu.setFillfactor(1); - ilu.compute(block); - m_ILUT[i][j] = ilu.m_lu; - - m_P[i][j] = ilu.m_P; - m_Pinv[i][j] = ilu.m_Pinv; - shift0 = shift0 + m_shift[i][j]; + // Use of partition functions + std::vector > interior, 
boundary; + std::vector > > interface; + std::vector > global_interior, global_boundary; + std::vector > > global_interface; + //m_bases[i]->partition(interior,boundary,interface,global_interior,global_boundary,global_interface); + for(int l=0; l< numPatch; l++) + { + m_shift[i][l] = global_interior[l].rows(); + } + m_shift[i][numPatch] = 0; + m_shift[i][numPatch] = m_operator[i].rows() - accumulate(m_shift[i].begin(),m_shift[i].end(),0); + + // Put shift on zero + shift0 = 0; + for(int j = 0 ; j < numPatch ; j++) + { + const gsSparseMatrix block = m_operator[i].block(shift0,shift0,m_shift[i][j],m_shift[i][j]); + Eigen::IncompleteLUT ilu; + ilu.setFillfactor(1); + ilu.compute(block); + m_ILUT[i][j] = ilu.m_lu; + + m_P[i][j] = ilu.m_P; + m_Pinv[i][j] = ilu.m_Pinv; + shift0 = shift0 + m_shift[i][j]; - } + } - shift0 = 0; - // Obtain the blocks of the matrix - m_ddB[i].resize(numPatch+1); - m_ddC[i].resize(numPatch+1); + shift0 = 0; + // Obtain the blocks of the matrix + m_ddB[i].resize(numPatch+1); + m_ddC[i].resize(numPatch+1); - for(int j = 0 ; j < numPatch+1 ; j++) - { - m_ddB[i][j] = m_operator[i].block(m_operator[i].rows()-m_shift[i][numPatch],shift0,m_shift[i][numPatch],m_shift[i][j]); - m_ddC[i][j] = m_operator[i].block(shift0,m_operator[i].cols()-m_shift[i][numPatch],m_shift[i][j],m_shift[i][numPatch]); - shift0 = shift0 + m_shift[i][j]; - } - shift0 = 0; - } - - m_A_aprox.resize(numLevels); - for(int i = 0 ; i < numLevels ; i++) - { - // Define the A_aprox matrix - m_A_aprox[i] = gsSparseMatrix<>(m_operator[i].rows(),m_operator[i].cols()); - - // Retrieve a block of each patch - for(int k=0; k< numPatch; k++) - { - m_A_aprox[i].block(shift0,shift0,m_shift[i][k],m_shift[i][k]) = m_ILUT[i][k]; - shift0 = shift0 + m_shift[i][k]; - } - shift0 = 0; - m_ddBtilde[i].resize(numPatch); - m_ddCtilde[i].resize(numPatch); - - for(int j=0 ; j < numPatch ; j ++) - { - m_ddBtilde[i][j] = gsSparseMatrix<>(m_shift[i][j],m_shift[i][numPatch]); - m_ddCtilde[i][j] = 
gsSparseMatrix<>(m_shift[i][j],m_shift[i][numPatch]); - for(int k=0 ; k < m_shift[i][numPatch]; k++) - { - gsMatrix<> Brhs = m_ddC[i][j].col(k); - gsMatrix<> Crhs = m_ddC[i][j].col(k); - m_ddBtilde[i][j].col(k) = m_ILUT[i][j].template triangularView().transpose().solve(Brhs); - m_ddCtilde[i][j].col(k) = m_ILUT[i][j].template triangularView().solve(Crhs); - } - } - - // Define matrix S - m_S[i] = m_ddC[i][numPatch]; - for(int l = 0 ; l < numPatch ; l++) - { - m_S[i] = m_S[i] - m_ddBtilde[i][l].transpose()*m_ddCtilde[i][l]; - } + for(int j = 0 ; j < numPatch+1 ; j++) + { + m_ddB[i][j] = m_operator[i].block(m_operator[i].rows()-m_shift[i][numPatch],shift0,m_shift[i][numPatch],m_shift[i][j]); + m_ddC[i][j] = m_operator[i].block(shift0,m_operator[i].cols()-m_shift[i][numPatch],m_shift[i][j],m_shift[i][numPatch]); + shift0 = shift0 + m_shift[i][j]; + } + shift0 = 0; + } + + m_A_aprox.resize(numLevels); + for(int i = 0 ; i < numLevels ; i++) + { + // Define the A_aprox matrix + m_A_aprox[i] = gsSparseMatrix(m_operator[i].rows(),m_operator[i].cols()); + + // Retrieve a block of each patch + for(int k=0; k< numPatch; k++) + { + m_A_aprox[i].block(shift0,shift0,m_shift[i][k],m_shift[i][k]) = m_ILUT[i][k]; + shift0 = shift0 + m_shift[i][k]; + } + shift0 = 0; + m_ddBtilde[i].resize(numPatch); + m_ddCtilde[i].resize(numPatch); + + for(int j=0 ; j < numPatch ; j ++) + { + m_ddBtilde[i][j] = gsSparseMatrix(m_shift[i][j],m_shift[i][numPatch]); + m_ddCtilde[i][j] = gsSparseMatrix(m_shift[i][j],m_shift[i][numPatch]); + for(int k=0 ; k < m_shift[i][numPatch]; k++) + { + gsMatrix Brhs = m_ddC[i][j].col(k); + gsMatrix Crhs = m_ddC[i][j].col(k); + m_ddBtilde[i][j].col(k) = m_ILUT[i][j].template triangularView().transpose().solve(Brhs); + m_ddCtilde[i][j].col(k) = m_ILUT[i][j].template triangularView().solve(Crhs); + } + } + + // Define matrix S + m_S[i] = m_ddC[i][numPatch]; + for(int l = 0 ; l < numPatch ; l++) + { + m_S[i] = m_S[i] - m_ddBtilde[i][l].transpose()*m_ddCtilde[i][l]; + } 
- // Fill matrix A_aprox - for(int m = 0 ; m < numPatch ; m++) - { - m_A_aprox[i].block(shift0,m_A_aprox[i].rows() - m_shift[i][numPatch],m_shift[i][m],m_shift[i][numPatch]) = m_ddCtilde[i][m]; - m_A_aprox[i].block(m_A_aprox[i].rows() - m_shift[i][numPatch],shift0,m_shift[i][numPatch],m_shift[i][m]) = m_ddBtilde[i][m].transpose(); - shift0 = shift0 + m_shift[i][m]; - } - shift0 = 0; + // Fill matrix A_aprox + for(int m = 0 ; m < numPatch ; m++) + { + m_A_aprox[i].block(shift0,m_A_aprox[i].rows() - m_shift[i][numPatch],m_shift[i][m],m_shift[i][numPatch]) = m_ddCtilde[i][m]; + m_A_aprox[i].block(m_A_aprox[i].rows() - m_shift[i][numPatch],shift0,m_shift[i][numPatch],m_shift[i][m]) = m_ddBtilde[i][m].transpose(); + shift0 = shift0 + m_shift[i][m]; + } + shift0 = 0; - // Preform ILUT on the S-matrix! - Eigen::IncompleteLUT ilu; - ilu.setFillfactor(1); - gsSparseMatrix<> II = m_S[i]; - ilu.compute(II); - m_A_aprox[i].block(m_A_aprox[i].rows() - m_shift[i][numPatch],m_A_aprox[i].rows() - m_shift[i][numPatch],m_shift[i][numPatch],m_shift[i][numPatch]) = ilu.m_lu; - } - } + // Perform ILUT on the S-matrix! 
+ Eigen::IncompleteLUT ilu; + ilu.setFillfactor(1); + gsSparseMatrix II = m_S[i]; + ilu.compute(II); + m_A_aprox[i].block(m_A_aprox[i].rows() - m_shift[i][numPatch],m_A_aprox[i].rows() - m_shift[i][numPatch],m_shift[i][numPatch],m_shift[i][numPatch]) = ilu.m_lu; + } + } - real_t Time_Block_ILUT_Factorization = clock.stop(); - gsInfo << "\n|| Setup Timings || " < & rhs, - const gsFunctionExpr & sol_exact, - gsMatrix& x, - const int& numSmoothing, - gsMatrix f, - const int& typeSolver, - int& iterTot, - int& typeCycle_p, - int& typeCycle_h, - int numLevels, - const int& numCoarsening, - const int& numDegree, - const int& numRefine, - const int& numBenchmark, - const int& typeMultigrid, - const int& typeBCHandling, - gsGeometry<>::Ptr geo, - const int& typeLumping, - const gsMatrix<>& hp, - const int& typeProjection, - const int& typeSmoother, - const int& typeCoarseOperator) - { - gsStopwatch clock; + real_t Time_Block_ILUT_Factorization = clock.stop(); + gsInfo << "\n|| Setup Timings || " < & rhs, + gsMatrix& x, + gsMatrix f, + const int& iterTot, + const int& numLevels, + const int& typeMultigrid, + const gsMatrix& hp, + const int& typeCoarseOperator) + { + gsStopwatch clock; - if(typeSolver == 1) - { - x = gsMatrix<>::Random(m_operator[numLevels-1].rows(),1); - } + if (Base::typeSolver == 1) + { + x = gsMatrix::Random(m_operator[numLevels-1].rows(),1); + } - gsMatrix<> b; - typeSolver == 1 ? b = m_assembler.back().rhs() : b = f; - - - // Determine residual and L2 error - real_t r0 = (m_operator[numLevels-1]*x - b).norm(); - real_t r = r0; - real_t tol = 1e-8; - int iter = 1; - int numCoarseCycles = 0; - - // Solve with p-multigrid method - real_t r_old = r0; - clock.restart(); - while( (typeSolver == 1 || typeSolver == 5) ? 
r/r0 > tol && iter < 100000 : iter < 2) - { - // Call solver from base class - Base::solve(b, m_basis, x, numLevels, numCoarsening, numRefine, numSmoothing, numCoarseCycles, - typeCycle_p, typeCycle_h, typeSolver, typeBCHandling, *m_bcInfo_ptr, *m_mp_ptr, geo, - typeLumping, typeProjection, typeSmoother, - m_prolongation_P, m_restriction_P, - m_prolongation_M, m_restriction_M, - m_prolongation_H, m_restriction_H, hp); - numCoarseCycles = 0; - r = (m_operator[numLevels-1]*x - b).norm(); - if( r_old < r) - { - gsInfo << "Residual increased during solving!!! " < solMG = m_assembler.back().constructSolution(x); - // gsNormL2 L2Norm(solMG,sol_exact); - // real_t errorL2 = L2Norm.compute(); - // gsInfo << "Residual after solving: " << r <(solMG, "Multigrid_solution", 100*x.rows()); - // gsField<> Exact( *m_mp_ptr, sol_exact, false ); - // gsWriteParaview<>( Exact, "Exact_solution", 100*x.rows()); - } - } - -private: - - /// @brief Apply coarse solver - virtual void solvecoarse(const gsMatrix& rhs, - gsMatrix& x, - const int& numLevels) - { - gsInfo << "Coarse solver is applied! " < b; + Base::typeSolver == 1 ? b = m_assembler.back().rhs() : b = f; + + + // Determine residual and L2 error + real_t r0 = (m_operator[numLevels-1]*x - b).norm(); + real_t r = r0; + int iter = 1; + + // Solve with p-multigrid method + real_t r_old = r0; + clock.restart(); + while( (Base::typeSolver == 1 || Base::typeSolver == 5) ? r/r0 > Base::tol && iter < Base::maxIter : iter < 2) + { + // Call solver from base class + Base::solve(b, m_bases, x, numLevels, + *m_bcInfo_ptr, *m_mp_ptr, + m_prolongation_P, m_restriction_P, + m_prolongation_M, m_restriction_M, + m_prolongation_H, m_restriction_H, hp); + r = (m_operator[numLevels-1]*x - b).norm(); + if ( r_old < r) + { + gsInfo << "Residual increased during solving!!! " <& rhs, + gsMatrix& x, + const int& numLevels) + { + gsInfo << "Coarse solver is applied! 
" < prolongation_M(const int& numLevels, - std::vector > > m_basis, - const int& typeLumping, - const int& typeBCHandling, - gsGeometry<>::Ptr geo, - const int& typeProjection) - { - // Define the low and high order basis - gsMultiBasis<> basisL = *m_basis[numLevels-2]; - gsMultiBasis<> basisH = *m_basis[numLevels-1]; - - // Determine matrix M (high_order * high_order) - typedef gsExprAssembler::geometryMap geometryMap; - typedef gsExprAssembler::variable variable; - typedef gsExprAssembler::space space; - gsExprAssembler ex2(1,1); - geometryMap G2 = ex2.getMap(*m_mp_ptr); - space w_n = ex2.getSpace(basisH ,1, 0); - w_n.setInterfaceCont(0); - if(typeBCHandling == 1) - { - w_n.addBc(m_bcInfo_ptr->get("Dirichlet")); - } - ex2.setIntegrationElements(basisH); - ex2.initSystem(); - ex2.assemble(w_n * meas(G2) ); - return ex2.rhs(); - } - - /// @brief Construct prolongation operator at level numLevels - virtual gsSparseMatrix prolongation_P(const int& numLevels, - std::vector > > m_basis, - const int& typeLumping, - const int& typeBCHandling, - gsGeometry<>::Ptr geo, - const int& typeProjection) - { - // Define the low and high order basis - gsMultiBasis<> basisL = *m_basis[numLevels-2]; - gsMultiBasis<> basisH = *m_basis[numLevels-1]; - - // Determine matrix P (high_order * low_order) - typedef gsExprAssembler::geometryMap geometryMap; - gsExprAssembler ex(1,1); - geometryMap G = ex.getMap(*m_mp_ptr); - typedef gsExprAssembler::variable variable; - typedef gsExprAssembler::space space; - space v_n = ex.getSpace(basisH ,1, 0); - v_n.setInterfaceCont(0); - space u_n = ex.getTestSpace(v_n , basisL); - u_n.setInterfaceCont(0); - if(typeBCHandling == 1) - { - v_n.addBc(m_bcInfo_ptr->get("Dirichlet")); - u_n.addBc(m_bcInfo_ptr->get("Dirichlet")); - } - ex.setIntegrationElements(basisH); - ex.initSystem(); - ex.assemble(u_n*meas(G) * v_n.tr()); - gsSparseMatrix<> P = ex.matrix().transpose(); - return P; - } - - /// @brief Construct restriction operator at level numLevels - 
virtual gsMatrix restriction_M(const int& numLevels, - std::vector > > m_basis, - const int& typeLumping, - const int& typeBCHandling, - gsGeometry<>::Ptr geo, - const int& typeProjection) - { - // Define the low and high order basis - gsMultiBasis<> basisL = *m_basis[numLevels-2]; - gsMultiBasis<> basisH = *m_basis[numLevels-1]; + /// @brief Construct prolongation operator at level numLevels + virtual gsMatrix prolongation_M(const int& numLevels, + std::vector > > m_bases) + { + // Define the low and high order bases + gsMultiBasis basesL = *m_bases[numLevels-2]; + gsMultiBasis basesH = *m_bases[numLevels-1]; + + // Determine matrix M (high_order * high_order) + typedef gsExprAssembler::geometryMap geometryMap; + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; + gsExprAssembler ex2(1,1); + geometryMap G2 = ex2.getMap(*m_mp_ptr); + space w_n = ex2.getSpace(basesH ,1, 0); + w_n.setInterfaceCont(0); + if (Base::typeBCHandling == 1) + { + w_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + } + ex2.setIntegrationElements(basesH); + ex2.initSystem(); + ex2.assemble(w_n * meas(G2) ); + return ex2.rhs(); + } + + /// @brief Construct prolongation operator at level numLevels + virtual gsSparseMatrix prolongation_P(const int& numLevels, + std::vector > > m_bases) + { + // Define the low and high order bases + gsMultiBasis basesL = *m_bases[numLevels-2]; + gsMultiBasis basesH = *m_bases[numLevels-1]; + + // Determine matrix P (high_order * low_order) + typedef gsExprAssembler::geometryMap geometryMap; + gsExprAssembler ex(1,1); + geometryMap G = ex.getMap(*m_mp_ptr); + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; + space v_n = ex.getSpace(basesH ,1, 0); + v_n.setInterfaceCont(0); + space u_n = ex.getTestSpace(v_n , basesL); + u_n.setInterfaceCont(0); + if (Base::typeBCHandling == 1) + { + v_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + u_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + } + 
ex.setIntegrationElements(basesH); + ex.initSystem(); + ex.assemble(u_n*meas(G) * v_n.tr()); + gsSparseMatrix P = ex.matrix().transpose(); + return P; + } + + /// @brief Construct restriction operator at level numLevels + virtual gsMatrix restriction_M(const int& numLevels, + std::vector > > m_bases) + { + // Define the low and high order bases + gsMultiBasis basesL = *m_bases[numLevels-2]; + gsMultiBasis basesH = *m_bases[numLevels-1]; - // Determine matrix M (low_order * low_order) - typedef gsExprAssembler::geometryMap geometryMap; - typedef gsExprAssembler::variable variable; - typedef gsExprAssembler::space space; - gsExprAssembler ex2(1,1); - geometryMap G2 = ex2.getMap(*m_mp_ptr); - space w_n = ex2.getSpace(basisL ,1, 0); - w_n.setInterfaceCont(0); - if(typeBCHandling == 1) - { - w_n.addBc(m_bcInfo_ptr->get("Dirichlet")); - } - ex2.setIntegrationElements(basisL); - ex2.initSystem(); - ex2.assemble(w_n * meas(G2) ); - return ex2.rhs(); - } - - /// @brief Construct restriction operator at level numLevels - virtual gsSparseMatrix restriction_P(const int& numLevels, - std::vector > > m_basis, - const int& typeLumping, - const int& typeBCHandling, - gsGeometry<>::Ptr geo, - const int& typeProjection) - { - // Define the low and high order basis - gsMultiBasis<> basisL = *m_basis[numLevels-2]; - gsMultiBasis<> basisH = *m_basis[numLevels-1]; + // Determine matrix M (low_order * low_order) + typedef gsExprAssembler::geometryMap geometryMap; + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; + gsExprAssembler ex2(1,1); + geometryMap G2 = ex2.getMap(*m_mp_ptr); + space w_n = ex2.getSpace(basesL ,1, 0); + w_n.setInterfaceCont(0); + if (Base::typeBCHandling == 1) + { + w_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + } + ex2.setIntegrationElements(basesL); + ex2.initSystem(); + ex2.assemble(w_n * meas(G2) ); + return ex2.rhs(); + } + + /// @brief Construct restriction operator at level numLevels + virtual gsSparseMatrix 
restriction_P(const int& numLevels, + std::vector > > m_bases) + { + // Define the low and high order bases + gsMultiBasis basesL = *m_bases[numLevels-2]; + gsMultiBasis basesH = *m_bases[numLevels-1]; - // Determine matrix P (high_order * low_order) - gsExprAssembler ex(1,1); - typedef gsExprAssembler::geometryMap geometryMap; - geometryMap G = ex.getMap(*m_mp_ptr); + // Determine matrix P (high_order * low_order) + gsExprAssembler ex(1,1); + typedef gsExprAssembler::geometryMap geometryMap; + geometryMap G = ex.getMap(*m_mp_ptr); - typedef gsExprAssembler::variable variable; - typedef gsExprAssembler::space space; - space v_n = ex.getSpace(basisH ,1, 0); - v_n.setInterfaceCont(0); - space u_n = ex.getTestSpace(v_n , basisL); - u_n.setInterfaceCont(0); - if( typeBCHandling == 1) - { - u_n.addBc(m_bcInfo_ptr->get("Dirichlet")); - v_n.addBc(m_bcInfo_ptr->get("Dirichlet")); - } - ex.setIntegrationElements(basisH); - ex.initSystem(); - ex.assemble(u_n * meas(G)* v_n.tr()); - gsSparseMatrix<> P = ex.matrix(); - return P; - } + typedef gsExprAssembler::variable variable; + typedef gsExprAssembler::space space; + space v_n = ex.getSpace(basesH ,1, 0); + v_n.setInterfaceCont(0); + space u_n = ex.getTestSpace(v_n , basesL); + u_n.setInterfaceCont(0); + if (Base::typeBCHandling == 1) + { + u_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + v_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + } + ex.setIntegrationElements(basesH); + ex.initSystem(); + ex.assemble(u_n * meas(G)* v_n.tr()); + gsSparseMatrix P = ex.matrix(); + return P; + } - /// @brief Apply fixed number of presmoothing steps - virtual void presmoothing(const gsMatrix& rhs, - gsMatrix& x, - const int& numLevels, - const int& numSmoothing, - gsMatrix & fineRes, - const int& numRefine, - const int& typeSmoother, - const gsMatrix<>& hp) - { - gsInfo << "Residual before presmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels < e; - gsMatrix<> d = rhs-m_operator[numLevels-1]*x; - e = 
m_Pinv[numLevels-1][0]*d; - e = m_ILUT[numLevels-1][0].template triangularView().solve(e); - e = m_ILUT[numLevels-1][0].template triangularView().solve(e); - e = m_P[numLevels-1][0]*e; - x = x + e; - } - } - if(typeSmoother == 2) - { - internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs); - } - if(typeSmoother == 3) - { - m_SCMS[numLevels-1]->step(rhs,x); - } - if(typeSmoother == 5) - { - if(hp(numLevels-2,0) == 1 && hp(hp.rows()-1,0) == 0) - { - internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs); - } - else - { - gsMatrix<> e; - gsMatrix<> d = rhs-m_operator[numLevels-1]*x; - e = m_A_aprox[numLevels-1].template triangularView().solve(d); - e = m_A_aprox[numLevels-1].template triangularView().solve(e); - x = x + e; - } - } - } - // gsInfo << "Residual after presmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels <& rhs, - gsMatrix& x, - const int& numLevels, - const int& numSmoothing, - gsMatrix & fineCorr, - gsMatrix & postRes, - const int& typeSolver, - const int& numRefine, - const int& typeSmoother, - const gsMatrix<>& hp) - { - real_t alpha = 1; - x = x - alpha*fineCorr; - gsInfo << "Residual before postsmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels < e; - gsMatrix<> d = rhs-m_operator[numLevels-1]*x; - e = m_Pinv[numLevels-1][0]*d; - e = m_ILUT[numLevels-1][0].template triangularView().solve(e); - e = m_ILUT[numLevels-1][0].template triangularView().solve(e); - e = m_P[numLevels-1][0]*e; - x = x + e; - } - } - if(typeSmoother == 2) - { - ( typeSolver == 3 ? internal::reverseGaussSeidelSweep(m_operator[numLevels-1],x,rhs) : internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs)); - } - if(typeSmoother == 3) - { - m_SCMS[numLevels-1]->step(rhs,x); - } - if(typeSmoother == 5) - { - if(hp(numLevels-2,0) == 1 && hp(hp.rows()-1,0) == 0) - { - ( typeSolver == 3 ? 
internal::reverseGaussSeidelSweep(m_operator[numLevels-1],x,rhs) : internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs)); - } - else - { - gsMatrix<> e; - gsMatrix<> d = rhs-m_operator[numLevels-1]*x; - e = m_A_aprox[numLevels-1].template triangularView().solve(d); - e = m_A_aprox[numLevels-1].template triangularView().solve(e); - x = x + e; - } - } - postRes = rhs - m_operator[numLevels-1]*x; - // gsInfo << "Residual after postsmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels < -struct gsXBraidMultigrid : public gsXBraidMultigridBase -{ - // Default constructor - gsXBraidMultigrid() + /// @brief Apply fixed number of presmoothing steps + virtual void presmoothing(const gsMatrix& rhs, + gsMatrix& x, + const int& numLevels, + gsMatrix & fineRes, + const gsMatrix& hp) + { + gsInfo << "Residual before presmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels < e; + gsMatrix d = rhs-m_operator[numLevels-1]*x; + e = m_Pinv[numLevels-1][0]*d; + e = m_ILUT[numLevels-1][0].template triangularView().solve(e); + e = m_ILUT[numLevels-1][0].template triangularView().solve(e); + e = m_P[numLevels-1][0]*e; + x = x + e; + } + } + if (Base::typeSmoother == 2) + { + internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs); + } + if (Base::typeSmoother == 3) + { + m_SCMS[numLevels-1]->step(rhs,x); + } + if (Base::typeSmoother == 5) + { + if (hp(numLevels-2,0) == 1 && hp(hp.rows()-1,0) == 0) + { + internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs); + } + else + { + gsMatrix e; + gsMatrix d = rhs-m_operator[numLevels-1]*x; + e = m_A_aprox[numLevels-1].template triangularView().solve(d); + e = m_A_aprox[numLevels-1].template triangularView().solve(e); + x = x + e; + } + } + } + // gsInfo << "Residual after presmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels <& rhs, + gsMatrix& x, + const int& numLevels, + gsMatrix & fineCorr, + gsMatrix & postRes, + const gsMatrix& hp) + { + 
real_t alpha = 1; + x = x - alpha*fineCorr; + gsInfo << "Residual before postsmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels < e; + gsMatrix d = rhs-m_operator[numLevels-1]*x; + e = m_Pinv[numLevels-1][0]*d; + e = m_ILUT[numLevels-1][0].template triangularView().solve(e); + e = m_ILUT[numLevels-1][0].template triangularView().solve(e); + e = m_P[numLevels-1][0]*e; + x = x + e; + } + } + if (Base::typeSmoother == 2) + { + ( Base::typeSolver == 3 ? internal::reverseGaussSeidelSweep(m_operator[numLevels-1],x,rhs) : internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs)); + } + if (Base::typeSmoother == 3) + { + m_SCMS[numLevels-1]->step(rhs,x); + } + if (Base::typeSmoother == 5) + { + if (hp(numLevels-2,0) == 1 && hp(hp.rows()-1,0) == 0) + { + ( Base::typeSolver == 3 ? internal::reverseGaussSeidelSweep(m_operator[numLevels-1],x,rhs) : internal::gaussSeidelSweep(m_operator[numLevels-1],x,rhs)); + } + else + { + gsMatrix e; + gsMatrix d = rhs-m_operator[numLevels-1]*x; + e = m_A_aprox[numLevels-1].template triangularView().solve(d); + e = m_A_aprox[numLevels-1].template triangularView().solve(e); + x = x + e; + } + } + postRes = rhs - m_operator[numLevels-1]*x; + // gsInfo << "Residual after postsmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels < + struct gsXBraidMultigrid : public gsXBraidMultigridBase { - gsInfo << "The specific case"; - } -}; + // Default constructor + gsXBraidMultigrid() + { + gsInfo << "The specific case"; + } + }; } // namespace gismo diff --git a/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp index 7fe1d3108f..0b4c6586d0 100644 --- a/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp +++ b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp @@ -34,7 +34,7 @@ namespace gismo { \brief Derived class implementing the XBraid wrapper for the heat equation */ template -class gsXBraid_app 
: public gsXBraid< gsVector > +class gsXBraid_app : public gsXBraid< gsMatrix > { private: // Spatial discretisation parameters @@ -46,7 +46,7 @@ class gsXBraid_app : public gsXBraid< gsVector > // Spatial discretizations gsMultiPatch mp; - gsMultiBasis basisH, basisL; + gsMultiBasis basesH, basesL; // Boundary conditions gsBoundaryConditions bc; @@ -59,15 +59,15 @@ class gsXBraid_app : public gsXBraid< gsVector > gsFunctionExpr f, u0, ms; // Solution - gsVector sol; + gsMatrix sol; // Single-grid solver - typedef typename gsSparseSolver::CGDiagonal solver; - solver* m_solver; - + typedef typename gsSparseSolver::CGDiagonal solver_old; + solver_old* m_solver_old; + // Multigrid solver - typedef typename gsSparseSolver::LU lu; - gsXBraidMultigrid >* m_mgsolver; + typedef gsXBraidMultigrid::LU , gsCDRAssembler > solver_mg; + solver_mg* m_solver; gsMatrix hp; typedef typename gsExprAssembler::geometryMap geometryMap; @@ -86,7 +86,7 @@ class gsXBraid_app : public gsXBraid< gsVector > index_t numElevate, index_t numIncrease, std::string& fn) - : gsXBraid< gsVector >::gsXBraid(comm, tstart, tstop, (int)numSteps), + : gsXBraid< gsMatrix >::gsXBraid(comm, tstart, tstop, (int)numSteps), numRefine(numRefine), numElevate(numElevate), numIncrease(numIncrease), @@ -96,18 +96,19 @@ class gsXBraid_app : public gsXBraid< gsVector > tstop(tstop), tstep( (tstop-tstart)/numSteps ), K(1,1), M(1,1), - m_solver(new solver) + m_solver_old(nullptr), + m_solver(nullptr) { ///////////////////////////////////////////////////////////////////////////////////////////// // Code for heat equation starts here // ///////////////////////////////////////////////////////////////////////////////////////////// - + gsFileData fd(fn); if (this->id() == 0) gsInfo << "Loaded file " << fd.lastPath() << "\n"; fd.getId(0, mp); // id=0: Multipatch domain - basisH = gsMultiBasis(mp); - basisL = gsMultiBasis(mp); + basesH = gsMultiBasis(mp); + basesL = gsMultiBasis(mp); fd.getId(1, f); // id=1: right-hand side 
function if (this->id() == 0) gsInfo << "Source function " << f << "\n"; @@ -142,63 +143,63 @@ class gsXBraid_app : public gsXBraid< gsVector > this->SetStorage(Topt.getInt("numStorage")); this->SetTemporalNorm(Topt.getInt("norm")); - if (Topt.getInt("tol") == 1) - this->SetAbsTol(Topt.getReal("absTol")); - else - this->SetRelTol(Topt.getReal("relTol")); - - if (Topt.getSwitch("fmg")) this->SetFMG(); + if (Topt.getSwitch("fmg")) this->SetFMG(); if (Topt.getSwitch("incrMaxLevels")) this->SetIncrMaxLevels(); if (Topt.getSwitch("periodic")) this->SetPeriodic(1); else this->SetPeriodic(0); if (Topt.getSwitch("refine")) this->SetRefine(1); else this->SetRefine(0); if (Topt.getSwitch("sequential")) this->SetSeqSoln(1); else this->SetSeqSoln(0); if (Topt.getSwitch("skip")) this->SetSkip(1); else this->SetSkip(0); if (Topt.getSwitch("spatial")) this->SetSpatialCoarsenAndRefine(); + if (Topt.getSwitch("tol")) this->SetAbsTol(Topt.getReal("absTol")); + else this->SetRelTol(Topt.getReal("relTol")); fd.getId(7, Sopt); // id=6: spatial solver options if (this->id() == 0) gsInfo << "Spatial solver options:\n" << Sopt << "\n"; - - // Elevate and p-refine the basis to order k + numElevate - // where k is the highest degree in the basisH + + int numLevels = 9; // todo!!!! 
+ hp = gsMatrix<>::Zero(numLevels-1); + + // Elevate and p-refine the bases to order k + numElevate + // where k is the highest degree in the basesH if ( numElevate > -1 ) { // Find maximum degree with respect to all the variables - int tmp = basisH.maxDegree(0); + int tmp = basesH.maxDegree(0); for (short_t j = 1; j < mp.parDim(); ++j ) - if ( tmp < basisH.maxDegree(j) ) - tmp = basisH.maxDegree(j); + if ( tmp < basesH.maxDegree(j) ) + tmp = basesH.maxDegree(j); // Elevate all degrees uniformly tmp += numElevate; - basisH.setDegree(tmp); - basisL.setDegree(tmp); + basesH.setDegree(tmp); + basesL.setDegree(tmp); } - // Increase and p-refine the basis + // Increase and p-refine the bases if (numIncrease > 0) { - basisH.degreeIncrease(numIncrease); - basisL.degreeIncrease(numIncrease); + basesH.degreeIncrease(numIncrease); + basesL.degreeIncrease(numIncrease); } - // h-refine the basis + // h-refine the bases for (int i = 0; i < numRefine; ++i) { - basisH.uniformRefine(); - basisL.uniformRefine(); + basesH.uniformRefine(); + basesL.uniformRefine(); } - // Set the basis - K.setIntegrationElements(basisH); - M.setIntegrationElements(basisH); + // Set the bases + K.setIntegrationElements(basesH); + M.setIntegrationElements(basesH); // Set the geometry map geometryMap G_K = K.getMap(mp); geometryMap G_M = M.getMap(mp); // Set the discretization space - space u_K = K.getSpace(basisH); - space u_M = M.getSpace(basisH); + space u_K = K.getSpace(basesH); + space u_M = M.getSpace(basesH); u_K.setInterfaceCont(0); u_M.setInterfaceCont(0); u_K.addBc( bc.get("Dirichlet") ); @@ -220,6 +221,30 @@ class gsXBraid_app : public gsXBraid< gsVector > variable g_Neumann = K.getBdrFunction(); K.assembleRhsBc(u_K * g_Neumann.val() * nv(G_K).norm(), bc.neumannSides() ); + // Initialize the solver + if (Sopt.getInt("numLevels") == 0) + { + // Single grid solver + m_solver_old = new solver_old(); + m_solver_old->setMaxIterations(Sopt.getInt("maxIter")); + 
m_solver_old->setTolerance(Sopt.getReal("tol")); + } else + { + // Multigrid solver + m_solver = new solver_mg(mp, basesL, bc); + m_solver->setMaxIter(Sopt.getInt("maxIter")); + m_solver->setTolerance(Sopt.getReal("tol")); + m_solver->setNumLevels(Sopt.getInt("numLevels")); + m_solver->setNumSmoothing(Sopt.getInt("numSmoothing")); + m_solver->setTypeBCHandling(Sopt.getInt("bcHandling")); + m_solver->setTypeCycle_h(Sopt.getInt("cycle_h")); + m_solver->setTypeCycle_p(Sopt.getInt("cycle_p")); + m_solver->setTypeLumping(Sopt.getInt("lumping")); + m_solver->setTypeProjection(Sopt.getInt("projection")); + m_solver->setTypeSmoother(Sopt.getInt("smoother")); + m_solver->setTypeSolver(Sopt.getInt("solver")); + } + if (this->id() == 0) { gsStopwatch clock; clock.restart(); @@ -234,8 +259,9 @@ class gsXBraid_app : public gsXBraid< gsVector > for ( int i = 1; i<=numSteps; ++i) // for all timesteps // Compute the system for the timestep i (rhs is assumed constant wrt time) sol = m_solver->compute(M.matrix() - ).solve(tstep*K.rhs() + - (M.matrix()-tstep*K.matrix())*sol); + ).solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*K.matrix())*sol, + sol); break; case gsXBraid_typeMethod::BE_BE: @@ -245,8 +271,9 @@ class gsXBraid_app : public gsXBraid< gsVector > // Compute the system for the timestep i (rhs is assumed constant wrt time) sol = m_solver->compute(M.matrix() + tstep*K.matrix() - ).solve(tstep*K.rhs() + - (M.matrix())*sol); + ).solveWithGuess(tstep*K.rhs() + + (M.matrix())*sol, + sol); break; case gsXBraid_typeMethod::CN_CN: @@ -257,8 +284,9 @@ class gsXBraid_app : public gsXBraid< gsVector > // Compute the system for the timestep i (rhs is assumed constant wrt time) sol = m_solver->compute(M.matrix() + tstep*0.5*K.matrix() - ).solve(tstep*K.rhs() + - (M.matrix()-tstep*0.5*K.matrix())*sol); + ).solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*0.5*K.matrix())*sol, + sol); break; default: @@ -283,7 +311,8 @@ class gsXBraid_app : public gsXBraid< gsVector > /// Destructor 
virtual ~gsXBraid_app() { - delete m_solver; + if(m_solver_old) delete m_solver_old; + if(m_solver) delete m_solver; } /// Creates instance from command line argument @@ -303,7 +332,7 @@ class gsXBraid_app : public gsXBraid< gsVector > index_t numSteps = 40; index_t typeMethod = (index_t)gsXBraid_typeMethod::CN_BE; T tfinal = 0.1; - + gsCmdLine cmd("Tutorial on solving a Heat equation problem using parallel-in-time multigrid."); // Problem parameters @@ -336,7 +365,7 @@ class gsXBraid_app : public gsXBraid< gsVector > override #endif { - gsVector* u = new gsVector(M.numDofs()); + gsMatrix* u = new gsMatrix(M.numDofs(), 1); if (t != tstart) { // Intermediate solution @@ -354,23 +383,23 @@ class gsXBraid_app : public gsXBraid< gsVector > braid_Int Step(braid_Vector u, braid_Vector ustop, braid_Vector fstop, - BraidStepStatus &pstatus) + BraidStepStatus &status) #if __cplusplus >= 201103L || _MSC_VER >= 1600 override #endif { - gsVector* u_ptr = (gsVector*) u; - gsVector* ustop_ptr = (gsVector*) ustop; + gsMatrix* u_ptr = (gsMatrix*) u; + gsMatrix* ustop_ptr = (gsMatrix*) ustop; // XBraid forcing if (fstop != NULL) { - gsVector* fstop_ptr = (gsVector*) fstop; + gsMatrix* fstop_ptr = (gsMatrix*) fstop; *u_ptr += *fstop_ptr; } // Get time step information std::pair time = - static_cast(pstatus).timeInterval(); + static_cast(status).timeInterval(); T tstep(time.second - time.first); switch((gsXBraid_typeMethod)typeMethod) { @@ -383,7 +412,7 @@ class gsXBraid_app : public gsXBraid< gsVector > break; case gsXBraid_typeMethod::FE_BE: - if (static_cast(pstatus).level() == 0) { + if (static_cast(status).level() == 0) { // Forward Euler method (fine grid) *u_ptr = m_solver->compute(M.matrix() ).solveWithGuess(tstep*K.rhs() + @@ -418,7 +447,7 @@ class gsXBraid_app : public gsXBraid< gsVector > break; case gsXBraid_typeMethod::CN_BE: - if (static_cast(pstatus).level() == 0) { + if (static_cast(status).level() == 0) { *u_ptr = m_solver->compute(M.matrix() + tstep*0.5*K.matrix() 
).solveWithGuess(tstep*K.rhs() + @@ -439,13 +468,13 @@ class gsXBraid_app : public gsXBraid< gsVector > } // Carry out adaptive refinement in time - if (static_cast(pstatus).level() == 0) { - braid_Real error = static_cast(pstatus).error(); + if (static_cast(status).level() == 0) { + braid_Real error = static_cast(status).error(); if (error != braid_Real(-1.0)) { braid_Int rfactor = (braid_Int) std::ceil( std::sqrt( error / 1e-3) ); - pstatus.SetRFactor(rfactor); + status.SetRFactor(rfactor); } else - pstatus.SetRFactor(1); + status.SetRFactor(1); } return braid_Int(0); @@ -464,15 +493,15 @@ class gsXBraid_app : public gsXBraid< gsVector > /// Handles access for input/output braid_Int Access(braid_Vector u, - BraidAccessStatus &astatus) + BraidAccessStatus &status) #if __cplusplus >= 201103L || _MSC_VER >= 1600 override #endif { - if(static_cast(astatus).done() && - static_cast(astatus).timeIndex() == - static_cast(astatus).times()) { - gsVector* u_ptr = (gsVector*) u; + if (static_cast(status).done() && + static_cast(status).timeIndex() == + static_cast(status).times()) { + gsMatrix* u_ptr = (gsMatrix*) u; gsInfo << "norm of the solution = " << u_ptr->norm() << std::endl; } return braid_Int(0); @@ -486,9 +515,13 @@ class gsXBraid_app : public gsXBraid< gsVector > override #endif { - gsInfo << "Coarsen\n"; - gsVector *fu_ptr = (gsVector*) fu; - gsVector* cu = new gsVector(); + gsInfo << "Coarsen on level = " + << static_cast(status).level() + << " of " + << static_cast(status).levels() + << "\n"; + gsMatrix *fu_ptr = (gsMatrix*) fu; + gsMatrix* cu = new gsMatrix(); *cu = *fu_ptr; *cu_ptr = (braid_Vector) cu; return braid_Int(0); @@ -502,9 +535,13 @@ class gsXBraid_app : public gsXBraid< gsVector > override #endif { - gsInfo << "Refine\n"; - gsVector *cu_ptr = (gsVector*) cu; - gsVector* fu = new gsVector(); + gsInfo << "Refine on level = " + << static_cast(status).level() + << " of " + << static_cast(status).levels() + << "\n"; + gsMatrix *cu_ptr = (gsMatrix*) cu; 
+ gsMatrix* fu = new gsMatrix(); *fu = *cu_ptr; *fu_ptr = (braid_Vector) fu; return braid_Int(0); diff --git a/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml index f6868aa6da..5e1f8646ac 100644 --- a/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml +++ b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml @@ -22,12 +22,14 @@ - 0 1 0 2 0 3 + 0 2 + 0 3 + 0 4 - 0 4 + 0 1 @@ -58,14 +60,13 @@ + - - @@ -74,22 +75,27 @@ - + + - - - + + - + + + + - - + + + + From 475c1abef9daafe3410db5efcf07fc88e4ad7bff Mon Sep 17 00:00:00 2001 From: roeltielen Date: Fri, 28 May 2021 11:14:13 +0200 Subject: [PATCH 034/174] [WIP] p-multigrid and MGRIT --- .../gsXBraid/examples/gsXBraidMultigrid.h | 363 +++++++++++------- .../examples/xbraid_heatEquation_example.cpp | 227 +++++------ .../filedata/pde/heat2d_square_ibvp1.xml | 16 +- 3 files changed, 356 insertions(+), 250 deletions(-) diff --git a/extensions/gsXBraid/examples/gsXBraidMultigrid.h b/extensions/gsXBraid/examples/gsXBraidMultigrid.h index 931f27a5b4..aec574d164 100644 --- a/extensions/gsXBraid/examples/gsXBraidMultigrid.h +++ b/extensions/gsXBraid/examples/gsXBraidMultigrid.h @@ -21,7 +21,7 @@ namespace gismo { int typeLumping; int typeProjection; int typeSmoother; - int typeSolver; + gsMatrix<> hp; T tol; public: @@ -36,7 +36,6 @@ namespace gismo { typeLumping(1), typeProjection(1), typeSmoother(1), - typeSolver(1), tol(1e-8) {} @@ -46,8 +45,17 @@ namespace gismo { void setTolerance(T tol) { this->tol = tol; } - void setNumLevels(int numLevels) - { this->numLevels = numLevels; } + void setNumLevels(int numLevels, int typeProjection, int numDegree) + { + if(typeProjection == 1) + { + this->numLevels = numLevels - numDegree + 2; + } + else + { + this->numLevels = numLevels; + } + } void setNumSmoothing(int numSmoothing) { this->numSmoothing = numSmoothing; } @@ -70,22 +78,61 @@ namespace gismo { void setTypeSmoother(int typeSmoother) { 
this->typeSmoother = typeSmoother; } - void setTypeSolver(int typeSolver) - { this->typeSolver = typeSolver; } + void setCoarsening(gsMatrix<> hp) + { this->hp = hp; } + + virtual gsXBraidMultigridBase& compute(const gsSparseMatrix& mat, const T tstep, const int& numDegree, index_t typeMethod) + { + // Get arguments explicitly + gsMatrix x = gsMatrix<>::Zero(mat.rows(),1); + gsMatrix b = gsMatrix<>::Zero(mat.rows(),1); + gsFunctionExpr<> rhs("1",2); + int iterTot = 1; + int typeMultigrid = 2; + int typeCoarseOperator = 1; + + /// @brief Set-up p-multigrid solver + setup(rhs, + x, + b, + iterTot, + numLevels, + numDegree, + typeMultigrid, + hp, + typeCoarseOperator, + tstep, + typeMethod); + - virtual gsXBraidMultigridBase& compute(const gsSparseMatrix&) - { return *this; } + return *this; } virtual gsMatrix solveWithGuess(const gsMatrix& b, const gsMatrix& x0) { + // Get arguments explicitly gsMatrix x(x0); - solvecoarse(b, x, 1); + x = x0; + + gsFunctionExpr<> rhs("1",2); + int iterTot = 1; + int typeMultigrid = 2; + int typeCoarseOperator = 1; + + /// @brief Apply p-multigrid solver to given right-hand side on level l + solve(rhs, + x, + b, + iterTot, + numLevels, + typeMultigrid, + hp, + typeCoarseOperator); return x; } /// @brief Apply p-multigrid solver to given right-hand side on level l - virtual void solve(const gsMatrix & rhs, + virtual void solveMG(const gsMatrix & rhs, std::vector > > m_bases, gsMatrix& x, const int& numLevels, @@ -100,11 +147,10 @@ namespace gismo { const gsMatrix& hp) { if ( numLevels == 1) - { + { solvecoarse(rhs, x, numLevels); return; - } - + } if (hp(std::max(numLevels-2,0),0) == 0 ) { @@ -119,7 +165,7 @@ namespace gismo { coarseCorr.setZero(coarseRes.rows(),1); for( int j = 0 ; j < (typeCycle_p == 2 ? 
2 : 1) ; j++) { - solve(coarseRes, m_bases, coarseCorr, numLevels-1, + solveMG(coarseRes, m_bases, coarseCorr, numLevels-1, bcInfo, mp, m_prolongation_P, m_restriction_P, m_prolongation_M, m_restriction_M, @@ -147,7 +193,7 @@ namespace gismo { coarseCorr.setZero(coarseRes.rows(),1); for( int i = 0 ; i < (typeCycle_h == 2 ? 2 : 1) ; i++) { - solve(coarseRes, m_bases, coarseCorr, numLevels-1, + solveMG(coarseRes, m_bases, coarseCorr, numLevels-1, bcInfo, mp, m_prolongation_P, m_restriction_P, m_prolongation_M, m_restriction_M, @@ -163,20 +209,26 @@ namespace gismo { } } - /// @brief Setup p-multigrid to given linear system - virtual void setup(const gsMatrix & rhs, - std::vector > > m_bases, - gsMatrix& x, - const int& numLevels, - gsBoundaryConditions bcInfo, - gsMultiPatch mp, - std::vector >& m_prolongation_P, - std::vector >& m_restriction_P, - std::vector >& m_prolongation_M, - std::vector >& m_restriction_M, - std::vector >& m_prolongation_H, - std::vector >& m_restriction_H, - const gsMatrix& hp) {} + virtual void setup(const gsFunctionExpr & rhs, + gsMatrix& x, + gsMatrix f, + const int& iterTot, + const int& numLevels, + const int& numDegree, + const int& typeMultigrid, + const gsMatrix& hp, + const int& typeCoarseOperator, + T tstep, + index_t typeMethod){} + + virtual void solve(const gsFunctionExpr & rhs, + gsMatrix& x, + gsMatrix f, + const int& iterTot, + const int& numLevels, + const int& typeMultigrid, + const gsMatrix& hp, + const int& typeCoarseOperator){} /// @brief Apply fixed number of smoothing steps (pure virtual method) virtual void presmoothing(const gsMatrix& rhs, @@ -342,7 +394,7 @@ namespace gismo { * restriction operators are generated internally. Therefore, a * problem-specific assembler has to be passed as template argument. 
*/ - template + template struct gsXBraidMultigrid : public gsXBraidMultigridBase { private: @@ -416,9 +468,6 @@ namespace gismo { /// std::vector of std::vector of shift objects std::vector < std::vector< int > > m_shift; - /// std::vector of assembler objects - std::vector m_assembler; - public: // Constructor @@ -445,9 +494,8 @@ namespace gismo { const int& typeMultigrid, const gsMatrix& hp, const int& typeCoarseOperator, - const gsFunctionExpr coeff_diff, - const gsFunctionExpr coeff_conv, - const gsFunctionExpr coeff_reac) + T tstep, + index_t typeMethod) { for (int i = 1; i < numLevels; i++) { @@ -464,57 +512,72 @@ namespace gismo { m_bases.back()->degreeIncrease(); break; } } - - // Generate sequence of assembler objects and assemble - for (typename std::vector > >::iterator it = m_bases.begin(); - it != m_bases.end(); ++it) - { - m_assembler.push_back(Assembler(*m_mp_ptr, - *(*it).get(), - *m_bcInfo_ptr, - rhs, - coeff_diff, - coeff_conv, - coeff_reac, - (Base::typeBCHandling == 1 ? 
- dirichlet::elimination : - dirichlet::nitsche), - iFace::glue)); - } - - // Resize vector of operators + + // Generate sequence of matrix K and M m_operator.resize(numLevels); + gsStopwatch clock; + //gsInfo << "|| Multigrid hierarchy ||" <degree() << ", Ndof: " << m_bases[i]->totalSize() <::geometryMap geometryMap; + typedef typename gsExprAssembler::variable variable; + typedef typename gsExprAssembler::space space; + typedef typename gsExprAssembler::solution solution; + + gsExprAssembler K, M; + + // Set the bases + K.setIntegrationElements(*m_bases[i]); + M.setIntegrationElements(*m_bases[i]); + + // Set the geometry map + geometryMap G_K = K.getMap(*m_mp_ptr); + geometryMap G_M = M.getMap(*m_mp_ptr); + + // Set the discretization space + space u_K = K.getSpace(*m_bases[i]); + space u_M = M.getSpace(*m_bases[i]); + u_K.setInterfaceCont(0); + u_M.setInterfaceCont(0); + u_K.addBc( m_bcInfo_ptr->get("Dirichlet") ); + u_M.addBc( m_bcInfo_ptr->get("Dirichlet") ); + + // Set the source term + variable ff_K = K.getCoeff(rhs, G_K); + variable ff_M = M.getCoeff(rhs, G_M); + + // Initialize and assemble the system matrix + K.initSystem(); + K.assemble( igrad(u_K, G_K) * igrad(u_K, G_K).tr() * meas(G_K), u_K * ff_K * meas(G_K) ); + + // Initialize and assemble the mass matrix + M.initSystem(); + M.assemble( u_M * u_M.tr() * meas(G_M), u_M * ff_M * meas(G_M) ); + + + m_operator[i] = M.matrix() + tstep*K.matrix(); + switch(typeMethod) + { + case 0: m_operator[i] = M.matrix(); break; + case 1: m_operator[i] = M.matrix() + tstep*K.matrix(); break; + case 2: m_operator[i] = M.matrix() + 0.5*tstep*K.matrix(); + } + + } + real_t Time_Assembly = clock.stop(); + + + // Resize vector of operators m_prolongation_P.resize(numLevels-1); m_prolongation_M.resize(numLevels-1); m_prolongation_H.resize(numLevels-1); m_restriction_P.resize(numLevels-1); m_restriction_M.resize(numLevels-1); m_restriction_H.resize(numLevels-1); - - // Assemble operators at finest level - gsStopwatch clock; 
- gsInfo << "|| Multigrid hierarchy ||" <degree() << ", Ndof: " << m_bases[i]->totalSize() <degree() <totalSize() <::Random(m_operator[numLevels-1].rows(),1); - } - - gsMatrix b; - Base::typeSolver == 1 ? b = m_assembler.back().rhs() : b = f; - + gsMatrix b = f; // Determine residual and L2 error real_t r0 = (m_operator[numLevels-1]*x - b).norm(); @@ -781,14 +836,16 @@ namespace gismo { // Solve with p-multigrid method real_t r_old = r0; clock.restart(); - while( (Base::typeSolver == 1 || Base::typeSolver == 5) ? r/r0 > Base::tol && iter < Base::maxIter : iter < 2) + // Adjusted stopping criterion!! + while( r/b.norm() > Base::tol && iter < Base::maxIter ) { // Call solver from base class - Base::solve(b, m_bases, x, numLevels, + Base::solveMG(b, m_bases, x, numLevels, *m_bcInfo_ptr, *m_mp_ptr, m_prolongation_P, m_restriction_P, m_prolongation_M, m_restriction_M, m_prolongation_H, m_restriction_H, hp); + r = (m_operator[numLevels-1]*x - b).norm(); if ( r_old < r) { @@ -799,10 +856,11 @@ namespace gismo { iter++; } real_t Time_Solve = clock.stop(); - gsInfo << "\n|| Solver information || " <& x, const int& numLevels) { - gsInfo << "Coarse solver is applied! " < & fineRes, const gsMatrix& hp) { - gsInfo << "Residual before presmoothing: " << (rhs-m_operator[numLevels-1]*x).norm() << " at level " << numLevels < - struct gsXBraidMultigrid : public gsXBraidMultigridBase - { - // Default constructor - gsXBraidMultigrid() +// Create the subspace corrected mass smoother +gsPreconditionerOp<>::Ptr setupSubspaceCorrectedMassSmoother(const gsSparseMatrix<>& matrix, const gsMultiBasis<>& mb, const gsBoundaryConditions<>& bc, const gsOptionList& opt, const int &typeBCHandling) +{ + const short_t dim = mb.topology().dim(); + + // Setup dof mapper + gsDofMapper dm; + mb.getMapper( + typeBCHandling == 1 ? 
(dirichlet::strategy)opt.askInt("DirichletStrategy",11) : (dirichlet::strategy)opt.askInt("DirichletStrategy",14), + (iFace ::strategy)opt.askInt("InterfaceStrategy", 1), + bc, + dm, + 0 + ); + const index_t nTotalDofs = dm.freeSize(); + + // Decompose the whole domain into components + std::vector< std::vector > components = mb.topology().allComponents(true); + const index_t nr_components = components.size(); + + // Setup Dirichlet boundary conditions + gsBoundaryConditions<> dir_bc; + for( index_t ps=0; ps < 2*dim; ++ps ) + dir_bc.addCondition( 0, 1+ps, condition_type::dirichlet, NULL ); + + // Setup transfer matrices and local preconditioners + std::vector< gsSparseMatrix > transfers; + transfers.reserve(nr_components); + std::vector< gsLinearOperator<>::Ptr > ops; + ops.reserve(nr_components); + + for (index_t i=0; i indices; + std::vector::uPtr> bases = mb.componentBasis_withIndices(components[i],dm,indices,true); + index_t sz = indices.rows(); + gsSparseEntries<> se; + se.reserve(sz); + for (index_t i=0; i transfer(nTotalDofs,sz); + transfer.setFrom(se); + if (sz>0) + { + if (bases[0]->dim() == dim) + { + GISMO_ASSERT ( bases.size() == 1, "Only one basis is expected for each patch." 
); + ops.push_back( + gsPatchPreconditionersCreator<>::subspaceCorrectedMassSmootherOp( + *(bases[0]), + dir_bc, + gsOptionList(), + opt.getReal("Scaling") + ) + ); + } + else + { + gsSparseMatrix<> mat = transfer.transpose() * matrix * transfer; + ops.push_back( makeSparseCholeskySolver(mat) ); + } + transfers.push_back(give(transfer)); + } } - }; + return gsPreconditionerFromOp<>::make(makeMatrixOp(matrix), gsAdditiveOp<>::make(transfers, ops)); +} + } // namespace gismo diff --git a/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp index 0b4c6586d0..43c8139306 100644 --- a/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp +++ b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp @@ -60,16 +60,11 @@ class gsXBraid_app : public gsXBraid< gsMatrix > // Solution gsMatrix sol; - - // Single-grid solver - typedef typename gsSparseSolver::CGDiagonal solver_old; - solver_old* m_solver_old; // Multigrid solver - typedef gsXBraidMultigrid::LU , gsCDRAssembler > solver_mg; - solver_mg* m_solver; - gsMatrix hp; - + typedef gsXBraidMultigrid::LU > solver_mg; + std::vector< solver_mg* > m_solver; + typedef typename gsExprAssembler::geometryMap geometryMap; typedef typename gsExprAssembler::variable variable; typedef typename gsExprAssembler::space space; @@ -95,9 +90,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > tstart(tstart), tstop(tstop), tstep( (tstop-tstart)/numSteps ), - K(1,1), M(1,1), - m_solver_old(nullptr), - m_solver(nullptr) + K(1,1), M(1,1) { ///////////////////////////////////////////////////////////////////////////////////////////// // Code for heat equation starts here // @@ -155,40 +148,58 @@ class gsXBraid_app : public gsXBraid< gsMatrix > fd.getId(7, Sopt); // id=6: spatial solver options if (this->id() == 0) gsInfo << "Spatial solver options:\n" << Sopt << "\n"; + + std::string typeCoarsening = Sopt.getString("coarseStrategy"); + gsMatrix<> hp = 
gsMatrix<>::Zero(Sopt.getInt("numLevels")-1,1); + + // Read string from command line + real_t numRefH = 0; + real_t numRefP = 0; + real_t numRefZ = 0; - int numLevels = 9; // todo!!!! - hp = gsMatrix<>::Zero(numLevels-1); - - // Elevate and p-refine the bases to order k + numElevate - // where k is the highest degree in the basesH - if ( numElevate > -1 ) + // Convert input string to array + for( int i = 0; i < Sopt.getInt("numLevels")-1 ; ++i) { - // Find maximum degree with respect to all the variables - int tmp = basesH.maxDegree(0); - for (short_t j = 1; j < mp.parDim(); ++j ) - if ( tmp < basesH.maxDegree(j) ) - tmp = basesH.maxDegree(j); - - // Elevate all degrees uniformly - tmp += numElevate; - basesH.setDegree(tmp); - basesL.setDegree(tmp); + if( typeCoarsening[i] == 'h') + { + hp(i,0) = 1; + numRefH = numRefH + 1; + } + else if( typeCoarsening[i] == 'p') + { + hp(i,0) = 0; + numRefP = numRefP + 1; + } + else + { + hp(i,0) = 2; + numRefZ = numRefZ + 1; + } } - // Increase and p-refine the bases - if (numIncrease > 0) + // Apply refinement in p for coarse level + if((numRefP + numRefZ) == numIncrease ) + { + basesL.degreeReduce(1); + } + else { - basesH.degreeIncrease(numIncrease); - basesL.degreeIncrease(numIncrease); + basesL.degreeIncrease(numIncrease-numRefP-numRefZ-1); } - - // h-refine the bases - for (int i = 0; i < numRefine; ++i) + + // Apply refinement in h for coarse and fine level + for (int i = 0; i < numRefine - numRefH - numRefZ; ++i) + { + basesL.uniformRefine(); + } + for (int i = 0; i < numRefine ; ++i) { basesH.uniformRefine(); - basesL.uniformRefine(); - } - + } + + // Apply refinement in p for fine level + basesH.degreeIncrease(numIncrease-1); + // Set the bases K.setIntegrationElements(basesH); M.setIntegrationElements(basesH); @@ -221,36 +232,51 @@ class gsXBraid_app : public gsXBraid< gsMatrix > variable g_Neumann = K.getBdrFunction(); K.assembleRhsBc(u_K * g_Neumann.val() * nv(G_K).norm(), bc.neumannSides() ); - // Initialize the 
solver - if (Sopt.getInt("numLevels") == 0) - { - // Single grid solver - m_solver_old = new solver_old(); - m_solver_old->setMaxIterations(Sopt.getInt("maxIter")); - m_solver_old->setTolerance(Sopt.getReal("tol")); - } else + // Determine MGRIT levels a priori + int numMGRITLevels = 1; + int StepsLevel = numSteps; + for(int i = 1 ; i < 10000; i++){ + StepsLevel = StepsLevel/Topt.getInt("CFactor"); + if(StepsLevel < Topt.getInt("minCLevel")) + break; + numMGRITLevels = numMGRITLevels + 1; + } + + m_solver.resize(numMGRITLevels); + real_t tstep_level = tstep; + for(int i = 0 ; i < numMGRITLevels ; i++) { - // Multigrid solver - m_solver = new solver_mg(mp, basesL, bc); - m_solver->setMaxIter(Sopt.getInt("maxIter")); - m_solver->setTolerance(Sopt.getReal("tol")); - m_solver->setNumLevels(Sopt.getInt("numLevels")); - m_solver->setNumSmoothing(Sopt.getInt("numSmoothing")); - m_solver->setTypeBCHandling(Sopt.getInt("bcHandling")); - m_solver->setTypeCycle_h(Sopt.getInt("cycle_h")); - m_solver->setTypeCycle_p(Sopt.getInt("cycle_p")); - m_solver->setTypeLumping(Sopt.getInt("lumping")); - m_solver->setTypeProjection(Sopt.getInt("projection")); - m_solver->setTypeSmoother(Sopt.getInt("smoother")); - m_solver->setTypeSolver(Sopt.getInt("solver")); + m_solver[i] = new solver_mg(mp, basesL, bc); + m_solver[i]->setMaxIter(Sopt.getInt("maxIter")); + m_solver[i]->setTolerance(Sopt.getReal("tol")); + m_solver[i]->setNumLevels(Sopt.getInt("numLevels"),Sopt.getInt("projection"),numIncrease); + m_solver[i]->setNumSmoothing(Sopt.getInt("numSmoothing")); + m_solver[i]->setTypeBCHandling(Sopt.getInt("bcHandling")); + m_solver[i]->setTypeCycle_h(Sopt.getInt("cycle_h")); + m_solver[i]->setTypeCycle_p(Sopt.getInt("cycle_p")); + m_solver[i]->setTypeLumping(Sopt.getInt("lumping")); + m_solver[i]->setTypeProjection(Sopt.getInt("projection")); + m_solver[i]->setTypeSmoother(Sopt.getInt("smoother")); + m_solver[i]->setCoarsening(hp); + if(typeMethod > 2 && i == 0) + { + 
m_solver[i]->compute(M.matrix(),tstep_level,numIncrease,typeMethod); + } + else + { + // Apple Backward Euler at coarser levels (FE_BE and CN_BE) + m_solver[i]->compute(M.matrix(),tstep_level,numIncrease,1); + } + tstep_level = tstep_level*Topt.getInt("CFactor"); } if (this->id() == 0) { + gsStopwatch clock; clock.restart(); sol.setZero(M.numDofs()); - + switch((gsXBraid_typeMethod)typeMethod) { case gsXBraid_typeMethod::FE_FE: case gsXBraid_typeMethod::FE_BE: @@ -258,8 +284,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > for ( int i = 1; i<=numSteps; ++i) // for all timesteps // Compute the system for the timestep i (rhs is assumed constant wrt time) - sol = m_solver->compute(M.matrix() - ).solveWithGuess(tstep*K.rhs() + + sol = m_solver[0]->solveWithGuess(tstep*K.rhs() + (M.matrix()-tstep*K.matrix())*sol, sol); break; @@ -267,24 +292,17 @@ class gsXBraid_app : public gsXBraid< gsMatrix > case gsXBraid_typeMethod::BE_BE: // Backward Euler method - for ( int i = 1; i<=numSteps; ++i) // for all timesteps + for ( int i = 1; i<=numSteps; ++i) // for all timesteps // Compute the system for the timestep i (rhs is assumed constant wrt time) - sol = m_solver->compute(M.matrix() + - tstep*K.matrix() - ).solveWithGuess(tstep*K.rhs() + - (M.matrix())*sol, - sol); - break; + sol = m_solver[0]->solveWithGuess(tstep*K.rhs() + (M.matrix())*sol, sol); + break; case gsXBraid_typeMethod::CN_CN: case gsXBraid_typeMethod::CN_BE: // Crank-Nicholson method - for ( int i = 1; i<=numSteps; ++i) // for all timesteps // Compute the system for the timestep i (rhs is assumed constant wrt time) - sol = m_solver->compute(M.matrix() + - tstep*0.5*K.matrix() - ).solveWithGuess(tstep*K.rhs() + + sol = m_solver[0]->solveWithGuess(tstep*K.rhs() + (M.matrix()-tstep*0.5*K.matrix())*sol, sol); break; @@ -295,7 +313,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > gsInfo << "wall time = " << clock.stop() << "\n" << "L2 norm of the solution = " << sol.norm() << "\n"; - + // gsExprEvaluator 
ev(M); // solution u_sol = M.getSolution(u_M, sol); // variable u_ex = ev.getVariable(ms, G_M); @@ -311,8 +329,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > /// Destructor virtual ~gsXBraid_app() { - if(m_solver_old) delete m_solver_old; - if(m_solver) delete m_solver; + } /// Creates instance from command line argument @@ -330,7 +347,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > // Temporal discretisation parameters index_t numSteps = 40; - index_t typeMethod = (index_t)gsXBraid_typeMethod::CN_BE; + index_t typeMethod = (index_t)gsXBraid_typeMethod::BE_BE; T tfinal = 0.1; gsCmdLine cmd("Tutorial on solving a Heat equation problem using parallel-in-time multigrid."); @@ -401,12 +418,11 @@ class gsXBraid_app : public gsXBraid< gsMatrix > std::pair time = static_cast(status).timeInterval(); T tstep(time.second - time.first); - + switch((gsXBraid_typeMethod)typeMethod) { case gsXBraid_typeMethod::FE_FE: // Forward Euler method (all grids) - *u_ptr = m_solver->compute(M.matrix() - ).solveWithGuess(tstep*K.rhs() + + *u_ptr = m_solver[static_cast(status).level()]->solveWithGuess(tstep*K.rhs() + (M.matrix()-tstep*K.matrix())*(*u_ptr), *ustop_ptr); break; @@ -414,50 +430,37 @@ class gsXBraid_app : public gsXBraid< gsMatrix > case gsXBraid_typeMethod::FE_BE: if (static_cast(status).level() == 0) { // Forward Euler method (fine grid) - *u_ptr = m_solver->compute(M.matrix() - ).solveWithGuess(tstep*K.rhs() + + *u_ptr = m_solver[static_cast(status).level()]->solveWithGuess(tstep*K.rhs() + (M.matrix()-tstep*K.matrix())*(*u_ptr), *ustop_ptr); } else { // Backward Euler method (coarse grids) - *u_ptr = m_solver->compute(M.matrix() + - tstep*K.matrix() - ).solveWithGuess(tstep*K.rhs() + + *u_ptr = m_solver[static_cast(status).level()]->solveWithGuess(tstep*K.rhs() + (M.matrix())*(*u_ptr), *ustop_ptr); } break; - case gsXBraid_typeMethod::BE_BE: + case gsXBraid_typeMethod::BE_BE: { // Backward Euler method (all grids) - *u_ptr = m_solver->compute(M.matrix() + - 
tstep*K.matrix() - ).solveWithGuess(tstep*K.rhs() + - (M.matrix())*(*u_ptr), - *ustop_ptr); - break; + *u_ptr = m_solver[static_cast(status).level()]->solveWithGuess(tstep*K.rhs() + (M.matrix())*(*u_ptr), *ustop_ptr); + } break; case gsXBraid_typeMethod::CN_CN: // Crank-Nicholson method (all grids) - *u_ptr = m_solver->compute(M.matrix() + - tstep*0.5*K.matrix() - ).solveWithGuess(tstep*K.rhs() + - (M.matrix()-tstep*0.5*K.matrix())*(*u_ptr), - *ustop_ptr); + *u_ptr = m_solver[static_cast(status).level()]->solveWithGuess(tstep*K.rhs() + + (M.matrix()-tstep*0.5*K.matrix())*(*u_ptr), + *ustop_ptr); break; case gsXBraid_typeMethod::CN_BE: if (static_cast(status).level() == 0) { - *u_ptr = m_solver->compute(M.matrix() + - tstep*0.5*K.matrix() - ).solveWithGuess(tstep*K.rhs() + + *u_ptr = m_solver[static_cast(status).level()]->solveWithGuess(tstep*K.rhs() + (M.matrix()-tstep*0.5*K.matrix())*(*u_ptr), *ustop_ptr); } else { // Backward Euler method (coarse grids) - *u_ptr = m_solver->compute(M.matrix() + - tstep*K.matrix() - ).solveWithGuess(tstep*K.rhs() + + *u_ptr = m_solver[static_cast(status).level()]->solveWithGuess(tstep*K.rhs() + (M.matrix())*(*u_ptr), *ustop_ptr); } @@ -515,11 +518,11 @@ class gsXBraid_app : public gsXBraid< gsMatrix > override #endif { - gsInfo << "Coarsen on level = " - << static_cast(status).level() - << " of " - << static_cast(status).levels() - << "\n"; + // gsInfo << "Coarsen on level = " + // << static_cast(status).level() + // << " of " + // << static_cast(status).levels() + // << "\n"; gsMatrix *fu_ptr = (gsMatrix*) fu; gsMatrix* cu = new gsMatrix(); *cu = *fu_ptr; @@ -535,11 +538,11 @@ class gsXBraid_app : public gsXBraid< gsMatrix > override #endif { - gsInfo << "Refine on level = " - << static_cast(status).level() - << " of " - << static_cast(status).levels() - << "\n"; + // gsInfo << "Refine on level = " + // << static_cast(status).level() + // << " of " + // << static_cast(status).levels() + // << "\n"; gsMatrix *cu_ptr = (gsMatrix*) 
cu; gsMatrix* fu = new gsMatrix(); *fu = *cu_ptr; diff --git a/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml index 5e1f8646ac..97f29ab929 100644 --- a/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml +++ b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml @@ -22,14 +22,14 @@ + 0 1 0 2 - 0 3 - 0 4 + 0 3 - 0 1 + 0 4 @@ -76,7 +76,7 @@ - + @@ -87,16 +87,14 @@ - - + - - - + + From 6e2d78eefc70b189b76e44c21bac242a849d1890 Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Mon, 27 Sep 2021 16:40:09 +0200 Subject: [PATCH 035/174] Code cleanup --- .../examples/xbraid_heatEquation_example.cpp | 132 ++++++++++-------- .../filedata/pde/heat2d_square_ibvp1.xml | 4 +- 2 files changed, 73 insertions(+), 63 deletions(-) diff --git a/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp index 43c8139306..b19a14255d 100644 --- a/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp +++ b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp @@ -27,7 +27,7 @@ namespace gismo { BE_BE = 1, // backward Euler (all grids) CN_CN = 2, // Crank-Nicholson (all grids) FE_BE = 3, // forward Euler (fine grid), backward Euler (coarser grids) - CN_BE = 4 // Crank-Nicholson (fine grid), backward Euler (coarser grids) + CN_BE = 4 // Crank-Nicholson (fine grid), backward Euler (coarser grids) }; /** @@ -47,29 +47,29 @@ class gsXBraid_app : public gsXBraid< gsMatrix > // Spatial discretizations gsMultiPatch mp; gsMultiBasis basesH, basesL; - + // Boundary conditions gsBoundaryConditions bc; // Assembler options gsOptionList Aopt, Sopt, Topt; - + // Expression assembler gsExprAssembler K, M; gsFunctionExpr f, u0, ms; - + // Solution gsMatrix sol; - + // Multigrid solver typedef gsXBraidMultigrid::LU > solver_mg; - std::vector< solver_mg* > m_solver; - + std::vector< solver_mg* > m_solver; + typedef typename 
gsExprAssembler::geometryMap geometryMap; typedef typename gsExprAssembler::variable variable; typedef typename gsExprAssembler::space space; typedef typename gsExprAssembler::solution solution; - + public: /// Contructor gsXBraid_app(const gsMpiComm& comm, @@ -95,26 +95,26 @@ class gsXBraid_app : public gsXBraid< gsMatrix > ///////////////////////////////////////////////////////////////////////////////////////////// // Code for heat equation starts here // ///////////////////////////////////////////////////////////////////////////////////////////// - + gsFileData fd(fn); if (this->id() == 0) gsInfo << "Loaded file " << fd.lastPath() << "\n"; fd.getId(0, mp); // id=0: Multipatch domain basesH = gsMultiBasis(mp); basesL = gsMultiBasis(mp); - + fd.getId(1, f); // id=1: right-hand side function if (this->id() == 0) gsInfo << "Source function " << f << "\n"; - + fd.getId(2, bc); // id=2: boundary conditions if (this->id() == 0) gsInfo << "Boundary conditions:\n" << bc << "\n"; fd.getId(3, u0); // id=3: initial conditions if (this->id() == 0) gsInfo << "Initial conditions:\n" << u0 << "\n"; - + fd.getId(4, ms); // id=4: manufactured solution if (this->id() == 0) gsInfo << "Manufactured solution:\n" << ms << "\n"; - + fd.getId(5, Aopt); // id=5: assembler options if (this->id() == 0) gsInfo << "Assembler options:\n" << Aopt << "\n"; K.setOptions(Aopt); @@ -144,11 +144,11 @@ class gsXBraid_app : public gsXBraid< gsMatrix > if (Topt.getSwitch("skip")) this->SetSkip(1); else this->SetSkip(0); if (Topt.getSwitch("spatial")) this->SetSpatialCoarsenAndRefine(); if (Topt.getSwitch("tol")) this->SetAbsTol(Topt.getReal("absTol")); - else this->SetRelTol(Topt.getReal("relTol")); - + else this->SetRelTol(Topt.getReal("relTol")); + fd.getId(7, Sopt); // id=6: spatial solver options if (this->id() == 0) gsInfo << "Spatial solver options:\n" << Sopt << "\n"; - + std::string typeCoarsening = Sopt.getString("coarseStrategy"); gsMatrix<> hp = 
gsMatrix<>::Zero(Sopt.getInt("numLevels")-1,1); @@ -156,7 +156,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > real_t numRefH = 0; real_t numRefP = 0; real_t numRefZ = 0; - + // Convert input string to array for( int i = 0; i < Sopt.getInt("numLevels")-1 ; ++i) { @@ -177,16 +177,16 @@ class gsXBraid_app : public gsXBraid< gsMatrix > } } - // Apply refinement in p for coarse level + // Apply refinement in p for coarse level if((numRefP + numRefZ) == numIncrease ) { basesL.degreeReduce(1); - } + } else { - basesL.degreeIncrease(numIncrease-numRefP-numRefZ-1); + basesL.degreeIncrease(numIncrease-numRefP-numRefZ-1); } - + // Apply refinement in h for coarse and fine level for (int i = 0; i < numRefine - numRefH - numRefZ; ++i) { @@ -202,7 +202,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > // Set the bases K.setIntegrationElements(basesH); - M.setIntegrationElements(basesH); + M.setIntegrationElements(basesH); // Set the geometry map geometryMap G_K = K.getMap(mp); @@ -236,12 +236,12 @@ class gsXBraid_app : public gsXBraid< gsMatrix > int numMGRITLevels = 1; int StepsLevel = numSteps; for(int i = 1 ; i < 10000; i++){ - StepsLevel = StepsLevel/Topt.getInt("CFactor"); - if(StepsLevel < Topt.getInt("minCLevel")) + StepsLevel = StepsLevel/Topt.getInt("CFactor"); + if(StepsLevel < Topt.getInt("minCLevel")) break; numMGRITLevels = numMGRITLevels + 1; - } - + } + m_solver.resize(numMGRITLevels); real_t tstep_level = tstep; for(int i = 0 ; i < numMGRITLevels ; i++) @@ -265,38 +265,39 @@ class gsXBraid_app : public gsXBraid< gsMatrix > else { // Apple Backward Euler at coarser levels (FE_BE and CN_BE) - m_solver[i]->compute(M.matrix(),tstep_level,numIncrease,1); + m_solver[i]->compute(M.matrix(),tstep_level,numIncrease,1); } tstep_level = tstep_level*Topt.getInt("CFactor"); } - + + if (this->id() == 0) { - + gsStopwatch clock; clock.restart(); - + sol.setZero(M.numDofs()); - + switch((gsXBraid_typeMethod)typeMethod) { case gsXBraid_typeMethod::FE_FE: case 
gsXBraid_typeMethod::FE_BE: // Forward Euler method - + for ( int i = 1; i<=numSteps; ++i) // for all timesteps // Compute the system for the timestep i (rhs is assumed constant wrt time) sol = m_solver[0]->solveWithGuess(tstep*K.rhs() + (M.matrix()-tstep*K.matrix())*sol, sol); break; - + case gsXBraid_typeMethod::BE_BE: // Backward Euler method - + for ( int i = 1; i<=numSteps; ++i) // for all timesteps // Compute the system for the timestep i (rhs is assumed constant wrt time) sol = m_solver[0]->solveWithGuess(tstep*K.rhs() + (M.matrix())*sol, sol); break; - + case gsXBraid_typeMethod::CN_CN: case gsXBraid_typeMethod::CN_BE: // Crank-Nicholson method @@ -306,14 +307,13 @@ class gsXBraid_app : public gsXBraid< gsMatrix > (M.matrix()-tstep*0.5*K.matrix())*sol, sol); break; - + default: throw std::runtime_error("Unsupported time-stepping method"); } - + gsInfo << "wall time = " << clock.stop() << "\n" - << "L2 norm of the solution = " << sol.norm() << "\n"; - + << "L2 norm of the solution = " << sol.norm() << "\n"; // gsExprEvaluator ev(M); // solution u_sol = M.getSolution(u_M, sol); // variable u_ex = ev.getVariable(ms, G_M); @@ -324,6 +324,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > // gsInfo << "L2 error of the solution = " << l2err << "\n" // << "H1 error of the solution = " << h1err << std::flush; } + } /// Destructor @@ -331,7 +332,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > { } - + /// Creates instance from command line argument static inline gsXBraid_app create(const gsMpiComm& comm, int argc, @@ -339,22 +340,22 @@ class gsXBraid_app : public gsXBraid< gsMatrix > { // Problem parameters std::string fn(XBRAID_DATA_DIR"pde/heat2d_square_ibvp1.xml"); - + // Spatial discretisation parameters index_t numRefine = 2; index_t numElevate = 0; index_t numIncrease = 0; - + // Temporal discretisation parameters index_t numSteps = 40; index_t typeMethod = (index_t)gsXBraid_typeMethod::BE_BE; T tfinal = 0.1; - + gsCmdLine cmd("Tutorial on solving a 
Heat equation problem using parallel-in-time multigrid."); // Problem parameters cmd.addString( "f", "file", "Input XML file", fn ); - + // Spatial discretisation parameters cmd.addInt( "e", "degreeElevation", "Number of degree elevation steps to perform before solving (0: equalize degree in all directions)", numElevate ); @@ -366,12 +367,12 @@ class gsXBraid_app : public gsXBraid< gsMatrix > cmd.addInt( "n", "numSteps", "Number of parallel-in-time steps", numSteps ); cmd.addInt( "T", "typeMethod", "Time-stepping scheme", typeMethod); cmd.addReal( "t", "tfinal", "Final time", tfinal ); - + cmd.getValues(argc,argv); // Create instance gsXBraid_app app(comm, 0.0, tfinal, typeMethod, numSteps, numRefine, numElevate, numIncrease, fn); - + return app; } @@ -383,7 +384,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > #endif { gsMatrix* u = new gsMatrix(M.numDofs(), 1); - + if (t != tstart) { // Intermediate solution u->setZero(M.numDofs()); @@ -395,7 +396,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > *u_ptr = (braid_Vector) u; return braid_Int(0); } - + /// Performs a single step of the parallel-in-time multigrid braid_Int Step(braid_Vector u, braid_Vector ustop, @@ -413,7 +414,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > gsMatrix* fstop_ptr = (gsMatrix*) fstop; *u_ptr += *fstop_ptr; } - + // Get time step information std::pair time = static_cast(status).timeInterval(); @@ -426,7 +427,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > (M.matrix()-tstep*K.matrix())*(*u_ptr), *ustop_ptr); break; - + case gsXBraid_typeMethod::FE_BE: if (static_cast(status).level() == 0) { // Forward Euler method (fine grid) @@ -452,7 +453,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > (M.matrix()-tstep*0.5*K.matrix())*(*u_ptr), *ustop_ptr); break; - + case gsXBraid_typeMethod::CN_BE: if (static_cast(status).level() == 0) { *u_ptr = m_solver[static_cast(status).level()]->solveWithGuess(tstep*K.rhs() + @@ -469,7 +470,7 @@ class gsXBraid_app : public gsXBraid< 
gsMatrix > default: throw std::runtime_error("Unsupported time-stepping method"); } - + // Carry out adaptive refinement in time if (static_cast(status).level() == 0) { braid_Real error = static_cast(status).error(); @@ -479,7 +480,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > } else status.SetRFactor(1); } - + return braid_Int(0); } @@ -505,7 +506,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > static_cast(status).timeIndex() == static_cast(status).times()) { gsMatrix* u_ptr = (gsMatrix*) u; - gsInfo << "norm of the solution = " << u_ptr->norm() << std::endl; + gsInfo << "norm of the solution = " << u_ptr->norm() << std::endl; } return braid_Int(0); } @@ -523,13 +524,13 @@ class gsXBraid_app : public gsXBraid< gsMatrix > // << " of " // << static_cast(status).levels() // << "\n"; - gsMatrix *fu_ptr = (gsMatrix*) fu; + gsMatrix *fu_ptr = (gsMatrix*) fu; gsMatrix* cu = new gsMatrix(); *cu = *fu_ptr; *cu_ptr = (braid_Vector) cu; return braid_Int(0); } - + // Performs spatial refinement braid_Int Refine(braid_Vector cu, braid_Vector *fu_ptr, @@ -543,14 +544,14 @@ class gsXBraid_app : public gsXBraid< gsMatrix > // << " of " // << static_cast(status).levels() // << "\n"; - gsMatrix *cu_ptr = (gsMatrix*) cu; + gsMatrix *cu_ptr = (gsMatrix*) cu; gsMatrix* fu = new gsMatrix(); *fu = *cu_ptr; *fu_ptr = (braid_Vector) fu; return braid_Int(0); } }; - + } // ending namespace gismo #endif @@ -558,10 +559,19 @@ class gsXBraid_app : public gsXBraid< gsMatrix > int main(int argc, char**argv) { #ifdef GISMO_WITH_XBRAID - + // Initialize the MPI environment and obtain the world communicator gsMpiComm comm = gsMpi::init(argc, argv).worldComm(); + // Print MPI/OpenMP configuration + if (comm.rank() == 0) + { + gsInfo << "Number of MPI processes : " << comm.size() << std::endl; +#ifdef _OPENMP + gsInfo << "Number of OpenMP processes : " << omp_get_num_procs() << std::endl; +#endif + } + // Set up app structure gsXBraid_app app = gsXBraid_app::create(comm, argc, argv); @@ 
-571,9 +581,9 @@ int main(int argc, char**argv) #else gsInfo << "\n"; - + #endif return 0; - + } diff --git a/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml index 97f29ab929..ff89b61e9c 100644 --- a/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml +++ b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml @@ -87,13 +87,13 @@ - + - + From 9eac422295ead01c87c27a1f659bb41e25838337 Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Mon, 27 Sep 2021 17:16:41 +0200 Subject: [PATCH 036/174] Updated CircleCI configuration --- .circleci/config.yml | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4b9cea79c3..ceb4375754 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,9 +1,9 @@ version: 2.0 jobs: - macos_x86_64_xcode9_cxx98_release: + macos_x86_64_xcode10_cxx98_release: macos: - xcode: "9.4.1" + xcode: "10.3.0" filters: branches: - only: stable @@ -17,14 +17,14 @@ jobs: - checkout - run: name: Configure G+Smo on MacOS - command: cmake . -DBUILDNAME="macos_x86_64_xcode9_cxx98_release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=98 -DGISMO_WITH_ONURBS=ON + command: cmake . 
-DBUILDNAME="macos_x86_64_xcode10_cxx98_release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=98 -DGISMO_WITH_ONURBS=ON - run: name: Build and test G+Smo on MacOS command: ctest $MAKEFLAGS --output-on-failure -D ExperimentalStart -D ExperimentalConfigure -D ExperimentalBuild -D ExperimentalTest -D ExperimentalSubmit #-D ExperimentalMemCheck - macos_x86_64_xcode10_cxx11_release: + macos_x86_64_xcode11_cxx11_release: macos: - xcode: "10.3.0" + xcode: "11.7.0" filters: branches: - only: stable @@ -38,14 +38,14 @@ jobs: - checkout - run: name: Configure G+Smo on MacOS - command: cmake . -DBUILDNAME="macos_x86_64_xcode10_cxx11_release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=11 -DGISMO_WITH_ONURBS=ON + command: cmake . -DBUILDNAME="macos_x86_64_xcode11_cxx11_release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=11 -DGISMO_WITH_ONURBS=ON - run: name: Build and test G+Smo on MacOS command: ctest $MAKEFLAGS --output-on-failure -D ExperimentalStart -D ExperimentalConfigure -D ExperimentalBuild -D ExperimentalTest -D ExperimentalSubmit #-D ExperimentalMemCheck - macos_x86_64_xcode11_cxx14_release: + macos_x86_64_xcode12_cxx14_release: macos: - xcode: "11.7.0" + xcode: "12.5.1" filters: branches: - only: stable @@ -59,14 +59,14 @@ jobs: - checkout - run: name: Configure G+Smo on MacOS - command: cmake . -DBUILDNAME="macos_x86_64_xcode11_cxx14_release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=14 -DGISMO_WITH_ONURBS=ON + command: cmake . 
-DBUILDNAME="macos_x86_64_xcode12_cxx14_release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=14 -DGISMO_WITH_ONURBS=ON - run: name: Build and test G+Smo on MacOS command: ctest $MAKEFLAGS --output-on-failure -D ExperimentalStart -D ExperimentalConfigure -D ExperimentalBuild -D ExperimentalTest -D ExperimentalSubmit #-D ExperimentalMemCheck - macos_x86_64_xcode12_cxx17_release: + macos_x86_64_xcode13_cxx17_release: macos: - xcode: "12.2.0" + xcode: "13.0.0" filters: branches: - only: stable @@ -80,7 +80,7 @@ jobs: - checkout - run: name: Configure G+Smo on MacOS - command: cmake . -DBUILDNAME="macos_x86_64_xcode12_cxx17_release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=17 -DGISMO_WITH_ONURBS=ON + command: cmake . -DBUILDNAME="macos_x86_64_xcode13_cxx17_release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=17 -DGISMO_WITH_ONURBS=ON - run: name: Build and test G+Smo on MacOS command: ctest $MAKEFLAGS --output-on-failure -D ExperimentalStart -D ExperimentalConfigure -D ExperimentalBuild -D ExperimentalTest -D ExperimentalSubmit #-D ExperimentalMemCheck @@ -89,7 +89,7 @@ workflows: version: 2 build: jobs: - - macos_x86_64_xcode9_cxx98_release - - macos_x86_64_xcode10_cxx11_release - - macos_x86_64_xcode11_cxx14_release - - macos_x86_64_xcode12_cxx17_release + - macos_x86_64_xcode10_cxx98_release + - macos_x86_64_xcode11_cxx11_release + - macos_x86_64_xcode12_cxx14_release + - macos_x86_64_xcode13_cxx17_release From 8bbb591c8f12a3cfcf78a2d7d2021b117d379935 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Mon, 27 Sep 2021 19:18:17 +0200 Subject: [PATCH 037/174] Update config.yml Fixed problem with with shallow clones of homebrew-core and homebrew-cask, cf. 
https://github.com/Homebrew/discussions/discussions/226 --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ceb4375754..d3ddb3d81d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -13,7 +13,7 @@ jobs: steps: - run: name: Install dependencies - command: brew update; brew install cmake + command: git -C /usr/local/Homebrew/Library/Taps/homebrew/homebrew-core fetch --unshallow; git -C /usr/local/Homebrew/Library/Taps/homebrew/homebrew-cask fetch --unshallow; brew update; brew install cmake - checkout - run: name: Configure G+Smo on MacOS From ae2e7e26a37ba282f2c605356886b7d69fb68f61 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 18 Nov 2021 21:53:41 +0100 Subject: [PATCH 038/174] Fixed segfault in heatEquation_example2 --- examples/heatEquation_example2.cpp | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/examples/heatEquation_example2.cpp b/examples/heatEquation_example2.cpp index 15ff32a28c..d7f2836b31 100644 --- a/examples/heatEquation_example2.cpp +++ b/examples/heatEquation_example2.cpp @@ -18,7 +18,8 @@ using namespace gismo; int main(int argc, char *argv[]) { - gsCmdLine cmd("Testing the heat equation."); + gsCmdLine cmd("Testing the heat equation."); + try { cmd.getValues(argc,argv); } catch (int rv) { return rv; } // Source function gsConstantFunction<> f(1,2); @@ -32,11 +33,13 @@ int main(int argc, char *argv[]) gsBoundaryConditions<> bcInfo; gsConstantFunction<> g_N(1,2); // Neumann gsConstantFunction<> g_D(0,2); // Dirichlet + bcInfo.setGeoMap(patches); bcInfo.addCondition(0, boundary::west, condition_type::neumann , &g_N); bcInfo.addCondition(0, boundary::east, condition_type::dirichlet, &g_D); bcInfo.addCondition(0, boundary::north, condition_type::dirichlet, &g_D); bcInfo.addCondition(0, boundary::south, condition_type::dirichlet, &g_D); - + gsInfo<<"Boundary conditions:\n"<< bcInfo <<"\n"; 
+ gsMultiBasis<> bases( patches ); // Number for h-refinement of the computational (trial/test) basis. @@ -64,6 +67,8 @@ int main(int argc, char *argv[]) for (int i = 0; i < numRefine; ++i) bases.uniformRefine(); + gsInfo << "Patches: "<< patches.nPatches() <<", degree: "<< bases.minCwiseDegree() <<"\n"; + real_t theta = 0.5; real_t endTime = 0.1; int numSteps = 40; @@ -78,6 +83,9 @@ int main(int argc, char *argv[]) gsExprAssembler<> K(1,1); gsExprAssembler<> M(1,1); + + gsInfo<<"Active options:\n"<< K.options() <<"\n"; + gsInfo<<"Active options:\n"<< M.options() <<"\n"; typedef gsExprAssembler<>::geometryMap geometryMap; typedef gsExprAssembler<>::variable variable; @@ -87,6 +95,9 @@ int main(int argc, char *argv[]) K.setIntegrationElements(bases); M.setIntegrationElements(bases); + gsExprEvaluator<> evK(K); + gsExprEvaluator<> evM(M); + // Set the geometry map geometryMap G_K = K.getMap(patches); geometryMap G_M = M.getMap(patches); @@ -94,17 +105,20 @@ int main(int argc, char *argv[]) // Set the discretization space space u_K = K.getSpace(bases); space u_M = M.getSpace(bases); - u_K.setInterfaceCont(0); - u_M.setInterfaceCont(0); - u_K.addBc( bcInfo.get("Dirichlet") ); - u_M.addBc( bcInfo.get("Dirichlet") ); + // u_K.setInterfaceCont(0); + // u_M.setInterfaceCont(0); + // u_K.addBc( bcInfo.get("Dirichlet") ); + // u_M.addBc( bcInfo.get("Dirichlet") ); + + u_K.setup(bcInfo, dirichlet::interpolation, 0); + u_M.setup(bcInfo, dirichlet::interpolation, 0); // Set the source term variable ff_K = K.getCoeff(f, G_K); variable ff_M = M.getCoeff(f, G_M); - K.initSystem(); - M.initSystem(); + K.initSystem(false); + M.initSystem(false); K.assemble( igrad(u_K, G_K) * igrad(u_K, G_K).tr() * meas(G_K), u_K * ff_K * meas(G_K) ); M.assemble( u_M * u_M.tr() * meas(G_M), u_M * ff_M * meas(G_M) ); From 0a66c256b3eb80c74f0e3639174e70eb3c10cd8c Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 18 Nov 2021 22:06:34 +0100 Subject: [PATCH 039/174] Renamed heatEquation_example2 
into heatEquation2_example --- examples/{heatEquation_example2.cpp => heatEquation2_example.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/{heatEquation_example2.cpp => heatEquation2_example.cpp} (100%) diff --git a/examples/heatEquation_example2.cpp b/examples/heatEquation2_example.cpp similarity index 100% rename from examples/heatEquation_example2.cpp rename to examples/heatEquation2_example.cpp From 97b854b5c2899f8db883e7ac3b0717dbfe01c07d Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 18 Nov 2021 22:09:50 +0100 Subject: [PATCH 040/174] Updated comment --- examples/heatEquation2_example.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/heatEquation2_example.cpp b/examples/heatEquation2_example.cpp index d7f2836b31..a86ccab184 100644 --- a/examples/heatEquation2_example.cpp +++ b/examples/heatEquation2_example.cpp @@ -1,4 +1,4 @@ -/** @file heatEquation_example.cpp +/** @file heatEquation2_example.cpp @brief Solves the heat equation using time-stepping From 6dc3b34981a4bd428a615648d5cb1a878eea72d0 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Fri, 19 Nov 2021 11:30:13 +0100 Subject: [PATCH 041/174] Added initial version of a performance benchmark --- examples/performance_benchmark.cpp | 251 +++++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) create mode 100644 examples/performance_benchmark.cpp diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp new file mode 100644 index 0000000000..0404c288b8 --- /dev/null +++ b/examples/performance_benchmark.cpp @@ -0,0 +1,251 @@ +/** @file performance_benchmark.cpp + + @brief G+Smo performance benchmark + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +//! 
[Include namespace] +#include + +#include + +using namespace gismo; +//! [Include namespace] + +/** + Benchmark driver +*/ +template +std::vector< std::array > +benchmark_driver(const std::vector& nthreads, int nruns, T& benchmark) +{ + gsStopwatch stopwatch; + std::size_t nbytes; + double bandwidth; + + std::vector< std::array > results; + + for (auto it=nthreads.cbegin(); it!=nthreads.cend(); ++it) { + + omp_set_num_threads(*it); + bandwidth = 0.0; + + for (int run=0; run +class benchmark_c_array_memcopy +{ +private: + std::size_t n; + +public: + benchmark_c_array_memcopy(std::size_t n) + : n(n) + {} + + std::size_t operator()() + { + T* m_x = new T[n]; + T* m_y = new T[n]; + +#pragma omp parallel for simd + for (std::size_t i=0; i +class benchmark_c_array_dotproduct +{ +private: + std::size_t n; + +public: + benchmark_c_array_dotproduct(std::size_t n) + : n(n) + {} + + std::size_t operator()() + { + T* m_x = new T[n]; + T* m_y = new T[n]; + +#pragma omp parallel for simd + for (std::size_t i=0; i +class benchmark_eigen_vector_memcopy +{ +private: + std::size_t n; + +public: + benchmark_eigen_vector_memcopy(std::size_t n) + : n(n) + {} + + std::size_t operator()() + { + gsVector x(n); + gsVector y(n); + + x.fill((T)1.0); + y = x; + + // Needed to make sure the compiler does not eliminate this code block + T tmp = y[n-1]; + GISMO_UNUSED(tmp); + + return sizeof(T) * 3 * n; + } +}; + +/** + Benchmark: eigen vector dot-product +*/ +template +class benchmark_eigen_vector_dotproduct +{ +private: + std::size_t n; + +public: + benchmark_eigen_vector_dotproduct(std::size_t n) + : n(n) + {} + + std::size_t operator()() + { + gsVector x(n); + gsVector y(n); + + x.fill((T)1.0); + y.fill((T)1.0); + + T sum = x.dot(y); + + // Needed to make sure the compiler does not eliminate this code block + T tmp = sum; + GISMO_UNUSED(tmp); + + return sizeof(T) * 4 * n; + } +}; + +int main(int argc, char *argv[]) +{ + //! 
[Parse command line] + std::vector nthreads; + std::vector bandwidths; + int nruns=1; + + gsCmdLine cmd("G+Smo performance benchmark."); + cmd.addMultiInt("t", "threads", + "Number of OpenMP threads to be used for the benchmark", nthreads); + cmd.addInt("r", "runs", + "Number of runs over which the results are averaged", nruns); + + try { cmd.getValues(argc,argv); } catch (int rv) { return rv; } + + if (nthreads.empty()) { + for(int i=1; i<=omp_get_max_threads(); i*=2) + nthreads.push_back(i); + } + + { + gsInfo << "=== Native C array memcopy ===\n"; + benchmark_c_array_memcopy benchmark(1000000000); + auto results = benchmark_driver(nthreads, nruns, benchmark); + for (auto it=results.cbegin(); it!=results.cend(); ++it) + gsInfo << "[OMP=" << (*it)[0] << "] " << (*it)[1] << " GB/s\n"; + } + + { + gsInfo << "== gsVector memcopy ===\n"; + benchmark_eigen_vector_memcopy benchmark(1000000000); + auto results = benchmark_driver(nthreads, nruns, benchmark); + for (auto it=results.cbegin(); it!=results.cend(); ++it) + gsInfo << "[OMP=" << (*it)[0] << "] " << (*it)[1] << " GB/s\n"; + } + + { + gsInfo << "=== Native C array dot-product ===\n"; + benchmark_c_array_dotproduct benchmark(1000000000); + auto results = benchmark_driver(nthreads, nruns, benchmark); + for (auto it=results.cbegin(); it!=results.cend(); ++it) + gsInfo << "[OMP=" << (*it)[0] << "] " << (*it)[1] << " GB/s\n"; + } + + { + gsInfo << "== gsVector dot-product ===\n"; + benchmark_eigen_vector_dotproduct benchmark(1000000000); + auto results = benchmark_driver(nthreads, nruns, benchmark); + for (auto it=results.cbegin(); it!=results.cend(); ++it) + gsInfo << "[OMP=" << (*it)[0] << "] " << (*it)[1] << " GB/s\n"; + } + + return EXIT_SUCCESS; +} From 2afa1a04ce8b0fb04f13d9b2e124320d214f52c5 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Fri, 19 Nov 2021 11:54:03 +0100 Subject: [PATCH 042/174] Updated performance benchmark --- examples/performance_benchmark.cpp | 13 ++++++++----- 1 file changed, 8 
insertions(+), 5 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 0404c288b8..78b6086018 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -45,7 +45,7 @@ benchmark_driver(const std::vector& nthreads, int nruns, T& benchmark) bandwidth += 1e-9*nbytes/stopwatch.elapsed(); } - results.push_back( { *it, bandwidth/(double)nruns, stopwatch.elapsed() } ); + results.push_back( { static_cast(*it), bandwidth/(double)nruns, stopwatch.elapsed() } ); } return results; @@ -200,11 +200,14 @@ int main(int argc, char *argv[]) //! [Parse command line] std::vector nthreads; std::vector bandwidths; + int n=1000000000; int nruns=1; gsCmdLine cmd("G+Smo performance benchmark."); cmd.addMultiInt("t", "threads", "Number of OpenMP threads to be used for the benchmark", nthreads); + cmd.addInt("n", "nlength", + "Number of unknowns in vector-type benchmarks", n); cmd.addInt("r", "runs", "Number of runs over which the results are averaged", nruns); @@ -217,7 +220,7 @@ int main(int argc, char *argv[]) { gsInfo << "=== Native C array memcopy ===\n"; - benchmark_c_array_memcopy benchmark(1000000000); + benchmark_c_array_memcopy benchmark(n); auto results = benchmark_driver(nthreads, nruns, benchmark); for (auto it=results.cbegin(); it!=results.cend(); ++it) gsInfo << "[OMP=" << (*it)[0] << "] " << (*it)[1] << " GB/s\n"; @@ -225,7 +228,7 @@ int main(int argc, char *argv[]) { gsInfo << "== gsVector memcopy ===\n"; - benchmark_eigen_vector_memcopy benchmark(1000000000); + benchmark_eigen_vector_memcopy benchmark(n); auto results = benchmark_driver(nthreads, nruns, benchmark); for (auto it=results.cbegin(); it!=results.cend(); ++it) gsInfo << "[OMP=" << (*it)[0] << "] " << (*it)[1] << " GB/s\n"; @@ -233,7 +236,7 @@ int main(int argc, char *argv[]) { gsInfo << "=== Native C array dot-product ===\n"; - benchmark_c_array_dotproduct benchmark(1000000000); + benchmark_c_array_dotproduct benchmark(n); auto 
results = benchmark_driver(nthreads, nruns, benchmark); for (auto it=results.cbegin(); it!=results.cend(); ++it) gsInfo << "[OMP=" << (*it)[0] << "] " << (*it)[1] << " GB/s\n"; @@ -241,7 +244,7 @@ int main(int argc, char *argv[]) { gsInfo << "== gsVector dot-product ===\n"; - benchmark_eigen_vector_dotproduct benchmark(1000000000); + benchmark_eigen_vector_dotproduct benchmark(n); auto results = benchmark_driver(nthreads, nruns, benchmark); for (auto it=results.cbegin(); it!=results.cend(); ++it) gsInfo << "[OMP=" << (*it)[0] << "] " << (*it)[1] << " GB/s\n"; From cda8eaa613c0485672db48358c1bae9bb3a2618d Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Tue, 30 Nov 2021 08:09:06 +0100 Subject: [PATCH 043/174] Updated performance benchmark --- examples/performance_benchmark.cpp | 613 ++++++++++++++++++++++++----- 1 file changed, 506 insertions(+), 107 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 78b6086018..60d1dde3f8 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -20,8 +20,8 @@ using namespace gismo; //! [Include namespace] /** - Benchmark driver -*/ + * Benchmark: driver function + */ template std::vector< std::array > benchmark_driver(const std::vector& nthreads, int nruns, T& benchmark) @@ -31,83 +31,253 @@ benchmark_driver(const std::vector& nthreads, int nruns, T& benchmark) double bandwidth; std::vector< std::array > results; - - for (auto it=nthreads.cbegin(); it!=nthreads.cend(); ++it) { - omp_set_num_threads(*it); - bandwidth = 0.0; + try { + for (auto it=nthreads.cbegin(); it!=nthreads.cend(); ++it) { + + omp_set_num_threads(*it); + bandwidth = 0.0; + + for (int run=0; run(*it), + bandwidth/(double)nruns, + stopwatch.elapsed() } ); + } + } + catch(...) { + std::exception_ptr p = std::current_exception(); + std::clog <<(p ? 
p.__cxa_exception_type()->name() : "null") << std::endl; + } + + return results; +} + +/** + * Benchmark LaTeX output + */ +class benchmark_latex +{ +public: + /** + * Result set class + */ + class result_set + { + public: + result_set(const std::string& label, + const std::string& title, + const std::vector< std::array >& results) + : label(label), + title(title), + results(results) + { + } + + const std::string& get_label() const + { return label; } + + const std::string& get_title() const + { return title; } + + const std::vector< std::array >& get_results() const + { return results; } + + std::ostream &print(std::ostream &os) const + { + os << "\\pgfplotstableread[row sep=\\\\,col sep=&]{\n" + << "threads & " << label << " \\\\\n"; + + for (auto it=results.cbegin(); it!=results.cend(); ++it) + os << (*it)[0] << "&" << (*it)[1] << "\\\\\n"; + + os << "}\\data" << label << "\n"; + + return os; + } + + private: + const std::string label, title; + std::vector< std::array > results; + }; + + /** + * Benchmark set class + */ + class benchmark_set + { + public: + benchmark_set(const std::string& label, + const std::string& title) + : id('A'), + label(label), + title(title) + {} + + ~benchmark_set() + { + for (auto it=results.begin(); it!=results.end(); ++it) + delete (*it); + } + + void add_results(const std::string& label, + const std::string& title, + const std::vector< std::array >& results) + { + this->results.emplace_back(new result_set(label+std::string(1,id++), title, results)); + } + + const std::string& get_label() const + { return label; } + + const std::string& get_title() const + { return title; } + + const std::vector& get_results() const + { return results; } + + std::ostream &print(std::ostream &os) const + { + for (auto it=results.cbegin(); it!=results.cend(); ++it) + (*it)->print(os); + + os << "\\begin{tikzpicture}\n" + << "\\begin{axis}[\n" + << "width=\\textwidth,\n" + << "height=.5\\textwidth,\n" + << "legend pos=outer north east,\n" + << 
"symbolic x coords={"; - for (int run=0; runget_results().cbegin(); + it!=(*results.cbegin())->get_results().cend(); ++it) + os << (*it)[0] << (it!=(*results.cbegin())->get_results().cend()-1 ? "," : ""); - bandwidth += 1e-9*nbytes/stopwatch.elapsed(); + os << "},\n" + << "xlabel={OpenMP threads},\n" + << "ylabel={bandwidth in GB/s},\n" + << "title={" << title << "},\n" + << "]"; + + for (auto it=results.cbegin(); it!=results.cend(); ++it) + os << "\\addplot table[x=threads,y=" + << (*it)->get_label() + << "]{\\data" + << (*it)->get_label() + << "};\n"; + + os << "\\legend{"; + for (auto it=results.cbegin(); it!=results.cend(); ++it) + os << (*it)->get_title() << (it!=results.cend()-1 ? "," : ""); + os << "}\n" + << "\\end{axis}\n" + << "\\end{tikzpicture}\n"; + + return os; } - results.push_back( { static_cast(*it), bandwidth/(double)nruns, stopwatch.elapsed() } ); + private: + char id; + const std::string label,title; + std::vector< result_set* > results; + }; + +public: + ~benchmark_latex() + { + for (auto it=benchmarks.begin(); it!=benchmarks.end(); ++it) + delete (*it); + } + + benchmark_set* add_benchmark(const std::string& label, + const std::string& title) + { + benchmarks.emplace_back(new benchmark_set(label, title)); + return benchmarks.back(); + } + + const std::vector< benchmark_set* >& get_benchmarks() const + { return benchmarks; } + + std::ostream &print(std::ostream &os) const + { + os << "\\documentclass[tikz]{standalone}\n" + << "\\usepackage{pgfplots}\n" + << "\\begin{document}\n"; + + for (auto it=benchmarks.cbegin(); it!=benchmarks.cend(); ++it) + (*it)->print(os); + + os << "\\end{document}\n"; + return os; } - return results; -} +private: + std::vector< benchmark_set* > benchmarks; +}; + +/// Print (as string) operator +std::ostream &operator<<(std::ostream &os, const benchmark_latex& obj) +{ return obj.print(os); } /** - Benchmark: native C array memcopy -*/ + * Benchmark: native C array memcopy + */ template class benchmark_c_array_memcopy 
{ private: std::size_t n; + T *m_x, *m_y; public: benchmark_c_array_memcopy(std::size_t n) - : n(n) - {} - - std::size_t operator()() + : n(n), m_x(new T[n]), m_y(new T[n]) { - T* m_x = new T[n]; - T* m_y = new T[n]; - #pragma omp parallel for simd for (std::size_t i=0; i class benchmark_c_array_dotproduct { private: std::size_t n; + T *m_x, *m_y; public: benchmark_c_array_dotproduct(std::size_t n) - : n(n) - {} - - std::size_t operator()() + : n(n), m_x(new T[n]), m_y(new T[n]) { - T* m_x = new T[n]; - T* m_y = new T[n]; - #pragma omp parallel for simd for (std::size_t i=0; i +class benchmark_c_array_axpy +{ +private: + std::size_t n; + T *m_x, *m_y, *m_z; + +public: + benchmark_c_array_axpy(std::size_t n) + : n(n), m_x(new T[n]), m_y(new T[n]), m_z(new T[n]) + { +#pragma omp parallel for simd + for (std::size_t i=0; i +class benchmark_c_array_dense_matmul +{ +private: + std::size_t n; + T *m_A, *m_x, *m_y; + +public: + benchmark_c_array_dense_matmul(std::size_t n) + : n(n), m_A(new T[n*n]), m_x(new T[n]), m_y(new T[n]) + { +#pragma omp parallel for simd + for (std::size_t i=0; i class benchmark_eigen_vector_memcopy { private: std::size_t n; + gsVector x,y; public: benchmark_eigen_vector_memcopy(std::size_t n) - : n(n) - {} - + : n(n), x(n), y(n) + { + x.fill((T)0.0); + } + std::size_t operator()() { - gsVector x(n); - gsVector y(n); - - x.fill((T)1.0); - y = x; - + y.noalias() = x; + // Needed to make sure the compiler does not eliminate this code block T tmp = y[n-1]; GISMO_UNUSED(tmp); - - return sizeof(T) * 3 * n; + + return sizeof(T) * 2 * n; } }; /** - Benchmark: eigen vector dot-product -*/ + * Benchmark: Eigen vector dot-product + */ template class benchmark_eigen_vector_dotproduct { private: std::size_t n; + gsVector x, y; public: benchmark_eigen_vector_dotproduct(std::size_t n) - : n(n) - {} - + : n(n), x(n), y(n) + { + x.fill((T)0.0); + y.fill((T)0.0); + } + std::size_t operator()() { - gsVector x(n); - gsVector y(n); - - x.fill((T)1.0); - 
y.fill((T)1.0); + volatile T sum = y.dot(x); + GISMO_UNUSED(sum); + + return sizeof(T) * 2 * n; + } +}; + +/** + * Benchmark: Eigen vector AXPY + */ +template +class benchmark_eigen_vector_axpy +{ +private: + std::size_t n; + gsVector x, y, z; + +public: + benchmark_eigen_vector_axpy(std::size_t n) + : n(n), x(n), y(n), z(n) + { + x.fill((T)0.0); + y.fill((T)0.0); + } + + std::size_t operator()() + { + z.noalias() = (T)3.141*x + y; - T sum = x.dot(y); - // Needed to make sure the compiler does not eliminate this code block - T tmp = sum; + T tmp = z[n-1]; GISMO_UNUSED(tmp); - - return sizeof(T) * 4 * n; + + return sizeof(T) * 3 * n; } }; - + +/** + * Benchmark: Eigen dense matrix-vector multiplication + */ +template +class benchmark_eigen_vector_dense_matmul +{ +private: + std::size_t n; + gsMatrix A; + gsVector x, y; + +public: + benchmark_eigen_vector_dense_matmul(std::size_t n) + : n(n), A(n,n), x(n), y(n) + { + A.fill(0.0); + x.fill(0.0); + } + + std::size_t operator()() + { + y.noalias() = A*x; + + // Needed to make sure the compiler does not eliminate this code block + T tmp = y[n-1]; + GISMO_UNUSED(tmp); + + return sizeof(T) * (2*n*n + n); + } +}; + int main(int argc, char *argv[]) { //! 
[Parse command line] - std::vector nthreads; - std::vector bandwidths; - int n=1000000000; + std::vector nthreads, nsizes; + std::string filename; int nruns=1; - + gsCmdLine cmd("G+Smo performance benchmark."); - cmd.addMultiInt("t", "threads", - "Number of OpenMP threads to be used for the benchmark", nthreads); - cmd.addInt("n", "nlength", - "Number of unknowns in vector-type benchmarks", n); - cmd.addInt("r", "runs", - "Number of runs over which the results are averaged", nruns); - + cmd.addMultiInt("t", "threads", "Number of OpenMP threads to be used for the benchmark", nthreads); + cmd.addMultiInt("n", "nsizes", "Number of unknowns benchmarks", nsizes); + cmd.addInt("r", "runs", "Number of runs over which the results are averaged", nruns); + cmd.add_String("o", "output", "Name of the file to write the output", filename) + try { cmd.getValues(argc,argv); } catch (int rv) { return rv; } - + + // If empty fill with 1, 2, 4, ..., maximum number of OpenMP threads if (nthreads.empty()) { for(int i=1; i<=omp_get_max_threads(); i*=2) nthreads.push_back(i); } + // If empty fill with 100, 1000, 10000, 100000, 1000000 + if (nsizes.empty()) { + nsizes.push_back(1e2); + nsizes.push_back(1e3); + nsizes.push_back(1e4); + nsizes.push_back(1e5); + nsizes.push_back(1e6); + } + + benchmark_latex latex; + { - gsInfo << "=== Native C array memcopy ===\n"; - benchmark_c_array_memcopy benchmark(n); - auto results = benchmark_driver(nthreads, nruns, benchmark); - for (auto it=results.cbegin(); it!=results.cend(); ++it) - gsInfo << "[OMP=" << (*it)[0] << "] " << (*it)[1] << " GB/s\n"; + auto bm = latex.add_benchmark("memcopy", "memcopy benchmark"); + { + gsInfo << "=== Native C array memcopy ===\n"; + for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { + benchmark_c_array_memcopy benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark); + bm->add_results("nativememcopy", + "native("+std::to_string(*it)+")", + results); + } + } + + { + gsInfo << "=== gsVector 
memcopy ===\n"; + for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { + benchmark_eigen_vector_memcopy benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark); + bm->add_results("eigenmemcopy", + "eigen("+std::to_string(*it)+")", + results); + } + } } { - gsInfo << "== gsVector memcopy ===\n"; - benchmark_eigen_vector_memcopy benchmark(n); - auto results = benchmark_driver(nthreads, nruns, benchmark); - for (auto it=results.cbegin(); it!=results.cend(); ++it) - gsInfo << "[OMP=" << (*it)[0] << "] " << (*it)[1] << " GB/s\n"; + auto bm = latex.add_benchmark("dot-product", "dotprod"); + { + gsInfo << "=== Native C array dot-product ===\n"; + for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { + benchmark_c_array_dotproduct benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark); + bm->add_results("nativedotproduct", + "native("+std::to_string(*it)+")", + results); + } + } + + { + gsInfo << "=== gsVector dot-product ===\n"; + for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { + benchmark_eigen_vector_dotproduct benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark); + bm->add_results("eigendotproduct", + "eigen("+std::to_string(*it)+")", + results); + } + } } { - gsInfo << "=== Native C array dot-product ===\n"; - benchmark_c_array_dotproduct benchmark(n); - auto results = benchmark_driver(nthreads, nruns, benchmark); - for (auto it=results.cbegin(); it!=results.cend(); ++it) - gsInfo << "[OMP=" << (*it)[0] << "] " << (*it)[1] << " GB/s\n"; + auto bm = latex.add_benchmark("AXPY", "axpy"); + { + gsInfo << "=== Native C array AXPY ===\n"; + for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { + benchmark_c_array_axpy benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark); + bm->add_results("nativeaxpy", + "native("+std::to_string(*it)+")", + results); + } + } + + { + gsInfo << "=== gsVector AXPY ===\n"; + for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { + 
benchmark_eigen_vector_axpy benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark); + bm->add_results("eigenaxpy", + "eigen("+std::to_string(*it)+")", + results); + } + } } { - gsInfo << "== gsVector dot-product ===\n"; - benchmark_eigen_vector_dotproduct benchmark(n); - auto results = benchmark_driver(nthreads, nruns, benchmark); - for (auto it=results.cbegin(); it!=results.cend(); ++it) - gsInfo << "[OMP=" << (*it)[0] << "] " << (*it)[1] << " GB/s\n"; + auto bm = latex.add_benchmark("Dense matrix-vector multiply", "densemvmul"); + { + gsInfo << "=== Native C array dense matrix-vector multiplication ===\n"; + for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { + benchmark_c_array_dense_matmul benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark); + bm->add_results("nativdensemvmul", + "native("+std::to_string(*it)+")", + results); + } + } + + { + gsInfo << "=== gsMatrix/gsVector dense matrix-vector multiplication ===\n"; + for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { + benchmark_eigen_vector_dense_matmul benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark); + bm->add_results("eigenmvmul", + "eigen("+std::to_string(*it)+")", + results); + } + } } + std::cout << latex << std::endl; + return EXIT_SUCCESS; } From af12677542980142fdfbcf95d4b4b836bf620ef5 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Tue, 30 Nov 2021 08:30:59 +0100 Subject: [PATCH 044/174] Updated performance benchmark --- examples/performance_benchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 60d1dde3f8..361b61368b 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -526,7 +526,7 @@ int main(int argc, char *argv[]) cmd.addMultiInt("t", "threads", "Number of OpenMP threads to be used for the benchmark", nthreads); cmd.addMultiInt("n", "nsizes", "Number of unknowns 
benchmarks", nsizes); cmd.addInt("r", "runs", "Number of runs over which the results are averaged", nruns); - cmd.add_String("o", "output", "Name of the file to write the output", filename) + cmd.addString("o", "output", "Name of the file to write the output", filename); try { cmd.getValues(argc,argv); } catch (int rv) { return rv; } From 0704922c036413abe82429ab50b48b138c0812bc Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Tue, 30 Nov 2021 08:43:48 +0100 Subject: [PATCH 045/174] Updated performance benchmark --- cmake/gsOptions.cmake | 2 ++ examples/performance_benchmark.cpp | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cmake/gsOptions.cmake b/cmake/gsOptions.cmake index 62e985122f..e8fbfe3d7a 100644 --- a/cmake/gsOptions.cmake +++ b/cmake/gsOptions.cmake @@ -20,8 +20,10 @@ if(EXISTS "${CMAKE_SOURCE_DIR}/.git") endif() message (" CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}") message (" CMAKE_C_COMPILER ${CMAKE_C_COMPILER}") +message (" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}") message (" CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER}") message (" CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD}") +message (" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") message (" GISMO_COEFF_TYPE ${GISMO_COEFF_TYPE}") message (" GISMO_INDEX_TYPE ${GISMO_INDEX_TYPE}") diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 361b61368b..e96bf8a991 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -52,8 +52,8 @@ benchmark_driver(const std::vector& nthreads, int nruns, T& benchmark) } } catch(...) { - std::exception_ptr p = std::current_exception(); - std::clog <<(p ? p.__cxa_exception_type()->name() : "null") << std::endl; + // std::exception_ptr p = std::current_exception(); + // std::clog <<(p ? 
p.__cxa_exception_type()->name() : "null") << std::endl; } return results; @@ -523,6 +523,8 @@ int main(int argc, char *argv[]) int nruns=1; gsCmdLine cmd("G+Smo performance benchmark."); + cmd.printVersion(); + cmd.addMultiInt("t", "threads", "Number of OpenMP threads to be used for the benchmark", nthreads); cmd.addMultiInt("n", "nsizes", "Number of unknowns benchmarks", nsizes); cmd.addInt("r", "runs", "Number of runs over which the results are averaged", nruns); From b6b7ad53623bc09db8b8ba2b3740f5084b9ab145 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Tue, 30 Nov 2021 11:29:25 +0100 Subject: [PATCH 046/174] Updated performance benchmark --- examples/performance_benchmark.cpp | 330 ++++++++++++++++++++--------- 1 file changed, 226 insertions(+), 104 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index e96bf8a991..0a8f871575 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -19,42 +19,72 @@ using namespace gismo; //! 
[Include namespace] +enum class benchmark_metric { + bandwidth_kb_sec, + bandwidth_mb_sec, + bandwidth_gb_sec, + bandwidth_tb_sec, + perf_kflop_sec, + perf_mflop_sec, + perf_gflop_sec, + perf_tflop_sec, + runtime_sec, +}; + /** * Benchmark: driver function */ template -std::vector< std::array > -benchmark_driver(const std::vector& nthreads, int nruns, T& benchmark) +std::vector< std::array > +benchmark_driver(const std::vector& nthreads, int nruns, T& benchmark, benchmark_metric metric) { gsStopwatch stopwatch; - std::size_t nbytes; - double bandwidth; + std::size_t benchmark_result; + double benchmark_metric, benchmark_runtime; - std::vector< std::array > results; + std::vector< std::array > results; try { for (auto it=nthreads.cbegin(); it!=nthreads.cend(); ++it) { omp_set_num_threads(*it); - bandwidth = 0.0; + benchmark_runtime = 0.0; + benchmark_metric = 0.0; for (int run=0; run(*it), - bandwidth/(double)nruns, - stopwatch.elapsed() } ); + results.push_back( { static_cast(*it) /* number of OpenMP threads */, + benchmark_runtime/(double)nruns /* averaged elapsed time in seconds */, + benchmark_metric/(double)nruns /* averaged benchmark metric */, + (double)metric} /* benchmark metric */ ); } - } - catch(...) { - // std::exception_ptr p = std::current_exception(); - // std::clog <<(p ? p.__cxa_exception_type()->name() : "null") << std::endl; - } + } catch(...) 
{} return results; } @@ -73,7 +103,7 @@ class benchmark_latex public: result_set(const std::string& label, const std::string& title, - const std::vector< std::array >& results) + const std::vector< std::array >& results) : label(label), title(title), results(results) @@ -86,7 +116,7 @@ class benchmark_latex const std::string& get_title() const { return title; } - const std::vector< std::array >& get_results() const + const std::vector< std::array >& get_results() const { return results; } std::ostream &print(std::ostream &os) const @@ -95,7 +125,7 @@ class benchmark_latex << "threads & " << label << " \\\\\n"; for (auto it=results.cbegin(); it!=results.cend(); ++it) - os << (*it)[0] << "&" << (*it)[1] << "\\\\\n"; + os << (*it)[0] << "&" << (*it)[2] << "\\\\\n"; os << "}\\data" << label << "\n"; @@ -104,7 +134,7 @@ class benchmark_latex private: const std::string label, title; - std::vector< std::array > results; + std::vector< std::array > results; }; /** @@ -128,7 +158,7 @@ class benchmark_latex void add_results(const std::string& label, const std::string& title, - const std::vector< std::array >& results) + const std::vector< std::array >& results) { this->results.emplace_back(new result_set(label+std::string(1,id++), title, results)); } @@ -149,19 +179,53 @@ class benchmark_latex os << "\\begin{tikzpicture}\n" << "\\begin{axis}[\n" + << "name=MyAxis,\n" << "width=\\textwidth,\n" << "height=.5\\textwidth,\n" << "legend pos=outer north east,\n" + << "symbolic x coords={"; for (auto it=(*results.cbegin())->get_results().cbegin(); it!=(*results.cbegin())->get_results().cend(); ++it) os << (*it)[0] << (it!=(*results.cbegin())->get_results().cend()-1 ? 
"," : ""); - os << "},\n" - << "xlabel={OpenMP threads},\n" - << "ylabel={bandwidth in GB/s},\n" - << "title={" << title << "},\n" + + << "xlabel={OpenMP threads},\n"; + + switch((benchmark_metric)(*(*results.cbegin())->get_results().cbegin())[4]) { + case benchmark_metric::bandwidth_kb_sec: + os << "ylabel={Bandwidth in KB/s},\n"; + break; + case benchmark_metric::bandwidth_mb_sec: + os << "ylabel={Bandwidth in MB/s},\n"; + break; + case benchmark_metric::bandwidth_gb_sec: + os << "ylabel={Bandwidth in GB/s},\n"; + break; + case benchmark_metric::bandwidth_tb_sec: + os << "ylabel={Bandwidth in TB/s},\n"; + break; + case benchmark_metric::perf_kflop_sec: + os << "ylabel={Berformance in kFLOP/s},\n"; + break; + case benchmark_metric::perf_mflop_sec: + os << "ylabel={Berformance in mFLOP/s},\n"; + break; + case benchmark_metric::perf_gflop_sec: + os << "ylabel={Berformance in gFLOP/s},\n"; + break; + case benchmark_metric::perf_tflop_sec: + os << "ylabel={Berformance in tFLOP/s},\n"; + break; + case benchmark_metric::runtime_sec: + os << "ylabel={Runtime in seconds},\n"; + break; + default: + throw std::runtime_error("Unsupported metric"); + } + + os << "title={" << title << "},\n" << "]"; for (auto it=results.cbegin(); it!=results.cend(); ++it) @@ -175,7 +239,17 @@ class benchmark_latex for (auto it=results.cbegin(); it!=results.cend(); ++it) os << (*it)->get_title() << (it!=results.cend()-1 ? "," : ""); os << "}\n" + << "\\end{axis}\n" + + << "\\node[below right, align=left, text=black]\n" + << "at ($(MyAxis.south west)+(0,-30pt)$) {%\n" + << "G+Smo " << GISMO_VERSION + << ", Eigen " << EIGEN_WORLD_VERSION + << "." << EIGEN_MAJOR_VERSION + << "." 
<< EIGEN_MINOR_VERSION << "\\\\\n" + << "And another line of text here};\n" + << "\\end{tikzpicture}\n"; return os; @@ -208,7 +282,8 @@ class benchmark_latex { os << "\\documentclass[tikz]{standalone}\n" << "\\usepackage{pgfplots}\n" - << "\\begin{document}\n"; + << "\\begin{document}\n" + << "\\usetikzlibrary{calc}\n"; for (auto it=benchmarks.cbegin(); it!=benchmarks.cend(); ++it) (*it)->print(os); @@ -382,9 +457,9 @@ class benchmark_c_array_dense_matmul std::size_t operator()() { -#pragma omp parallel for simd for (std::size_t i=0; i nthreads, nsizes; - std::string filename; + std::vector nthreads, ssizes, dsizes, vsizes; + std::string fn; int nruns=1; gsCmdLine cmd("G+Smo performance benchmark."); cmd.printVersion(); - - cmd.addMultiInt("t", "threads", "Number of OpenMP threads to be used for the benchmark", nthreads); - cmd.addMultiInt("n", "nsizes", "Number of unknowns benchmarks", nsizes); + cmd.addInt("r", "runs", "Number of runs over which the results are averaged", nruns); - cmd.addString("o", "output", "Name of the file to write the output", filename); + cmd.addMultiInt("d", "dsizes", "Number of unknowns in dense matrix benchmarks", dsizes); + cmd.addMultiInt("s", "ssizes", "Number of unknowns in sparse matrix benchmarks", ssizes); + cmd.addMultiInt("t", "threads", "Number of OpenMP threads to be used for the benchmark", nthreads); + cmd.addMultiInt("v", "vsizes", "Number of unknowns in vector benchmarks", vsizes); + cmd.addString("o", "output", "Name of the output file", fn); try { cmd.getValues(argc,argv); } catch (int rv) { return rv; } @@ -538,118 +615,163 @@ int main(int argc, char *argv[]) nthreads.push_back(i); } - // If empty fill with 100, 1000, 10000, 100000, 1000000 - if (nsizes.empty()) { - nsizes.push_back(1e2); - nsizes.push_back(1e3); - nsizes.push_back(1e4); - nsizes.push_back(1e5); - nsizes.push_back(1e6); + // If empty fill with 10, 100, 1.000, 10.000 + if (dsizes.empty()) { + dsizes.push_back(1e1); + dsizes.push_back(1e2); + 
dsizes.push_back(1e3); + dsizes.push_back(1e4); + } + + // If empty fill with 100, 1.000, 10.000, 100.000, 1.000.000 + if (ssizes.empty()) { + ssizes.push_back(1e2); + ssizes.push_back(1e3); + ssizes.push_back(1e4); + ssizes.push_back(1e5); + ssizes.push_back(1e6); } - benchmark_latex latex; + // If empty fill with 100, 1.000, 10.000, 100.000, 1.000.000 + if (vsizes.empty()) { + vsizes.push_back(1e2); + vsizes.push_back(1e3); + vsizes.push_back(1e4); + vsizes.push_back(1e5); + vsizes.push_back(1e6); + } + benchmark_latex latex; + { - auto bm = latex.add_benchmark("memcopy", "memcopy benchmark"); + auto bm = latex.add_benchmark("memcopy", "memory copy"); { - gsInfo << "=== Native C array memcopy ===\n"; - for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { - benchmark_c_array_memcopy benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark); - bm->add_results("nativememcopy", - "native("+std::to_string(*it)+")", - results); + gsInfo << "=== Native C array memcopy\n"; + for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { + gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." : "\n") << std::flush; + try { + benchmark_c_array_memcopy benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); + bm->add_results("nativememcopy", + "native("+std::to_string(*it)+")", + results); + } catch(...) { gsInfo << "failed!"; } } } { - gsInfo << "=== gsVector memcopy ===\n"; - for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { - benchmark_eigen_vector_memcopy benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark); - bm->add_results("eigenmemcopy", - "eigen("+std::to_string(*it)+")", - results); + gsInfo << "=== gsVector memcopy\n"; + for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { + gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." 
: "\n") << std::flush; + try { + benchmark_eigen_vector_memcopy benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); + bm->add_results("eigenmemcopy", + "eigen("+std::to_string(*it)+")", + results); + } catch(...) { gsInfo << "failed!"; } } } } { - auto bm = latex.add_benchmark("dot-product", "dotprod"); + auto bm = latex.add_benchmark("dotprod", "dot-product"); { - gsInfo << "=== Native C array dot-product ===\n"; - for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { - benchmark_c_array_dotproduct benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark); - bm->add_results("nativedotproduct", - "native("+std::to_string(*it)+")", - results); + gsInfo << "=== Native C array dot-product\n"; + for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { + gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." : "\n") << std::flush; + try { + benchmark_c_array_dotproduct benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); + bm->add_results("nativedotproduct", + "native("+std::to_string(*it)+")", + results); + } catch(...) { gsInfo << "failed!"; } } } { - gsInfo << "=== gsVector dot-product ===\n"; - for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { - benchmark_eigen_vector_dotproduct benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark); - bm->add_results("eigendotproduct", - "eigen("+std::to_string(*it)+")", - results); + gsInfo << "=== gsVector dot-product\n"; + for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { + gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." : "\n") << std::flush; + try { + benchmark_eigen_vector_dotproduct benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); + bm->add_results("eigendotproduct", + "eigen("+std::to_string(*it)+")", + results); + } catch(...) 
{ gsInfo << "failed!"; } } } } { - auto bm = latex.add_benchmark("AXPY", "axpy"); + auto bm = latex.add_benchmark("axpy", "axpy"); { - gsInfo << "=== Native C array AXPY ===\n"; - for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { - benchmark_c_array_axpy benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark); - bm->add_results("nativeaxpy", - "native("+std::to_string(*it)+")", - results); + gsInfo << "=== Native C array AXPY\n"; + for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { + gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." : "\n") << std::flush; + try { + benchmark_c_array_axpy benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); + bm->add_results("nativeaxpy", + "native("+std::to_string(*it)+")", + results); + } catch(...) { gsInfo << "failed!"; } } } { - gsInfo << "=== gsVector AXPY ===\n"; - for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { - benchmark_eigen_vector_axpy benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark); - bm->add_results("eigenaxpy", - "eigen("+std::to_string(*it)+")", - results); + gsInfo << "=== gsVector AXPY\n"; + for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { + gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." : "\n") << std::flush; + try { + benchmark_eigen_vector_axpy benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); + bm->add_results("eigenaxpy", + "eigen("+std::to_string(*it)+")", + results); + } catch(...) 
{ gsInfo << "failed!"; } } } } { - auto bm = latex.add_benchmark("Dense matrix-vector multiply", "densemvmul"); + auto bm = latex.add_benchmark("densemvmul", "Dense matrix-vector multiply"); { - gsInfo << "=== Native C array dense matrix-vector multiplication ===\n"; - for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { - benchmark_c_array_dense_matmul benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark); - bm->add_results("nativdensemvmul", - "native("+std::to_string(*it)+")", - results); + gsInfo << "=== Native C array dense matrix-vector multiplication\n"; + for (auto it=dsizes.cbegin(); it!=dsizes.cend(); ++it) { + gsInfo << (*it) << (it!=dsizes.cend()-1 ? "." : "\n") << std::flush; + try { + benchmark_c_array_dense_matmul benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); + bm->add_results("nativdensemvmul", + "native("+std::to_string(*it)+")", + results); + } catch(...) { gsInfo << "failed!"; } } } { - gsInfo << "=== gsMatrix/gsVector dense matrix-vector multiplication ===\n"; - for (auto it=nsizes.cbegin(); it!=nsizes.cend(); ++it) { - benchmark_eigen_vector_dense_matmul benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark); - bm->add_results("eigenmvmul", - "eigen("+std::to_string(*it)+")", - results); + gsInfo << "=== gsMatrix/gsVector dense matrix-vector multiplication\n"; + for (auto it=dsizes.cbegin(); it!=dsizes.cend(); ++it) { + gsInfo << (*it) << (it!=dsizes.cend()-1 ? "." : "\n") << std::flush; + try { + benchmark_eigen_vector_dense_matmul benchmark(*it); + auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); + bm->add_results("eigenmvmul", + "eigen("+std::to_string(*it)+")", + results); + } catch(...) 
{ gsInfo << "failed!"; } } } } - - std::cout << latex << std::endl; + if (fn.empty()) + gsInfo << latex << "\n"; + else { + //gsFileData<> fd; fd << latex << "\n"; fd.save(fn); + } + return EXIT_SUCCESS; } From 31bbf2ac2492bbf16d921bae7185137b256412ba Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Tue, 30 Nov 2021 11:29:54 +0100 Subject: [PATCH 047/174] Added distinction between Clang and Apple Clang in printVersion() --- src/gsIO/gsCmdLine.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/gsIO/gsCmdLine.cpp b/src/gsIO/gsCmdLine.cpp index 14c0f2f384..e3a744181f 100644 --- a/src/gsIO/gsCmdLine.cpp +++ b/src/gsIO/gsCmdLine.cpp @@ -433,7 +433,11 @@ void gsCmdLine::printVersion() #elsif _MSC_VER >= 1600 gsInfo << "MSVC "<<_MSC_FULL_VER <<" ("<<"201103L" <<", "; #elif defined(__clang__ ) +#if defined(__apple_build_version__) + gsInfo << "Apple Clang "<<__clang_version__<<" ("<<__cplusplus <<", "; +#else gsInfo << "Clang "<<__clang_version__<<" ("<<__cplusplus <<", "; +#endif #elif defined(_INTEL_COMPILER) gsInfo << "Intel C++ "<<__INTEL_COMPILER<<" ("<<__cplusplus <<", "; #elif defined(__MINGW64__) From 1dd58e930dc5102ce578e30e570d118b31e26ff8 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Fri, 3 Dec 2021 14:12:11 +0100 Subject: [PATCH 048/174] Added function to specify the number of digits in util::to_string --- src/gsUtils/gsUtils.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/gsUtils/gsUtils.h b/src/gsUtils/gsUtils.h index 17900d6586..01647cc89b 100644 --- a/src/gsUtils/gsUtils.h +++ b/src/gsUtils/gsUtils.h @@ -19,6 +19,7 @@ #include #include #include +#include #ifdef __GNUC__ #include @@ -59,6 +60,16 @@ std::string to_string(const C & value) return convert.str(); }
+/// \brief Converts value to string in scientific notation with precision \a digits, assuming "operator<<" defined on C
+/// \ingroup Utils
+template
+std::string to_string(const C & value, int digits)
+{
+    std::ostringstream convert;
+    convert << std::scientific << std::setprecision(digits) << value;
+    return convert.str();
+}
+
 /// \brief Checks if a string \a haystack begins with the string \a needle /// \ingroup Utils inline bool starts_with( const std::string & haystack, const std::string & needle ) From 7c43e1c57498af4733027677553cf74bbfac3ca1 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Fri, 3 Dec 2021 14:12:42 +0100 Subject: [PATCH 049/174] Added OpenMP dummy implementation to be used if no omp.h is available --- src/gsCore/gsOpenMP.cpp | 447 ++++++++++++++++++++++++++++++++++++++++ src/gsCore/gsOpenMP.h | 201 ++++++++++++++++++ 2 files changed, 648 insertions(+) create mode 100644 src/gsCore/gsOpenMP.cpp create mode 100644 src/gsCore/gsOpenMP.h diff --git a/src/gsCore/gsOpenMP.cpp b/src/gsCore/gsOpenMP.cpp new file mode 100644 index 0000000000..e06641668f --- /dev/null +++ b/src/gsCore/gsOpenMP.cpp @@ -0,0 +1,447 @@ +/** @file gsOpenMP.cpp + + @brief Implementation of OpenMP stub routines to be used when libomp is not available + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M.
Moller +*/ + +#if !defined(_OPENMP) + +#include + +void omp_set_num_threads(int num_threads) +{} + +int omp_get_num_threads(void) +{ + return 1; +} + +int omp_get_max_threads(void) +{ + return 1; +} + +int omp_get_thread_num(void) +{ + return 0; +} + +int omp_get_num_procs(void) +{ + return 1; +} + +int omp_in_parallel(void) +{ + return 0; +} + +void omp_set_dynamic(int dynamic_threads) +{} + +int omp_get_dynamic(void) +{ + return 0; +} + +int omp_get_cancellation(void) +{ + return 0; +} + +void omp_set_nested(int nested) +{} + +int omp_get_nested(void) +{ + return 0; +} + +void omp_set_schedule(omp_sched_t kind, int chunk_size) +{} + +void omp_get_schedule(omp_sched_t *kind, int *chunk_size) +{ + *kind = omp_sched_static; + *chunk_size = 0; +} + +int omp_get_thread_limit(void) +{ + return 1; +} + +void omp_set_max_active_levels(int max_active_levels) +{} + +int omp_get_max_active_levels(void) +{ + return 0; +} + +int omp_get_level(void) +{ + return 0; +} + +int omp_get_ancestor_thread_num(int level) +{ + if (level == 0) + { + return 0; + } + else + { + return -1; + } +} + +int omp_get_team_size(int level) +{ + if (level == 0) + { + return 1; + } + else + { + return -1; + } +} + +int omp_get_active_level(void) +{ + return 0; +} + +int omp_in_final(void) +{ + return 1; +} + +omp_proc_bind_t omp_get_proc_bind(void) +{ + return omp_proc_bind_false; +} + +int omp_get_num_places(void) +{ + return 0; +} + +int omp_get_place_num_procs(int place_num) +{ + return 0; +} + +void omp_get_place_proc_ids(int place_num, int *ids) +{} + +int omp_get_place_num(void) +{ + return -1; +} + +int omp_get_partition_num_places(void) +{ + return 0; +} + +void omp_get_partition_place_nums(int *place_nums) +{} + +void omp_set_default_device(int device_num) +{} + +int omp_get_default_device(void) +{ + return 0; +} + +int omp_get_num_devices(void) +{ + return 0; +} + +int omp_get_num_teams(void) +{ + return 1; +} + +int omp_get_team_num(void) +{ + return 0; +} + +int 
omp_is_initial_device(void)
+{
+    return 1;
+}
+
+int omp_get_initial_device(void)
+{
+    return -10;
+}
+
+int omp_get_max_task_priority(void)
+{
+    return 0;
+}
+
+void omp_init_lock(omp_lock_t *arg)
+{
+    arg->lock = UNLOCKED;
+}
+
+void omp_init_lock_with_hint(omp_lock_t *arg, omp_lock_hint_t hint)
+{
+    omp_init_lock(arg);
+}
+
+void omp_destroy_lock(omp_lock_t *arg)
+{
+    arg->lock = INIT;
+}
+
+void omp_set_lock(omp_lock_t *arg) // stub: acquires a free lock, aborts the program otherwise
+{
+    if (arg->lock == UNLOCKED)
+    {
+        arg->lock = LOCKED;
+    }
+    else if (arg->lock == LOCKED)
+    {
+        fprintf(stderr, "error: deadlock in using lock variable\n");
+        exit(1);
+    }
+    else { // neither UNLOCKED nor LOCKED: report it like omp_unset_lock/omp_test_lock instead of exiting silently
+        fprintf(stderr, "error: lock not initialized\n");
+        exit(1);
+    }
+}
+
+void omp_unset_lock(omp_lock_t *arg)
+{
+    if (arg->lock == LOCKED)
+    {
+        arg->lock = UNLOCKED;
+    }
+    else if (arg->lock == UNLOCKED)
+    {
+        fprintf(stderr, "error: lock not set\n");
+        exit(1);
+    }
+    else
+    {
+        fprintf(stderr, "error: lock not initialized\n");
+        exit(1);
+    }
+}
+
+int omp_test_lock(omp_lock_t *arg)
+{
+    if (arg->lock == UNLOCKED)
+    {
+        arg->lock = LOCKED;
+        return 1;
+    }
+    else if (arg->lock == LOCKED)
+    {
+        return 0;
+    }
+    else {
+        fprintf(stderr, "error: lock not initialized\n");
+        exit(1);
+    }
+}
+
+void omp_init_nest_lock(omp_nest_lock_t *arg)
+{
+    arg->owner = NOOWNER;
+    arg->count = 0;
+}
+
+void omp_init_nest_lock_with_hint(omp_nest_lock_t *arg,
+                                  omp_lock_hint_t hint)
+{
+    omp_init_nest_lock(arg);
+}
+
+void omp_destroy_nest_lock(omp_nest_lock_t *arg)
+{
+    arg->owner = NOOWNER;
+    arg->count = UNLOCKED;
+}
+
+void omp_set_nest_lock(omp_nest_lock_t *arg)
+{
+    if (arg->owner == MASTER && arg->count >= 1)
+    {
+        arg->count++;
+    }
+    else if (arg->owner == NOOWNER && arg->count == 0)
+    {
+        arg->owner = MASTER;
+        arg->count = 1;
+    }
+    else
+    {
+        fprintf(stderr, "error: lock corrupted or not initialized\n");
+        exit(1);
+    }
+}
+
+void omp_unset_nest_lock(omp_nest_lock_t *arg)
+{
+    if (arg->owner == MASTER && arg->count >= 1)
+    {
+        arg->count--;
+        if (arg->count == 0)
+        {
+            arg->owner = NOOWNER;
+        }
+    }
+    else if (arg->owner
== NOOWNER && arg->count == 0) + { + fprintf(stderr, "error: lock not set\n"); + exit(1); + } + else + { + fprintf(stderr, "error: lock corrupted or not initialized\n"); + exit(1); + } +} + +int omp_test_nest_lock(omp_nest_lock_t *arg) +{ + omp_set_nest_lock(arg); + return arg->count; +} + +double omp_get_wtime(void) +{ + /* This function does not provide a working + * wallclock timer. Replace it with a version + * customized for the target machine. + */ + return 0.0; +} + +double omp_get_wtick(void) +{ + /* This function does not provide a working + * clock tick function. Replace it with + * a version customized for the target machine. + */ + return 365. * 86400.; +} + +void * omp_target_alloc(size_t size, int device_num) +{ + if (device_num != -10) + return NULL; + return malloc(size); +} + +void omp_target_free(void *device_ptr, int device_num) +{ + free(device_ptr); +} + +int omp_target_is_present(void *ptr, int device_num) +{ + return 1; +} + +int omp_target_memcpy(void *dst, void *src, size_t length, + size_t dst_offset, size_t src_offset, + int dst_device, int src_device) +{ + // only the default device is valid in a stub + if (dst_device != -10 || src_device != -10 + || ! dst || ! 
src ) + return EINVAL; + memcpy((char *)dst + dst_offset, + (char *)src + src_offset, + length); + return 0; +} + +int omp_target_memcpy_rect(void *dst, void *src, + size_t element_size, + int num_dims, + const size_t *volume, + const size_t *dst_offsets, + const size_t *src_offsets, + const size_t *dst_dimensions, + const size_t *src_dimensions, + int dst_device_num, int src_device_num) +{ + int ret=0; + // Both null, return number of dimensions supported, + // this stub supports an arbitrary number + if (dst == NULL && src == NULL) return INT_MAX; + + if (!volume || !dst_offsets || !src_offsets + || !dst_dimensions || !src_dimensions + || num_dims < 1 ) { + ret = EINVAL; + goto done; + } + if (num_dims == 1) { + ret = omp_target_memcpy(dst, src, + element_size * volume[0], + dst_offsets[0] * element_size, + src_offsets[0] * element_size, + dst_device_num, src_device_num); + if(ret) goto done; + } else { + size_t dst_slice_size = element_size; + size_t src_slice_size = element_size; + for (int i=1; i < num_dims; i++) { + dst_slice_size *= dst_dimensions[i]; + src_slice_size *= src_dimensions[i]; + } + size_t dst_off = dst_offsets[0] * dst_slice_size; + size_t src_off = src_offsets[0] * src_slice_size; + for (size_t i=0; i < volume[0]; i++) { + ret = omp_target_memcpy_rect( + (char *)dst + dst_off + dst_slice_size*i, + (char *)src + src_off + src_slice_size*i, + element_size, + num_dims - 1, + volume + 1, + dst_offsets + 1, + src_offsets + 1, + dst_dimensions + 1, + src_dimensions + 1, + dst_device_num, + src_device_num); + if (ret) goto done; + } + } + done: + return ret; +} + +int omp_target_associate_ptr(void *host_ptr, void *device_ptr, + size_t size, size_t device_offset, + int device_num) +{ + // No association is possible because all host pointers + // are considered present + return EINVAL; +} + +int omp_target_disassociate_ptr(void *ptr, int device_num) +{ + return EINVAL; +} +#endif // !defined(_OPENMP) diff --git a/src/gsCore/gsOpenMP.h 
b/src/gsCore/gsOpenMP.h new file mode 100644 index 0000000000..cccedeb338 --- /dev/null +++ b/src/gsCore/gsOpenMP.h @@ -0,0 +1,201 @@ +/** @file gsOpenMP.h + + @brief OpenMP stub routines to be used when omp.h is not available + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#pragma once + +#ifdef _OPENMP + +#include + +#else + +#include +#include +#include +#include +#include + +void omp_set_num_threads(int num_threads); + +int omp_get_num_threads(void); + +int omp_get_max_threads(void); + +int omp_get_thread_num(void); + +int omp_get_num_procs(void); + +int omp_in_parallel(void); + +void omp_set_dynamic(int dynamic_threads); + +int omp_get_dynamic(void); + +int omp_get_cancellation(void); + +void omp_set_nested(int nested); + +int omp_get_nested(void); + +typedef enum omp_sched_t { + omp_sched_static = 1, + omp_sched_dynamic = 2, + omp_sched_guided = 3, + omp_sched_auto = 4, + omp_sched_monotonic = 0x80000000 +} omp_sched_t; + +void omp_set_schedule(omp_sched_t kind, int chunk_size); + +void omp_get_schedule(omp_sched_t *kind, int *chunk_size); + +int omp_get_thread_limit(void); + +void omp_set_max_active_levels(int max_active_levels); + +int omp_get_max_active_levels(void); + +int omp_get_level(void); + +int omp_get_ancestor_thread_num(int level); + +int omp_get_team_size(int level); + +int omp_get_active_level(void); + +int omp_in_final(void); + +typedef enum omp_proc_bind_t { + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +} omp_proc_bind_t; + +omp_proc_bind_t omp_get_proc_bind(void); + +int omp_get_num_places(void); + +int omp_get_place_num_procs(int place_num); + +void omp_get_place_proc_ids(int place_num, int *ids); + +int omp_get_place_num(void); 
+ +int omp_get_partition_num_places(void); + +void omp_get_partition_place_nums(int *place_nums); + +void omp_set_default_device(int device_num); + +int omp_get_default_device(void); + +int omp_get_num_devices(void); + +int omp_get_num_teams(void); + +int omp_get_team_num(void); + +int omp_is_initial_device(void); + +int omp_get_initial_device(void); + +int omp_get_max_task_priority(void); + +typedef struct omp_lock_t { + int lock; +} omp_lock_t; + +enum { UNLOCKED = -1, INIT, LOCKED }; + +void omp_init_lock(omp_lock_t *arg); + +typedef enum omp_sync_hint_t { + omp_sync_hint_none = 0, + omp_lock_hint_none = omp_sync_hint_none, + omp_sync_hint_uncontended = 1, + omp_lock_hint_uncontended = omp_sync_hint_uncontended, + omp_sync_hint_contended = (1<<1), + omp_lock_hint_contended = omp_sync_hint_contended, + omp_sync_hint_nonspeculative = (1<<2), + omp_lock_hint_nonspeculative = omp_sync_hint_nonspeculative, + omp_sync_hint_speculative = (1<<3), + omp_lock_hint_speculative = omp_sync_hint_speculative, + kmp_lock_hint_hle = (1<<16), + kmp_lock_hint_rtm = (1<<17), + kmp_lock_hint_adaptive = (1<<18) +} omp_sync_hint_t; + +typedef omp_sync_hint_t omp_lock_hint_t; + +void omp_init_lock_with_hint(omp_lock_t *arg, omp_lock_hint_t hint); + +void omp_destroy_lock(omp_lock_t *arg); + +void omp_set_lock(omp_lock_t *arg); + +void omp_unset_lock(omp_lock_t *arg); + +int omp_test_lock(omp_lock_t *arg); + +typedef struct omp_nest_lock_t { + int owner; + int count; +} omp_nest_lock_t; + +enum { NOOWNER = -1, MASTER = 0 }; + +void omp_init_nest_lock(omp_nest_lock_t *arg); + +void omp_init_nest_lock_with_hint(omp_nest_lock_t *arg, + omp_lock_hint_t hint); + +void omp_destroy_nest_lock(omp_nest_lock_t *arg); + +void omp_set_nest_lock(omp_nest_lock_t *arg); + +void omp_unset_nest_lock(omp_nest_lock_t *arg); + +int omp_test_nest_lock(omp_nest_lock_t *arg); + +double omp_get_wtime(void); + +double omp_get_wtick(void); + +void * omp_target_alloc(size_t size, int device_num); + +void 
omp_target_free(void *device_ptr, int device_num); + +int omp_target_is_present(void *ptr, int device_num); + +int omp_target_memcpy(void *dst, void *src, size_t length, + size_t dst_offset, size_t src_offset, + int dst_device, int src_device); + +int omp_target_memcpy_rect(void *dst, void *src, + size_t element_size, + int num_dims, + const size_t *volume, + const size_t *dst_offsets, + const size_t *src_offsets, + const size_t *dst_dimensions, + const size_t *src_dimensions, + int dst_device_num, int src_device_num); + +int omp_target_associate_ptr(void *host_ptr, void *device_ptr, + size_t size, size_t device_offset, + int device_num); + +int omp_target_disassociate_ptr(void *ptr, int device_num); +#endif // _OPENMP From 80cdf761c711d71f75ddf83a32b192bc0cb14cb6 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Fri, 3 Dec 2021 14:13:26 +0100 Subject: [PATCH 050/174] Added functionality to retrieve compiler and system information --- src/gsIO/gsCmdLine.cpp | 594 ++++++++++++++++++++++++++++++++++++++--- src/gsIO/gsCmdLine.h | 37 +++ 2 files changed, 595 insertions(+), 36 deletions(-) diff --git a/src/gsIO/gsCmdLine.cpp b/src/gsIO/gsCmdLine.cpp index e3a744181f..ac5decea57 100644 --- a/src/gsIO/gsCmdLine.cpp +++ b/src/gsIO/gsCmdLine.cpp @@ -423,62 +423,584 @@ void gsCmdLine::printVersion() gsInfo << "\n"; gsInfo << " G+Smo \n"; gsInfo << " Geometry plus Simulation modules\n"; - gsInfo << " version "<< GISMO_VERSION<<"\n"; - gsInfo << "Compiled by "; -//https://sourceforge.net/p/predef/wiki/Compilers, see also boost/predef.h -#if defined(_MSC_VER) && _MSC_VER < 1600 - gsInfo << "MSVC "<<_MSC_FULL_VER <<" ("<<"199711L" <<", "; -#elsif _MSC_VER >= 1900 - gsInfo << "MSVC "<<_MSC_FULL_VER <<" ("<<_MSVC_LANG <<", "; -#elsif _MSC_VER >= 1600 - gsInfo << "MSVC "<<_MSC_FULL_VER <<" ("<<"201103L" <<", "; -#elif defined(__clang__ ) -#if defined(__apple_build_version__) - gsInfo << "Apple Clang "<<__clang_version__<<" ("<<__cplusplus <<", "; + gsInfo << " version "<< 
getGismoVersion() << "\n"; + gsInfo << "Compiled by " << getCompilerVersion() + << " (" << getCppVersion() + << ", " << getStdLibVersion() + << ", eigen " << getEigenVersion() + << (getExtraLibsVersion().empty() ? ")\n" : getExtraLibsVersion()+")\n"); + gsInfo << "web: http://github.com/gismo\n"; +} + +std::string gsCmdLine::getGismoVersion() +{ + return util::to_string(GISMO_VERSION); +} + +std::string gsCmdLine::getEigenVersion() +{ + return util::to_string(EIGEN_WORLD_VERSION)+"." + + util::to_string(EIGEN_MAJOR_VERSION)+"." + + util::to_string(EIGEN_MINOR_VERSION); +} + +std::string gsCmdLine::getCompilerVersion() +{ + // This code is copied from the CMakeCXXCompilerId.cpp file that was + // automatically generated with CMake 3.21.4 + + // The following two macros have been modified as we do not want to + // return the compiler version in the specific CMake format +#define DEC(n) n +#define HEX(n) n + +/* Version number components: V=Version, R=Revision, P=Patch + Version date components: YYYY=Year, MM=Month, DD=Day */ + +#if defined(__COMO__) +# define COMPILER_ID "Comeau" + /* __COMO_VERSION__ = VRR */ +# define COMPILER_VERSION_MAJOR DEC(__COMO_VERSION__ / 100) +# define COMPILER_VERSION_MINOR DEC(__COMO_VERSION__ % 100) + +#elif defined(__INTEL_COMPILER) || defined(__ICC) +# define COMPILER_ID "Intel" +# if defined(_MSC_VER) +# define SIMULATE_ID "MSVC" +# endif +# if defined(__GNUC__) +# define SIMULATE_ID "GNU" +# endif + /* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later, + except that a few beta releases use the old format with V=2021. 
*/ +# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111 +# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100) +# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10) +# if defined(__INTEL_COMPILER_UPDATE) +# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE) +# else +# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER % 10) +# endif +# else +# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER) +# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE) + /* The third version component from --version is an update index, + but no macro is provided for it. */ +# define COMPILER_VERSION_PATCH DEC(0) +# endif +# if defined(__INTEL_COMPILER_BUILD_DATE) + /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */ +# define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE) +# endif +# if defined(_MSC_VER) + /* _MSC_VER = VVRR */ +# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) +# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) +# endif +# if defined(__GNUC__) +# define SIMULATE_VERSION_MAJOR DEC(__GNUC__) +# elif defined(__GNUG__) +# define SIMULATE_VERSION_MAJOR DEC(__GNUG__) +# endif +# if defined(__GNUC_MINOR__) +# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__) +# endif +# if defined(__GNUC_PATCHLEVEL__) +# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__) +# endif + +#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER) +# define COMPILER_ID "IntelLLVM" +#if defined(_MSC_VER) +# define SIMULATE_ID "MSVC" +#endif +#if defined(__GNUC__) +# define SIMULATE_ID "GNU" +#endif +/* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and + * later. Look for 6 digit vs. 8 digit version number to decide encoding. + * VVVV is no smaller than the current year when a version is released. 
+ */ +#if __INTEL_LLVM_COMPILER < 1000000L +# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100) +# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10) +# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 10) #else - gsInfo << "Clang "<<__clang_version__<<" ("<<__cplusplus <<", "; +# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000) +# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100) +# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 100) #endif -#elif defined(_INTEL_COMPILER) - gsInfo << "Intel C++ "<<__INTEL_COMPILER<<" ("<<__cplusplus <<", "; -#elif defined(__MINGW64__) - gsInfo << "MinGW "<<__MINGW64_VERSION_MAJOR<<"."<<__MINGW64_VERSION_MINOR<<" ("<<__cplusplus <<", "; -#elif defined(__SUNPRO_CC) - gsInfo << "Solaris Studio "<<__SUNPRO_CC<<" ("<<__cplusplus <<", "; +#if defined(_MSC_VER) + /* _MSC_VER = VVRR */ +# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) +# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) +#endif +#if defined(__GNUC__) +# define SIMULATE_VERSION_MAJOR DEC(__GNUC__) #elif defined(__GNUG__) - gsInfo << "GNU GCC "<<__GNUC__<<"."<<__GNUC_MINOR__<<"."<<__GNUC_PATCHLEVEL__<<" ("<<__cplusplus <<", "; +# define SIMULATE_VERSION_MAJOR DEC(__GNUG__) +#endif +#if defined(__GNUC_MINOR__) +# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__) +#endif +#if defined(__GNUC_PATCHLEVEL__) +# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__) +#endif + +#elif defined(__PATHCC__) +# define COMPILER_ID "PathScale" +# define COMPILER_VERSION_MAJOR DEC(__PATHCC__) +# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__) +# if defined(__PATHCC_PATCHLEVEL__) +# define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__) +# endif + +#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__) +# define COMPILER_ID "Embarcadero" +# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF) +# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF) +# define 
COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__ & 0xFFFF) + +#elif defined(__BORLANDC__) +# define COMPILER_ID "Borland" + /* __BORLANDC__ = 0xVRR */ +# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8) +# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF) + +#elif defined(__WATCOMC__) && __WATCOMC__ < 1200 +# define COMPILER_ID "Watcom" + /* __WATCOMC__ = VVRR */ +# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100) +# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10) +# if (__WATCOMC__ % 10) > 0 +# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10) +# endif + +#elif defined(__WATCOMC__) +# define COMPILER_ID "OpenWatcom" + /* __WATCOMC__ = VVRP + 1100 */ +# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100) +# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10) +# if (__WATCOMC__ % 10) > 0 +# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10) +# endif + +#elif defined(__SUNPRO_CC) +# define COMPILER_ID "SunPro" +# if __SUNPRO_CC >= 0x5100 + /* __SUNPRO_CC = 0xVRRP */ +# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>12) +# define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xFF) +# define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC & 0xF) +# else + /* __SUNPRO_CC = 0xVRP */ +# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>8) +# define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xF) +# define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC & 0xF) +# endif + +#elif defined(__HP_aCC) +# define COMPILER_ID "HP" + /* __HP_aCC = VVRRPP */ +# define COMPILER_VERSION_MAJOR DEC(__HP_aCC/10000) +# define COMPILER_VERSION_MINOR DEC(__HP_aCC/100 % 100) +# define COMPILER_VERSION_PATCH DEC(__HP_aCC % 100) + +#elif defined(__DECCXX) +# define COMPILER_ID "Compaq" + /* __DECCXX_VER = VVRRTPPPP */ +# define COMPILER_VERSION_MAJOR DEC(__DECCXX_VER/10000000) +# define COMPILER_VERSION_MINOR DEC(__DECCXX_VER/100000 % 100) +# define COMPILER_VERSION_PATCH DEC(__DECCXX_VER % 10000) + +#elif defined(__IBMCPP__) && defined(__COMPILER_VER__) +# define 
COMPILER_ID "zOS" + /* __IBMCPP__ = VRP */ +# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100) +# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10) +# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10) + +#elif defined(__ibmxl__) && defined(__clang__) +# define COMPILER_ID "XLClang" +# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__) +# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__) +# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__) +# define COMPILER_VERSION_TWEAK DEC(__ibmxl_ptf_fix_level__) + + +#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ >= 800 +# define COMPILER_ID "XL" + /* __IBMCPP__ = VRP */ +# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100) +# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10) +# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10) + +#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ < 800 +# define COMPILER_ID "VisualAge" + /* __IBMCPP__ = VRP */ +# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100) +# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10) +# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10) + +#elif defined(__NVCOMPILER) +# define COMPILER_ID "NVHPC" +# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__) +# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__) +# if defined(__NVCOMPILER_PATCHLEVEL__) +# define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__) +# endif + +#elif defined(__PGI) +# define COMPILER_ID "PGI" +# define COMPILER_VERSION_MAJOR DEC(__PGIC__) +# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__) +# if defined(__PGIC_PATCHLEVEL__) +# define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__) +# endif + +#elif defined(_CRAYC) +# define COMPILER_ID "Cray" +# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR) +# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR) + +#elif defined(__TI_COMPILER_VERSION__) +# define COMPILER_ID "TI" + /* __TI_COMPILER_VERSION__ = VVVRRRPPP */ +# define COMPILER_VERSION_MAJOR 
DEC(__TI_COMPILER_VERSION__/1000000) +# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000 % 1000) +# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__ % 1000) + +#elif defined(__CLANG_FUJITSU) +# define COMPILER_ID "FujitsuClang" +# define COMPILER_VERSION_MAJOR DEC(__FCC_major__) +# define COMPILER_VERSION_MINOR DEC(__FCC_minor__) +# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__) +# define COMPILER_VERSION_INTERNAL_STR __clang_version__ + + +#elif defined(__FUJITSU) +# define COMPILER_ID "Fujitsu" +# if defined(__FCC_version__) +# define COMPILER_VERSION __FCC_version__ +# elif defined(__FCC_major__) +# define COMPILER_VERSION_MAJOR DEC(__FCC_major__) +# define COMPILER_VERSION_MINOR DEC(__FCC_minor__) +# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__) +# endif +# if defined(__fcc_version) +# define COMPILER_VERSION_INTERNAL DEC(__fcc_version) +# elif defined(__FCC_VERSION) +# define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION) +# endif + + +#elif defined(__ghs__) +# define COMPILER_ID "GHS" +/* __GHS_VERSION_NUMBER = VVVVRP */ +# ifdef __GHS_VERSION_NUMBER +# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100) +# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10) +# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER % 10) +# endif + +#elif defined(__SCO_VERSION__) +# define COMPILER_ID "SCO" + +#elif defined(__ARMCC_VERSION) && !defined(__clang__) +# define COMPILER_ID "ARMCC" +#if __ARMCC_VERSION >= 1000000 + /* __ARMCC_VERSION = VRRPPPP */ + # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000) + # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100) + # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000) #else - gsInfo << "C++ ("<<__cplusplus <<", "; + /* __ARMCC_VERSION = VRPPPP */ + # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000) + # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10) + # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000) 
#endif -#ifdef __INTEL_MKL__ - gsInfo << "MKL "<= 1400 + /* _MSC_FULL_VER = VVRRPPPPP */ +# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000) +# else + /* _MSC_FULL_VER = VVRRPPPP */ +# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000) +# endif +# endif +# if defined(_MSC_BUILD) +# define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD) +# endif + +#elif defined(__VISUALDSPVERSION__) || defined(__ADSPBLACKFIN__) || defined(__ADSPTS__) || defined(__ADSP21000__) +# define COMPILER_ID "ADSP" +#if defined(__VISUALDSPVERSION__) + /* __VISUALDSPVERSION__ = 0xVVRRPP00 */ +# define COMPILER_VERSION_MAJOR HEX(__VISUALDSPVERSION__>>24) +# define COMPILER_VERSION_MINOR HEX(__VISUALDSPVERSION__>>16 & 0xFF) +# define COMPILER_VERSION_PATCH HEX(__VISUALDSPVERSION__>>8 & 0xFF) +#endif + +#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC) +# define COMPILER_ID "IAR" +# if defined(__VER__) && defined(__ICCARM__) +# define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000) +# define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000) +# define COMPILER_VERSION_PATCH DEC((__VER__) % 1000) +# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__) +# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__)) +# define COMPILER_VERSION_MAJOR DEC((__VER__) / 100) +# define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100)) +# define COMPILER_VERSION_PATCH DEC(__SUBVERSION__) +# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__) +# endif + + +/* These compilers are either not known or too old to define an + identification macro. Try to identify the platform and guess that + it is the native compiler. 
*/ +#elif defined(__hpux) || defined(__hpua) +# define COMPILER_ID "HP" + +#else /* unknown compiler */ +# define COMPILER_ID "Unknown-Compiler" +#endif + + return util::to_string(COMPILER_ID) +#ifdef COMPILER_VERSION + +" "+util::to_string(COMPILER_VERSION); +#elif defined(COMPILER_VERSION_MAJOR) + +" "+util::to_string(COMPILER_VERSION_MAJOR) +# ifdef COMPILER_VERSION_MINOR + +"."+util::to_string(COMPILER_VERSION_MINOR) +# ifdef COMPILER_VERSION_PATCH + +"."+util::to_string(COMPILER_VERSION_PATCH) +# ifdef COMPILER_VERSION_TWEAK + +"."+util::to_string(COMPILER_VERSION_TWEAK) +# endif +# endif +# endif + ; +#endif + +#undef DEC +#undef HEX +#undef COMPILER_ID +#undef COMPILER_VERSION +#undef COMPILER_VERSION_MAJOR +#undef COMPILER_VERSION_MINOR +#undef COMPILER_VERSION_PATCH +#undef COMPILER_VERSION_TWEAK +#undef SIMULATE_VERSION_MAJOR +#undef SIMULATE_VERSION_MINOR +#undef SIMULATE_VERSION_PATCH +#undef SIMULATE_VERSION_TWEAK +} + +std::string gsCmdLine::getCppVersion() +{ +#if defined(_MSC_VER) && _MSC_VER < 1600 + return "C++ 199711L"; +#elsif _MSC_VER >= 1900 + return "C++ "+util::to_string(_MSVC_LANG); +#elsif _MSC_VER >= 1600 + return "C++ 201103L"; +#else + return "C++ "+util::to_string(__cplusplus); #endif +} +std::string gsCmdLine::getStdLibVersion() +{ #ifdef _LIBCPP_VERSION - gsInfo << "libc++ "<<_LIBCPP_VERSION <<")\n"; + return "libc++ "+util::to_string(_LIBCPP_VERSION); # elif defined(__GLIBCXX__) - gsInfo << "glibc++ "<< __GLIBCXX__ <<")\n"; + return "glibc++ "+util::to_string(__GLIBCXX__); # elif defined(__GLIBCPP__) - gsInfo << "glibc++ "<< __GLIBCPP__ <<")\n"; + return "glibc++ "+util::to_string(__GLIBCPP__); #elif defined(__LIBCOMO__) - gsInfo << "Comeau STL "<< __LIBCOMO__ <<")\n"; + return "Comeau STL "+util::to_string(__LIBCOMO__); # elif defined(__STL_CONFIG_H) - gsInfo << "SGI STL)\n"; + return "SGI STL"; # elif defined(__MSL_CPP__) - gsInfo << "MSL standard lib)\n"; + return "MSL standard lib"; # elif defined(__IBMCPP__) - gsInfo << "VACPP 
STL)\n"; + return "VACPP STL"; # elif defined(MSIPL_COMPILE_H) - gsInfo << "Modena C++ STL)\n"; + return "Modena C++ STL"; # elif (defined(_YVALS) && !defined(__IBMCPP__)) || defined(_CPPLIB_VER) - gsInfo << "Dinkumware STL "<< _CPPLIB_VER<<")\n"; + return "Dinkumware STL "+util::to_string(_CPPLIB_VER); # elif defined(__STD_RWCOMPILER_H__) || defined(_RWSTD_VER) - gsInfo << "Rogue Wave lib "<<_RWSTD_VER<<")\n"; + return "Rogue Wave lib "+util::to_string(_RWSTD_VER); #else - gsInfo << "Unknown-STD)\n"; + return "Unknown-STD"; #endif - //gsInfo << "Eigen "<< EIGEN_WORLD_VERSION<<"."< 0) { + if (CPUBrandString[size-1] == '\0') + size--; + CPUBrandString.resize(size); + return CPUBrandString; + } + +#elif __linux__ +# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) + + char CPUBrandString[0x40]; + unsigned int CPUInfo[4] = {0,0,0,0}; + + __cpuid(0x80000000, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); + unsigned int nExIds = CPUInfo[0]; + + memset(CPUBrandString, 0, sizeof(CPUBrandString)); + + for (unsigned int i = 0x80000000; i <= nExIds; ++i) + { + __cpuid(i, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); + + if (i == 0x80000002) + memcpy(CPUBrandString, CPUInfo, sizeof(CPUInfo)); + else if (i == 0x80000003) + memcpy(CPUBrandString + 16, CPUInfo, sizeof(CPUInfo)); + else if (i == 0x80000004) + memcpy(CPUBrandString + 32, CPUInfo, sizeof(CPUInfo)); + } + + return CPUBrandString; + +# endif +#elif __unix__ +#endif + + return "Unknown-CPU"; +} + +std::string gsCmdLine::getMemoryInfo() +{ + +#if defined(_WIN32) || defined(_WIN64) + + +#elif __APPLE__ + + int64_t memsize; + std::size_t size = sizeof(memsize); + + if (sysctlbyname("hw.memsize", &memsize, &size, NULL, 0) == 0) { + return util::to_string(memsize / 1024 / 1024)+" MB"; + } + +#elif __linux__ +# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) ) + + long pages = sysconf(_SC_PHYS_PAGES); + long page_size = 
sysconf(_SC_PAGE_SIZE); + return util::to_string(pages * page_size / 1024 / 1024)+" MB"; + +# endif +#elif __unix__ +#endif + + return "Unknown-Memory"; } std::string & gsCmdLine::getMessage() diff --git a/src/gsIO/gsCmdLine.h b/src/gsIO/gsCmdLine.h index a8cde98054..742fe66645 100644 --- a/src/gsIO/gsCmdLine.h +++ b/src/gsIO/gsCmdLine.h @@ -13,7 +13,20 @@ #pragma once +#if defined(_WIN32) || defined(_WIN64) +# include +#elif __APPLE__ +# include +# include +#elif __linux__ +# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) ) +# include +# endif +#elif __unix__ +#endif + #include +#include namespace gismo { @@ -247,6 +260,30 @@ class GISMO_EXPORT gsCmdLine /// Prints the version information static void printVersion(); + /// Returns the version of G+Smo + static std::string getGismoVersion(); + + /// Returns the version of Eigen + static std::string getEigenVersion(); + + /// Returns the version of the compiler + static std::string getCompilerVersion(); + + /// Returns the version of the C++ standard + static std::string getCppVersion(); + + /// Returns the version of the standard library + static std::string getStdLibVersion(); + + /// Returns the version of extra libraries + static std::string getExtraLibsVersion(); + + /// Returns CPU information + static std::string getCpuInfo(); + + /// Returns memory information + static std::string getMemoryInfo(); + /// Returns the program's description (as specified in the constructor) std::string& getMessage(); From acd04402d3daafd1aa1b4c724169d7ea20419775 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Fri, 3 Dec 2021 14:13:54 +0100 Subject: [PATCH 051/174] Updated performance benchmark application --- examples/performance_benchmark.cpp | 822 +++++++++++++++-------------- 1 file changed, 431 insertions(+), 391 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 0a8f871575..8f04c09e1c 100644 --- 
a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -13,293 +13,16 @@ //! [Include namespace] #include +#include #include +#include using namespace gismo; //! [Include namespace] -enum class benchmark_metric { - bandwidth_kb_sec, - bandwidth_mb_sec, - bandwidth_gb_sec, - bandwidth_tb_sec, - perf_kflop_sec, - perf_mflop_sec, - perf_gflop_sec, - perf_tflop_sec, - runtime_sec, -}; - -/** - * Benchmark: driver function - */ -template -std::vector< std::array > -benchmark_driver(const std::vector& nthreads, int nruns, T& benchmark, benchmark_metric metric) -{ - gsStopwatch stopwatch; - std::size_t benchmark_result; - double benchmark_metric, benchmark_runtime; - - std::vector< std::array > results; - - try { - for (auto it=nthreads.cbegin(); it!=nthreads.cend(); ++it) { - - omp_set_num_threads(*it); - benchmark_runtime = 0.0; - benchmark_metric = 0.0; - - for (int run=0; run(*it) /* number of OpenMP threads */, - benchmark_runtime/(double)nruns /* averaged elapsed time in seconds */, - benchmark_metric/(double)nruns /* averaged benchmark metric */, - (double)metric} /* benchmark metric */ ); - } - } catch(...) 
{} - - return results; -} - -/** - * Benchmark LaTeX output - */ -class benchmark_latex -{ -public: - /** - * Result set class - */ - class result_set - { - public: - result_set(const std::string& label, - const std::string& title, - const std::vector< std::array >& results) - : label(label), - title(title), - results(results) - { - } - - const std::string& get_label() const - { return label; } - - const std::string& get_title() const - { return title; } - - const std::vector< std::array >& get_results() const - { return results; } - - std::ostream &print(std::ostream &os) const - { - os << "\\pgfplotstableread[row sep=\\\\,col sep=&]{\n" - << "threads & " << label << " \\\\\n"; - - for (auto it=results.cbegin(); it!=results.cend(); ++it) - os << (*it)[0] << "&" << (*it)[2] << "\\\\\n"; - - os << "}\\data" << label << "\n"; - - return os; - } - - private: - const std::string label, title; - std::vector< std::array > results; - }; - - /** - * Benchmark set class - */ - class benchmark_set - { - public: - benchmark_set(const std::string& label, - const std::string& title) - : id('A'), - label(label), - title(title) - {} - - ~benchmark_set() - { - for (auto it=results.begin(); it!=results.end(); ++it) - delete (*it); - } - - void add_results(const std::string& label, - const std::string& title, - const std::vector< std::array >& results) - { - this->results.emplace_back(new result_set(label+std::string(1,id++), title, results)); - } - - const std::string& get_label() const - { return label; } - - const std::string& get_title() const - { return title; } - - const std::vector& get_results() const - { return results; } - - std::ostream &print(std::ostream &os) const - { - for (auto it=results.cbegin(); it!=results.cend(); ++it) - (*it)->print(os); - - os << "\\begin{tikzpicture}\n" - << "\\begin{axis}[\n" - << "name=MyAxis,\n" - << "width=\\textwidth,\n" - << "height=.5\\textwidth,\n" - << "legend pos=outer north east,\n" - - << "symbolic x coords={"; - - for (auto 
it=(*results.cbegin())->get_results().cbegin(); - it!=(*results.cbegin())->get_results().cend(); ++it) - os << (*it)[0] << (it!=(*results.cbegin())->get_results().cend()-1 ? "," : ""); - os << "},\n" - - << "xlabel={OpenMP threads},\n"; - - switch((benchmark_metric)(*(*results.cbegin())->get_results().cbegin())[4]) { - case benchmark_metric::bandwidth_kb_sec: - os << "ylabel={Bandwidth in KB/s},\n"; - break; - case benchmark_metric::bandwidth_mb_sec: - os << "ylabel={Bandwidth in MB/s},\n"; - break; - case benchmark_metric::bandwidth_gb_sec: - os << "ylabel={Bandwidth in GB/s},\n"; - break; - case benchmark_metric::bandwidth_tb_sec: - os << "ylabel={Bandwidth in TB/s},\n"; - break; - case benchmark_metric::perf_kflop_sec: - os << "ylabel={Berformance in kFLOP/s},\n"; - break; - case benchmark_metric::perf_mflop_sec: - os << "ylabel={Berformance in mFLOP/s},\n"; - break; - case benchmark_metric::perf_gflop_sec: - os << "ylabel={Berformance in gFLOP/s},\n"; - break; - case benchmark_metric::perf_tflop_sec: - os << "ylabel={Berformance in tFLOP/s},\n"; - break; - case benchmark_metric::runtime_sec: - os << "ylabel={Runtime in seconds},\n"; - break; - default: - throw std::runtime_error("Unsupported metric"); - } - - os << "title={" << title << "},\n" - << "]"; - - for (auto it=results.cbegin(); it!=results.cend(); ++it) - os << "\\addplot table[x=threads,y=" - << (*it)->get_label() - << "]{\\data" - << (*it)->get_label() - << "};\n"; - - os << "\\legend{"; - for (auto it=results.cbegin(); it!=results.cend(); ++it) - os << (*it)->get_title() << (it!=results.cend()-1 ? "," : ""); - os << "}\n" - - << "\\end{axis}\n" - - << "\\node[below right, align=left, text=black]\n" - << "at ($(MyAxis.south west)+(0,-30pt)$) {%\n" - << "G+Smo " << GISMO_VERSION - << ", Eigen " << EIGEN_WORLD_VERSION - << "." << EIGEN_MAJOR_VERSION - << "." 
<< EIGEN_MINOR_VERSION << "\\\\\n" - << "And another line of text here};\n" - - << "\\end{tikzpicture}\n"; - - return os; - } - - private: - char id; - const std::string label,title; - std::vector< result_set* > results; - }; - -public: - ~benchmark_latex() - { - for (auto it=benchmarks.begin(); it!=benchmarks.end(); ++it) - delete (*it); - } - - benchmark_set* add_benchmark(const std::string& label, - const std::string& title) - { - benchmarks.emplace_back(new benchmark_set(label, title)); - return benchmarks.back(); - } - - const std::vector< benchmark_set* >& get_benchmarks() const - { return benchmarks; } - - std::ostream &print(std::ostream &os) const - { - os << "\\documentclass[tikz]{standalone}\n" - << "\\usepackage{pgfplots}\n" - << "\\begin{document}\n" - << "\\usetikzlibrary{calc}\n"; - - for (auto it=benchmarks.cbegin(); it!=benchmarks.cend(); ++it) - (*it)->print(os); - - os << "\\end{document}\n"; - return os; - } - -private: - std::vector< benchmark_set* > benchmarks; -}; - -/// Print (as string) operator -std::ostream &operator<<(std::ostream &os, const benchmark_latex& obj) -{ return obj.print(os); } +//! 
[Implement benchmarks] /** * Benchmark: native C array memcopy */ @@ -465,134 +188,445 @@ class benchmark_c_array_dense_matmul m_y[i] = sum; } - // Needed to make sure the compiler does not eliminate this code block - T tmp = m_y[n-1]; - GISMO_UNUSED(tmp); + // Needed to make sure the compiler does not eliminate this code block + T tmp = m_y[n-1]; + GISMO_UNUSED(tmp); + + return sizeof(T) * (2*n*n + n); + } +}; + +/** + * Benchmark: Eigen vector memcopy + */ +template +class benchmark_eigen_vector_memcopy +{ +private: + std::size_t n; + gsVector x,y; + +public: + benchmark_eigen_vector_memcopy(std::size_t n) + : n(n), x(n), y(n) + { + x.fill((T)0.0); + } + + std::size_t operator()() + { + y.noalias() = x; + + // Needed to make sure the compiler does not eliminate this code block + T tmp = y[n-1]; + GISMO_UNUSED(tmp); + + return sizeof(T) * 2 * n; + } +}; + +/** + * Benchmark: Eigen vector dot-product + */ +template +class benchmark_eigen_vector_dotproduct +{ +private: + std::size_t n; + gsVector x, y; + +public: + benchmark_eigen_vector_dotproduct(std::size_t n) + : n(n), x(n), y(n) + { + x.fill((T)0.0); + y.fill((T)0.0); + } + + std::size_t operator()() + { + volatile T sum = y.dot(x); + GISMO_UNUSED(sum); + + return sizeof(T) * 2 * n; + } +}; + +/** + * Benchmark: Eigen vector AXPY + */ +template +class benchmark_eigen_vector_axpy +{ +private: + std::size_t n; + gsVector x, y, z; + +public: + benchmark_eigen_vector_axpy(std::size_t n) + : n(n), x(n), y(n), z(n) + { + x.fill((T)0.0); + y.fill((T)0.0); + } + + std::size_t operator()() + { + z.noalias() = (T)3.141*x + y; + + // Needed to make sure the compiler does not eliminate this code block + T tmp = z[n-1]; + GISMO_UNUSED(tmp); + + return sizeof(T) * 3 * n; + } +}; + +/** + * Benchmark: Eigen dense matrix-vector multiplication + */ +template +class benchmark_eigen_vector_dense_matmul +{ +private: + std::size_t n; + gsMatrix A; + gsVector x, y; + +public: + benchmark_eigen_vector_dense_matmul(std::size_t n) + : 
n(n), A(n,n), x(n), y(n) + { + A.fill(0.0); + x.fill(0.0); + } + + std::size_t operator()() + { + y.noalias() = A*x; + + // Needed to make sure the compiler does not eliminate this code block + T tmp = y[n-1]; + GISMO_UNUSED(tmp); + + return sizeof(T) * (2*n*n + n); + } +}; +//! [Implement benchmarks] + +//! [Implement benchmark infrastructure] + +/** + * Benchmark metrics + */ +enum class benchmark_metric { + bandwidth_kb_sec, + bandwidth_mb_sec, + bandwidth_gb_sec, + bandwidth_tb_sec, + perf_kflop_sec, + perf_mflop_sec, + perf_gflop_sec, + perf_tflop_sec, + runtime_sec, +}; + +/** + * Benchmark: driver function + */ +template +std::vector< std::array > +benchmark_driver(const std::vector& nthreads, int nruns, T& benchmark, benchmark_metric metric) +{ + gsStopwatch stopwatch; + std::size_t benchmark_result; + double benchmark_metric, benchmark_runtime; + + std::vector< std::array > results; + + try { + for (auto it=nthreads.cbegin(); it!=nthreads.cend(); ++it) { + + omp_set_num_threads(*it); + benchmark_runtime = 0.0; + benchmark_metric = 0.0; + + for (int run=0; run(*it) /* number of OpenMP threads */, + benchmark_runtime/(double)nruns /* averaged elapsed time in seconds */, + benchmark_metric/(double)nruns /* averaged benchmark metric */, + (double)metric} /* benchmark metric */ ); + } + } catch(...) 
{} + + return results; +} + +/** + * Benchmark LaTeX output + */ +class benchmark_latex +{ +public: + /** + * Result set class + */ + class result_set + { + public: + result_set(const std::string& label, + const std::string& title, + const std::vector< std::array >& results) + : label(label), + title(title), + results(results) + { + } + + const std::string& get_label() const + { return label; } + + const std::string& get_title() const + { return title; } + + const std::vector< std::array >& get_results() const + { return results; } + + std::ostream &print(std::ostream &os) const + { + os << "\\pgfplotstableread[row sep=\\\\,col sep=&]{\n" + << "threads & " << label << " \\\\\n"; + + for (auto it=results.cbegin(); it!=results.cend(); ++it) + os << (*it)[0] << "&" << (*it)[2] << "\\\\\n"; - return sizeof(T) * (2*n*n + n); - } -}; + os << "}\\data" << label << "\n"; -/** - * Benchmark: Eigen vector memcopy - */ -template -class benchmark_eigen_vector_memcopy -{ -private: - std::size_t n; - gsVector x,y; + return os; + } -public: - benchmark_eigen_vector_memcopy(std::size_t n) - : n(n), x(n), y(n) - { - x.fill((T)0.0); - } + private: + const std::string label, title; + std::vector< std::array > results; + }; - std::size_t operator()() + /** + * Benchmark set class + */ + class benchmark_set { - y.noalias() = x; - - // Needed to make sure the compiler does not eliminate this code block - T tmp = y[n-1]; - GISMO_UNUSED(tmp); + public: + benchmark_set(const std::string& label, + const std::string& title) + : id('A'), + label(label), + title(title) + {} - return sizeof(T) * 2 * n; - } -}; + ~benchmark_set() + { + for (auto it=results.begin(); it!=results.end(); ++it) + delete (*it); + } -/** - * Benchmark: Eigen vector dot-product - */ -template -class benchmark_eigen_vector_dotproduct -{ -private: - std::size_t n; - gsVector x, y; + void add_results(const std::string& label, + const std::string& title, + const std::vector< std::array >& results) + { + 
this->results.emplace_back(new result_set(label+std::string(1,id++), title, results)); + } -public: - benchmark_eigen_vector_dotproduct(std::size_t n) - : n(n), x(n), y(n) - { - x.fill((T)0.0); - y.fill((T)0.0); - } + const std::string& get_label() const + { return label; } + + const std::string& get_title() const + { return title; } - std::size_t operator()() - { - volatile T sum = y.dot(x); - GISMO_UNUSED(sum); + const std::vector& get_results() const + { return results; } - return sizeof(T) * 2 * n; - } -}; + std::ostream &print(std::ostream &os) const + { + for (auto it=results.cbegin(); it!=results.cend(); ++it) + (*it)->print(os); -/** - * Benchmark: Eigen vector AXPY - */ -template -class benchmark_eigen_vector_axpy -{ -private: - std::size_t n; - gsVector x, y, z; + os << "\\begin{tikzpicture}\n" + << "\\begin{axis}[\n" + << "name=MyAxis,\n" + << "width=\\textwidth,\n" + << "height=.5\\textwidth,\n" + << "legend pos=outer north east,\n" + + << "symbolic x coords={"; + + for (auto it=(*results.cbegin())->get_results().cbegin(); + it!=(*results.cbegin())->get_results().cend(); ++it) + os << (*it)[0] << (it!=(*results.cbegin())->get_results().cend()-1 ? 
"," : ""); + os << "},\n" + + << "xlabel={OpenMP threads},\n"; + + switch((benchmark_metric)(*(*results.cbegin())->get_results().cbegin())[4]) { + case benchmark_metric::bandwidth_kb_sec: + os << "ylabel={Bandwidth in KB/s},\n"; + break; + case benchmark_metric::bandwidth_mb_sec: + os << "ylabel={Bandwidth in MB/s},\n"; + break; + case benchmark_metric::bandwidth_gb_sec: + os << "ylabel={Bandwidth in GB/s},\n"; + break; + case benchmark_metric::bandwidth_tb_sec: + os << "ylabel={Bandwidth in TB/s},\n"; + break; + case benchmark_metric::perf_kflop_sec: + os << "ylabel={Berformance in kFLOP/s},\n"; + break; + case benchmark_metric::perf_mflop_sec: + os << "ylabel={Berformance in mFLOP/s},\n"; + break; + case benchmark_metric::perf_gflop_sec: + os << "ylabel={Berformance in gFLOP/s},\n"; + break; + case benchmark_metric::perf_tflop_sec: + os << "ylabel={Berformance in tFLOP/s},\n"; + break; + case benchmark_metric::runtime_sec: + os << "ylabel={Runtime in seconds},\n"; + break; + default: + throw std::runtime_error("Unsupported metric"); + } + + os << "title={" << title << "},\n" + << "]"; -public: - benchmark_eigen_vector_axpy(std::size_t n) - : n(n), x(n), y(n), z(n) - { - x.fill((T)0.0); - y.fill((T)0.0); - } + for (auto it=results.cbegin(); it!=results.cend(); ++it) + os << "\\addplot table[x=threads,y=" + << (*it)->get_label() + << "]{\\data" + << (*it)->get_label() + << "};\n"; - std::size_t operator()() - { - z.noalias() = (T)3.141*x + y; + os << "\\legend{"; + for (auto it=results.cbegin(); it!=results.cend(); ++it) + os << (*it)->get_title() << (it!=results.cend()-1 ? 
"," : ""); + os << "}\n" + + << "\\end{axis}\n" - // Needed to make sure the compiler does not eliminate this code block - T tmp = z[n-1]; - GISMO_UNUSED(tmp); + << "\\path let \\p1=(MyAxis.west), \\p2=(MyAxis.east) in " + << "node[below right, align=left, text=black, text width=\\x2-\\x1]\n" + << "at ($(MyAxis.south west)+(0,-30pt)$) {%\n" + << "G+Smo " << gsCmdLine::getGismoVersion() + << ", Eigen " << gsCmdLine::getEigenVersion() + << " (" << gsCmdLine::getCompilerVersion() + << ", " << gsCmdLine::getCppVersion() + << ", " << gsCmdLine::getStdLibVersion() + << (gsCmdLine::getExtraLibsVersion().empty() + ? "), \n" + : gsCmdLine::getExtraLibsVersion()+"), \n") + + << "CPU " << gsCmdLine::getCpuInfo() << ", " + << "Memory " << gsCmdLine::getMemoryInfo() << ", "; + + gsJITCompilerConfig jit; jit.load("config/jit.xml"); + std::string flags = jit.getFlags(); + os << "Compiler flags "; + + for (auto token=strtok(&flags[0], " "); token!=NULL; token=strtok(NULL, " ")) { + if (token[0]=='-') { + if (token[1]=='I' || token[1]=='L' || token[1]=='l' || token[1]=='W') + continue; + os << "\\verb!" << token << "! 
"; + } + } + + os << "};\n" + << "\\end{tikzpicture}\n"; - return sizeof(T) * 3 * n; - } -}; + return os; + } -/** - * Benchmark: Eigen dense matrix-vector multiplication - */ -template -class benchmark_eigen_vector_dense_matmul -{ -private: - std::size_t n; - gsMatrix A; - gsVector x, y; + private: + char id; + const std::string label,title; + std::vector< result_set* > results; + }; public: - benchmark_eigen_vector_dense_matmul(std::size_t n) - : n(n), A(n,n), x(n), y(n) + ~benchmark_latex() + { + for (auto it=benchmarks.begin(); it!=benchmarks.end(); ++it) + delete (*it); + } + + benchmark_set* add_benchmark(const std::string& label, + const std::string& title) { - A.fill(0.0); - x.fill(0.0); + benchmarks.emplace_back(new benchmark_set(label, title)); + return benchmarks.back(); } - std::size_t operator()() + const std::vector< benchmark_set* >& get_benchmarks() const + { return benchmarks; } + + std::ostream &print(std::ostream &os) const { - y.noalias() = A*x; + os << "\\documentclass[tikz]{standalone}\n" + << "\\usepackage{pgfplots}\n" + << "\\usepackage{verbatim}\n" + << "\\begin{document}\n" + << "\\usetikzlibrary{calc}\n"; - // Needed to make sure the compiler does not eliminate this code block - T tmp = y[n-1]; - GISMO_UNUSED(tmp); + for (auto it=benchmarks.cbegin(); it!=benchmarks.cend(); ++it) + (*it)->print(os); - return sizeof(T) * (2*n*n + n); + os << "\\end{document}\n"; + return os; } + +private: + std::vector< benchmark_set* > benchmarks; }; +/// Print (as string) operator +std::ostream &operator<<(std::ostream &os, const benchmark_latex& obj) +{ return obj.print(os); } +//! [Implement benchmark infrastructure] + + int main(int argc, char *argv[]) { //! 
[Parse command line] + benchmark_latex latex; std::vector nthreads, ssizes, dsizes, vsizes; std::string fn; int nruns=1; @@ -608,7 +642,9 @@ int main(int argc, char *argv[]) cmd.addString("o", "output", "Name of the output file", fn); try { cmd.getValues(argc,argv); } catch (int rv) { return rv; } + //! [Parse command line] + //! [Default configuration] // If empty fill with 1, 2, 4, ..., maximum number of OpenMP threads if (nthreads.empty()) { for(int i=1; i<=omp_get_max_threads(); i*=2) @@ -640,9 +676,9 @@ int main(int argc, char *argv[]) vsizes.push_back(1e5); vsizes.push_back(1e6); } + //! [Default configuration] - benchmark_latex latex; - + //! [Execute benchmarks] { auto bm = latex.add_benchmark("memcopy", "memory copy"); { @@ -653,7 +689,7 @@ int main(int argc, char *argv[]) benchmark_c_array_memcopy benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("nativememcopy", - "native("+std::to_string(*it)+")", + "native("+util::to_string((double)*it,0)+")", results); } catch(...) { gsInfo << "failed!"; } } @@ -667,7 +703,7 @@ int main(int argc, char *argv[]) benchmark_eigen_vector_memcopy benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("eigenmemcopy", - "eigen("+std::to_string(*it)+")", + "eigen("+util::to_string((double)*it,0)+")", results); } catch(...) { gsInfo << "failed!"; } } @@ -684,7 +720,7 @@ int main(int argc, char *argv[]) benchmark_c_array_dotproduct benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("nativedotproduct", - "native("+std::to_string(*it)+")", + "native("+util::to_string((double)*it,0)+")", results); } catch(...) 
{ gsInfo << "failed!"; } } @@ -698,7 +734,7 @@ int main(int argc, char *argv[]) benchmark_eigen_vector_dotproduct benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("eigendotproduct", - "eigen("+std::to_string(*it)+")", + "eigen("+util::to_string((double)*it,0)+")", results); } catch(...) { gsInfo << "failed!"; } } @@ -715,7 +751,7 @@ int main(int argc, char *argv[]) benchmark_c_array_axpy benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("nativeaxpy", - "native("+std::to_string(*it)+")", + "native("+util::to_string((double)*it,0)+")", results); } catch(...) { gsInfo << "failed!"; } } @@ -729,7 +765,7 @@ int main(int argc, char *argv[]) benchmark_eigen_vector_axpy benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("eigenaxpy", - "eigen("+std::to_string(*it)+")", + "eigen("+util::to_string((double)*it,0)+")", results); } catch(...) { gsInfo << "failed!"; } } @@ -746,7 +782,7 @@ int main(int argc, char *argv[]) benchmark_c_array_dense_matmul benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("nativdensemvmul", - "native("+std::to_string(*it)+")", + "native("+util::to_string((double)*it,0)+")", results); } catch(...) { gsInfo << "failed!"; } } @@ -760,18 +796,22 @@ int main(int argc, char *argv[]) benchmark_eigen_vector_dense_matmul benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("eigenmvmul", - "eigen("+std::to_string(*it)+")", + "eigen("+util::to_string((double)*it,0)+")", results); } catch(...) 
{ gsInfo << "failed!"; } } } } - + if (fn.empty()) gsInfo << latex << "\n"; else { - //gsFileData<> fd; fd << latex << "\n"; fd.save(fn); + std::ofstream file; + file.open(fn); + file << latex << "\n"; + file.close(); } - + //! [Execute benchmarks] + return EXIT_SUCCESS; } From 09ff7bf819b4e59f02522b8a622b911a8fda1ccb Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Fri, 3 Dec 2021 14:44:24 +0100 Subject: [PATCH 052/174] Updated performance benchmark application --- examples/performance_benchmark.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 8f04c09e1c..6fbb0cecd1 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -478,7 +478,7 @@ class benchmark_latex (*it)->print(os); os << "\\begin{tikzpicture}\n" - << "\\begin{axis}[\n" + << "\\begin{semilogyaxis}[\n" << "name=MyAxis,\n" << "width=\\textwidth,\n" << "height=.5\\textwidth,\n" @@ -540,7 +540,7 @@ class benchmark_latex os << (*it)->get_title() << (it!=results.cend()-1 ? "," : ""); os << "}\n" - << "\\end{axis}\n" + << "\\end{semilogyaxis}\n" << "\\path let \\p1=(MyAxis.west), \\p2=(MyAxis.east) in " << "node[below right, align=left, text=black, text width=\\x2-\\x1]\n" @@ -675,6 +675,8 @@ int main(int argc, char *argv[]) vsizes.push_back(1e4); vsizes.push_back(1e5); vsizes.push_back(1e6); + vsizes.push_back(1e7); + vsizes.push_back(1e8); } //! [Default configuration] @@ -689,7 +691,7 @@ int main(int argc, char *argv[]) benchmark_c_array_memcopy benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("nativememcopy", - "native("+util::to_string((double)*it,0)+")", + "native("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", results); } catch(...) 
{ gsInfo << "failed!"; } } @@ -703,7 +705,7 @@ int main(int argc, char *argv[]) benchmark_eigen_vector_memcopy benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("eigenmemcopy", - "eigen("+util::to_string((double)*it,0)+")", + "eigen("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", results); } catch(...) { gsInfo << "failed!"; } } @@ -720,7 +722,7 @@ int main(int argc, char *argv[]) benchmark_c_array_dotproduct benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("nativedotproduct", - "native("+util::to_string((double)*it,0)+")", + "native("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", results); } catch(...) { gsInfo << "failed!"; } } @@ -734,7 +736,7 @@ int main(int argc, char *argv[]) benchmark_eigen_vector_dotproduct benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("eigendotproduct", - "eigen("+util::to_string((double)*it,0)+")", + "eigen("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", results); } catch(...) { gsInfo << "failed!"; } } @@ -751,7 +753,7 @@ int main(int argc, char *argv[]) benchmark_c_array_axpy benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("nativeaxpy", - "native("+util::to_string((double)*it,0)+")", + "native("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", results); } catch(...) 
{ gsInfo << "failed!"; } } @@ -765,7 +767,7 @@ int main(int argc, char *argv[]) benchmark_eigen_vector_axpy benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("eigenaxpy", - "eigen("+util::to_string((double)*it,0)+")", + "eigen("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", results); } catch(...) { gsInfo << "failed!"; } } @@ -782,7 +784,7 @@ int main(int argc, char *argv[]) benchmark_c_array_dense_matmul benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("nativdensemvmul", - "native("+util::to_string((double)*it,0)+")", + "native("+util::to_string(std::pow(sizeof(double)*(double)*it / 1024 / 1024, 2), 0)+" MB)", results); } catch(...) { gsInfo << "failed!"; } } @@ -796,7 +798,7 @@ int main(int argc, char *argv[]) benchmark_eigen_vector_dense_matmul benchmark(*it); auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); bm->add_results("eigenmvmul", - "eigen("+util::to_string((double)*it,0)+")", + "eigen("+util::to_string(std::pow(sizeof(double)*(double)*it / 1024 / 1024, 2), 0)+" MB)", results); } catch(...) { gsInfo << "failed!"; } } From 8fcc1c61115877c5174d8bc83496d90605902c73 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Fri, 3 Dec 2021 16:33:15 +0100 Subject: [PATCH 053/174] Fixed small bugs in getCpuInfo and getMemoryInfo under Linux/x86_64 --- src/gsIO/gsCmdLine.cpp | 4 +++- src/gsIO/gsCmdLine.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/gsIO/gsCmdLine.cpp b/src/gsIO/gsCmdLine.cpp index ac5decea57..c46afaf6f4 100644 --- a/src/gsIO/gsCmdLine.cpp +++ b/src/gsIO/gsCmdLine.cpp @@ -429,6 +429,8 @@ void gsCmdLine::printVersion() << ", " << getStdLibVersion() << ", eigen " << getEigenVersion() << (getExtraLibsVersion().empty() ? 
")\n" : getExtraLibsVersion()+")\n"); + gsInfo << "Running on " << getCpuInfo() + << " (memory " << getMemoryInfo() << ")\n"; gsInfo << "web: http://github.com/gismo\n"; } @@ -943,7 +945,7 @@ std::string gsCmdLine::getCpuInfo() } #elif __linux__ -# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) +# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) ) char CPUBrandString[0x40]; unsigned int CPUInfo[4] = {0,0,0,0}; diff --git a/src/gsIO/gsCmdLine.h b/src/gsIO/gsCmdLine.h index 742fe66645..cfe1bcb011 100644 --- a/src/gsIO/gsCmdLine.h +++ b/src/gsIO/gsCmdLine.h @@ -21,6 +21,7 @@ #elif __linux__ # if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) ) # include +# include # endif #elif __unix__ #endif From 349c3cd9004d04bbfed7c3be6e360c380f9972e1 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Sun, 5 Dec 2021 11:16:28 +0100 Subject: [PATCH 054/174] Moved system information functionality from gsCmpLine into gsSysInfo --- src/gsCore/gsJITCompiler.h | 5 +- src/gsCore/gsSysInfo.cpp | 590 +++++++++++++++++++++++++++++++++++++ src/gsCore/gsSysInfo.h | 64 ++++ src/gsIO/gsCmdLine.cpp | 589 +----------------------------------- src/gsIO/gsCmdLine.h | 38 --- 5 files changed, 668 insertions(+), 618 deletions(-) create mode 100644 src/gsCore/gsSysInfo.cpp create mode 100644 src/gsCore/gsSysInfo.h diff --git a/src/gsCore/gsJITCompiler.h b/src/gsCore/gsJITCompiler.h index af7f18ccae..ca2371dcb9 100644 --- a/src/gsCore/gsJITCompiler.h +++ b/src/gsCore/gsJITCompiler.h @@ -16,8 +16,9 @@ #pragma once -#include +#include #include +#include #if defined(_WIN32) #include @@ -27,6 +28,8 @@ #include +#include + namespace gismo { /** diff --git a/src/gsCore/gsSysInfo.cpp b/src/gsCore/gsSysInfo.cpp new file mode 100644 index 0000000000..5bd2384459 --- /dev/null +++ b/src/gsCore/gsSysInfo.cpp @@ -0,0 +1,590 @@ +/** @file gsSysInfo.cpp + + @brief Provides 
implementation of system information. + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#include + +namespace gismo +{ + + std::string gsSysInfo::getGismoVersion() + { + return util::to_string(GISMO_VERSION); + } + + std::string gsSysInfo::getEigenVersion() + { + return util::to_string(EIGEN_WORLD_VERSION)+"." + + util::to_string(EIGEN_MAJOR_VERSION)+"." + + util::to_string(EIGEN_MINOR_VERSION); + } + + std::string gsSysInfo::getCompilerVersion() + { + // This code is copied from the CMakeCXXCompilerId.cpp file that was + // automatically generated with CMake 3.21.4 + + // The following two macros have been modified as we do not want to + // return the compiler version in the specific CMake format +#define DEC(n) n +#define HEX(n) n + + /* Version number components: V=Version, R=Revision, P=Patch + Version date components: YYYY=Year, MM=Month, DD=Day */ + +#if defined(__COMO__) +# define COMPILER_ID "Comeau" + /* __COMO_VERSION__ = VRR */ +# define COMPILER_VERSION_MAJOR DEC(__COMO_VERSION__ / 100) +# define COMPILER_VERSION_MINOR DEC(__COMO_VERSION__ % 100) + +#elif defined(__INTEL_COMPILER) || defined(__ICC) +# define COMPILER_ID "Intel" +# if defined(_MSC_VER) +# define SIMULATE_ID "MSVC" +# endif +# if defined(__GNUC__) +# define SIMULATE_ID "GNU" +# endif + /* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later, + except that a few beta releases use the old format with V=2021.
*/ +# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111 +# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100) +# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10) +# if defined(__INTEL_COMPILER_UPDATE) +# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE) +# else +# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER % 10) +# endif +# else +# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER) +# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE) + /* The third version component from --version is an update index, + but no macro is provided for it. */ +# define COMPILER_VERSION_PATCH DEC(0) +# endif +# if defined(__INTEL_COMPILER_BUILD_DATE) + /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */ +# define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE) +# endif +# if defined(_MSC_VER) + /* _MSC_VER = VVRR */ +# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) +# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) +# endif +# if defined(__GNUC__) +# define SIMULATE_VERSION_MAJOR DEC(__GNUC__) +# elif defined(__GNUG__) +# define SIMULATE_VERSION_MAJOR DEC(__GNUG__) +# endif +# if defined(__GNUC_MINOR__) +# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__) +# endif +# if defined(__GNUC_PATCHLEVEL__) +# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__) +# endif + +#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER) +# define COMPILER_ID "IntelLLVM" +#if defined(_MSC_VER) +# define SIMULATE_ID "MSVC" +#endif +#if defined(__GNUC__) +# define SIMULATE_ID "GNU" +#endif + /* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and + * later. Look for 6 digit vs. 8 digit version number to decide encoding. + * VVVV is no smaller than the current year when a version is released. 
+ */ +#if __INTEL_LLVM_COMPILER < 1000000L +# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100) +# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10) +# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 10) +#else +# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000) +# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100) +# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 100) +#endif +#if defined(_MSC_VER) + /* _MSC_VER = VVRR */ +# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) +# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) +#endif +#if defined(__GNUC__) +# define SIMULATE_VERSION_MAJOR DEC(__GNUC__) +#elif defined(__GNUG__) +# define SIMULATE_VERSION_MAJOR DEC(__GNUG__) +#endif +#if defined(__GNUC_MINOR__) +# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__) +#endif +#if defined(__GNUC_PATCHLEVEL__) +# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__) +#endif + +#elif defined(__PATHCC__) +# define COMPILER_ID "PathScale" +# define COMPILER_VERSION_MAJOR DEC(__PATHCC__) +# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__) +# if defined(__PATHCC_PATCHLEVEL__) +# define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__) +# endif + +#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__) +# define COMPILER_ID "Embarcadero" +# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF) +# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF) +# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__ & 0xFFFF) + +#elif defined(__BORLANDC__) +# define COMPILER_ID "Borland" + /* __BORLANDC__ = 0xVRR */ +# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8) +# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF) + +#elif defined(__WATCOMC__) && __WATCOMC__ < 1200 +# define COMPILER_ID "Watcom" + /* __WATCOMC__ = VVRR */ +# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100) +# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10) +# if (__WATCOMC__ % 
10) > 0 +# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10) +# endif + +#elif defined(__WATCOMC__) +# define COMPILER_ID "OpenWatcom" + /* __WATCOMC__ = VVRP + 1100 */ +# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100) +# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10) +# if (__WATCOMC__ % 10) > 0 +# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10) +# endif + +#elif defined(__SUNPRO_CC) +# define COMPILER_ID "SunPro" +# if __SUNPRO_CC >= 0x5100 + /* __SUNPRO_CC = 0xVRRP */ +# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>12) +# define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xFF) +# define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC & 0xF) +# else + /* __SUNPRO_CC = 0xVRP */ +# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>8) +# define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xF) +# define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC & 0xF) +# endif + +#elif defined(__HP_aCC) +# define COMPILER_ID "HP" + /* __HP_aCC = VVRRPP */ +# define COMPILER_VERSION_MAJOR DEC(__HP_aCC/10000) +# define COMPILER_VERSION_MINOR DEC(__HP_aCC/100 % 100) +# define COMPILER_VERSION_PATCH DEC(__HP_aCC % 100) + +#elif defined(__DECCXX) +# define COMPILER_ID "Compaq" + /* __DECCXX_VER = VVRRTPPPP */ +# define COMPILER_VERSION_MAJOR DEC(__DECCXX_VER/10000000) +# define COMPILER_VERSION_MINOR DEC(__DECCXX_VER/100000 % 100) +# define COMPILER_VERSION_PATCH DEC(__DECCXX_VER % 10000) + +#elif defined(__IBMCPP__) && defined(__COMPILER_VER__) +# define COMPILER_ID "zOS" + /* __IBMCPP__ = VRP */ +# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100) +# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10) +# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10) + +#elif defined(__ibmxl__) && defined(__clang__) +# define COMPILER_ID "XLClang" +# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__) +# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__) +# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__) +# define COMPILER_VERSION_TWEAK 
DEC(__ibmxl_ptf_fix_level__) + + +#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ >= 800 +# define COMPILER_ID "XL" + /* __IBMCPP__ = VRP */ +# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100) +# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10) +# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10) + +#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ < 800 +# define COMPILER_ID "VisualAge" + /* __IBMCPP__ = VRP */ +# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100) +# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10) +# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10) + +#elif defined(__NVCOMPILER) +# define COMPILER_ID "NVHPC" +# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__) +# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__) +# if defined(__NVCOMPILER_PATCHLEVEL__) +# define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__) +# endif + +#elif defined(__PGI) +# define COMPILER_ID "PGI" +# define COMPILER_VERSION_MAJOR DEC(__PGIC__) +# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__) +# if defined(__PGIC_PATCHLEVEL__) +# define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__) +# endif + +#elif defined(_CRAYC) +# define COMPILER_ID "Cray" +# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR) +# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR) + +#elif defined(__TI_COMPILER_VERSION__) +# define COMPILER_ID "TI" + /* __TI_COMPILER_VERSION__ = VVVRRRPPP */ +# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000) +# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000 % 1000) +# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__ % 1000) + +#elif defined(__CLANG_FUJITSU) +# define COMPILER_ID "FujitsuClang" +# define COMPILER_VERSION_MAJOR DEC(__FCC_major__) +# define COMPILER_VERSION_MINOR DEC(__FCC_minor__) +# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__) +# define COMPILER_VERSION_INTERNAL_STR __clang_version__ + + +#elif defined(__FUJITSU) +# define 
COMPILER_ID "Fujitsu" +# if defined(__FCC_version__) +# define COMPILER_VERSION __FCC_version__ +# elif defined(__FCC_major__) +# define COMPILER_VERSION_MAJOR DEC(__FCC_major__) +# define COMPILER_VERSION_MINOR DEC(__FCC_minor__) +# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__) +# endif +# if defined(__fcc_version) +# define COMPILER_VERSION_INTERNAL DEC(__fcc_version) +# elif defined(__FCC_VERSION) +# define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION) +# endif + + +#elif defined(__ghs__) +# define COMPILER_ID "GHS" + /* __GHS_VERSION_NUMBER = VVVVRP */ +# ifdef __GHS_VERSION_NUMBER +# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100) +# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10) +# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER % 10) +# endif + +#elif defined(__SCO_VERSION__) +# define COMPILER_ID "SCO" + +#elif defined(__ARMCC_VERSION) && !defined(__clang__) +# define COMPILER_ID "ARMCC" +#if __ARMCC_VERSION >= 1000000 + /* __ARMCC_VERSION = VRRPPPP */ +# define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000) +# define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100) +# define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000) +#else + /* __ARMCC_VERSION = VRPPPP */ +# define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000) +# define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10) +# define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000) +#endif + + +#elif defined(__clang__) && defined(__apple_build_version__) +# define COMPILER_ID "AppleClang" +# if defined(_MSC_VER) +# define SIMULATE_ID "MSVC" +# endif +# define COMPILER_VERSION_MAJOR DEC(__clang_major__) +# define COMPILER_VERSION_MINOR DEC(__clang_minor__) +# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__) +# if defined(_MSC_VER) + /* _MSC_VER = VVRR */ +# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) +# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) +# endif +# define COMPILER_VERSION_TWEAK 
DEC(__apple_build_version__) + +#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION) +# define COMPILER_ID "ARMClang" +# define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000) +# define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100) +# define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION % 10000) +# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION) + +#elif defined(__clang__) +# define COMPILER_ID "Clang" +# if defined(_MSC_VER) +# define SIMULATE_ID "MSVC" +# endif +# define COMPILER_VERSION_MAJOR DEC(__clang_major__) +# define COMPILER_VERSION_MINOR DEC(__clang_minor__) +# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__) +# if defined(_MSC_VER) + /* _MSC_VER = VVRR */ +# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) +# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) +# endif + +#elif defined(__GNUC__) || defined(__GNUG__) +# define COMPILER_ID "GNU" +# if defined(__GNUC__) +# define COMPILER_VERSION_MAJOR DEC(__GNUC__) +# else +# define COMPILER_VERSION_MAJOR DEC(__GNUG__) +# endif +# if defined(__GNUC_MINOR__) +# define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__) +# endif +# if defined(__GNUC_PATCHLEVEL__) +# define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__) +# endif + +#elif defined(_MSC_VER) +# define COMPILER_ID "MSVC" + /* _MSC_VER = VVRR */ +# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100) +# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100) +# if defined(_MSC_FULL_VER) +# if _MSC_VER >= 1400 + /* _MSC_FULL_VER = VVRRPPPPP */ +# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000) +# else + /* _MSC_FULL_VER = VVRRPPPP */ +# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000) +# endif +# endif +# if defined(_MSC_BUILD) +# define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD) +# endif + +#elif defined(__VISUALDSPVERSION__) || defined(__ADSPBLACKFIN__) || defined(__ADSPTS__) || defined(__ADSP21000__) +# define COMPILER_ID "ADSP" +#if defined(__VISUALDSPVERSION__) + /* __VISUALDSPVERSION__ = 
0xVVRRPP00 */ +# define COMPILER_VERSION_MAJOR HEX(__VISUALDSPVERSION__>>24) +# define COMPILER_VERSION_MINOR HEX(__VISUALDSPVERSION__>>16 & 0xFF) +# define COMPILER_VERSION_PATCH HEX(__VISUALDSPVERSION__>>8 & 0xFF) +#endif + +#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC) +# define COMPILER_ID "IAR" +# if defined(__VER__) && defined(__ICCARM__) +# define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000) +# define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000) +# define COMPILER_VERSION_PATCH DEC((__VER__) % 1000) +# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__) +# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__)) +# define COMPILER_VERSION_MAJOR DEC((__VER__) / 100) +# define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100)) +# define COMPILER_VERSION_PATCH DEC(__SUBVERSION__) +# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__) +# endif + + + /* These compilers are either not known or too old to define an + identification macro. Try to identify the platform and guess that + it is the native compiler. 
*/ +#elif defined(__hpux) || defined(__hpua) +# define COMPILER_ID "HP" + +#else /* unknown compiler */ +# define COMPILER_ID "Unknown-Compiler" +#endif + + return util::to_string(COMPILER_ID) +#ifdef COMPILER_VERSION + +" "+util::to_string(COMPILER_VERSION); +#elif defined(COMPILER_VERSION_MAJOR) + +" "+util::to_string(COMPILER_VERSION_MAJOR) +# ifdef COMPILER_VERSION_MINOR + +"."+util::to_string(COMPILER_VERSION_MINOR) +# ifdef COMPILER_VERSION_PATCH + +"."+util::to_string(COMPILER_VERSION_PATCH) +# ifdef COMPILER_VERSION_TWEAK + +"."+util::to_string(COMPILER_VERSION_TWEAK) +# endif +# endif +# endif + ; +#endif + +#undef DEC +#undef HEX +#undef COMPILER_ID +#undef COMPILER_VERSION +#undef COMPILER_VERSION_MAJOR +#undef COMPILER_VERSION_MINOR +#undef COMPILER_VERSION_PATCH +#undef COMPILER_VERSION_TWEAK +#undef SIMULATE_VERSION_MAJOR +#undef SIMULATE_VERSION_MINOR +#undef SIMULATE_VERSION_PATCH +#undef SIMULATE_VERSION_TWEAK + } + + std::string gsSysInfo::getCppVersion() + { +#if defined(_MSC_VER) && _MSC_VER < 1600 + return "C++ 199711L"; +#elsif _MSC_VER >= 1900 + return "C++ "+util::to_string(_MSVC_LANG); +#elsif _MSC_VER >= 1600 + return "C++ 201103L"; +#else + return "C++ "+util::to_string(__cplusplus); +#endif + } + + std::string gsSysInfo::getStdLibVersion() + { +#ifdef _LIBCPP_VERSION + return "libc++ "+util::to_string(_LIBCPP_VERSION); +# elif defined(__GLIBCXX__) + return "glibc++ "+util::to_string(__GLIBCXX__); +# elif defined(__GLIBCPP__) + return "glibc++ "+util::to_string(__GLIBCPP__); +#elif defined(__LIBCOMO__) + return "Comeau STL "+util::to_string(__LIBCOMO__); +# elif defined(__STL_CONFIG_H) + return "SGI STL"; +# elif defined(__MSL_CPP__) + return "MSL standard lib"; +# elif defined(__IBMCPP__) + return "VACPP STL"; +# elif defined(MSIPL_COMPILE_H) + return "Modena C++ STL"; +# elif (defined(_YVALS) && !defined(__IBMCPP__)) || defined(_CPPLIB_VER) + return "Dinkumware STL "+util::to_string(_CPPLIB_VER); +# elif defined(__STD_RWCOMPILER_H__) || 
defined(_RWSTD_VER) + return "Rogue Wave lib "+util::to_string(_RWSTD_VER); +#else + return "Unknown-STD"; +#endif + } + + std::string gsSysInfo::getExtraLibsVersion() + { + std::string s(""); + +#ifdef __INTEL_MKL__ + s += "MKL "+INTEL_MKL_VERSION; +#endif + + return s; + } + + std::string gsSysInfo::getCpuInfo() + { +#if defined(_WIN32) || defined(_WIN64) + + int CPUInfo[4] = {-1}; + unsigned nExIds, i = 0; + char CPUBrandString[0x40]; + + __cpuid(CPUInfo, 0x80000000); + nExIds = CPUInfo[0]; + + for (i=0x80000000; i<=nExIds; ++i) { + __cpuid(CPUInfo, i); + if (i == 0x80000002) + memcpy(CPUBrandString, CPUInfo, sizeof(CPUInfo)); + else if (i == 0x80000003) + memcpy(CPUBrandString + 16, CPUInfo, sizeof(CPUInfo)); + else if (i == 0x80000004) + memcpy(CPUBrandString + 32, CPUInfo, sizeof(CPUInfo)); + } + + return CPUBrandString; + +#elif __APPLE__ + + std::string CPUBrandString; + std::size_t size = 32; + + // Supply an oversized buffer, and avoid an extra call to sysctlbyname. + CPUBrandString.resize(size); + if (sysctlbyname("machdep.cpu.brand_string", &CPUBrandString[0], &size, NULL, 0) == 0 && size > 0) { + if (CPUBrandString[size-1] == '\0') + size--; + CPUBrandString.resize(size); + return CPUBrandString; + } + +#elif __linux__ +# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) ) + + char CPUBrandString[0x40]; + unsigned int CPUInfo[4] = {0,0,0,0}; + + __cpuid(0x80000000, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); + unsigned int nExIds = CPUInfo[0]; + + memset(CPUBrandString, 0, sizeof(CPUBrandString)); + + for (unsigned int i = 0x80000000; i <= nExIds; ++i) + { + __cpuid(i, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); + + if (i == 0x80000002) + memcpy(CPUBrandString, CPUInfo, sizeof(CPUInfo)); + else if (i == 0x80000003) + memcpy(CPUBrandString + 16, CPUInfo, sizeof(CPUInfo)); + else if (i == 0x80000004) + memcpy(CPUBrandString + 32, CPUInfo, sizeof(CPUInfo)); + } + + return CPUBrandString; + +# endif 
+#elif __unix__
+#endif
+
+    return "Unknown-CPU";
+  }
+
+  /// Returns the amount of physical memory as "<n> MB": the "hw.memsize"
+  /// sysctl on macOS, page count times page size on x86_64 Linux. Falls back
+  /// to "Unknown-Memory" on other platforms or when the query fails.
+  std::string gsSysInfo::getMemoryInfo()
+  {
+
+#if defined(_WIN32) || defined(_WIN64)
+
+    // No query implemented for Windows yet
+
+#elif __APPLE__
+
+    int64_t memsize;
+    std::size_t size = sizeof(memsize);
+
+    // hw.memsize is reported in bytes
+    if (sysctlbyname("hw.memsize", &memsize, &size, NULL, 0) == 0) {
+      return util::to_string(memsize / 1024 / 1024)+" MB";
+    }
+
+#elif __linux__
+#  if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) )
+
+    const long pages     = sysconf(_SC_PHYS_PAGES);
+    const long page_size = sysconf(_SC_PAGE_SIZE);
+
+    // sysconf() signals failure with -1; report the size only when both
+    // values are valid and otherwise fall through to "Unknown-Memory"
+    if (pages > 0 && page_size > 0)
+      return util::to_string(pages * page_size / 1024 / 1024)+" MB";
+
+#  endif
+#elif __unix__
+#endif
+
+    return "Unknown-Memory";
+  }
+
+} // namespace gismo
diff --git a/src/gsCore/gsSysInfo.h b/src/gsCore/gsSysInfo.h
new file mode 100644
index 0000000000..68a33b6bd7
--- /dev/null
+++ b/src/gsCore/gsSysInfo.h
@@ -0,0 +1,64 @@
+/** @file gsSysInfo.h
+
+    @brief Provides system information.
+
+    This file is part of the G+Smo library.
+
+    This Source Code Form is subject to the terms of the Mozilla Public
+    License, v. 2.0. If a copy of the MPL was not distributed with this
+    file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+    Author(s): M.
Moller +*/ + +#pragma once + +#include +#include + +#if defined(_WIN32) || defined(_WIN64) +# include +#elif __APPLE__ +# include +# include +#elif __linux__ +# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) ) +# include +# include +# endif +#elif __unix__ +#endif + +namespace gismo +{ + + class GISMO_EXPORT gsSysInfo + { + public: + + /// Returns the version of G+Smo (GISMO_VERSION) as a string + static std::string getGismoVersion(); + + /// Returns the version of Eigen as "world.major.minor" + static std::string getEigenVersion(); + + /// Returns the compiler name followed by as much of its version (major.minor.patch.tweak) as the compiler exposes + static std::string getCompilerVersion(); + + /// Returns the C++ standard the library was compiled with, prefixed by "C++ " + static std::string getCppVersion(); + + /// Returns the name and, where available, version of the C++ standard library, or "Unknown-STD" + static std::string getStdLibVersion(); + + /// Returns the versions of optional extra libraries (currently only Intel MKL); empty if none is used + static std::string getExtraLibsVersion(); + + /// Returns the CPU brand string, or "Unknown-CPU" if it cannot be determined on this platform + static std::string getCpuInfo(); + + /// Returns the memory size as "<n> MB", or "Unknown-Memory" if it cannot be determined on this platform + static std::string getMemoryInfo(); + }; // class gsSysInfo + +} // namespace gismo diff --git a/src/gsIO/gsCmdLine.cpp b/src/gsIO/gsCmdLine.cpp index c46afaf6f4..bc989c6992 100644 --- a/src/gsIO/gsCmdLine.cpp +++ b/src/gsIO/gsCmdLine.cpp @@ -23,6 +23,7 @@ //#include // --- end External files +#include #include namespace gismo @@ -423,588 +424,18 @@ void gsCmdLine::printVersion() gsInfo << "\n"; gsInfo << " G+Smo \n"; gsInfo << " Geometry plus Simulation modules\n"; - gsInfo << " version "<< getGismoVersion() << "\n"; - gsInfo << "Compiled by " << getCompilerVersion() - << " (" << getCppVersion() - << ", " << getStdLibVersion() - << ", eigen " << getEigenVersion() - << (getExtraLibsVersion().empty() ?
")\n" : getExtraLibsVersion()+")\n"); - gsInfo << "Running on " << getCpuInfo() - << " (memory " << getMemoryInfo() << ")\n"; + gsInfo << " version "<< gsSysInfo::getGismoVersion() << "\n"; + gsInfo << "Compiled by " << gsSysInfo::getCompilerVersion() + << " (" << gsSysInfo::getCppVersion() + << ", " << gsSysInfo::getStdLibVersion() + << ", eigen " << gsSysInfo::getEigenVersion() + << (gsSysInfo::getExtraLibsVersion().empty() ? ")\n" + : gsSysInfo::getExtraLibsVersion()+")\n"); + gsInfo << "Running on " << gsSysInfo::getCpuInfo() + << " (memory " << gsSysInfo::getMemoryInfo() << ")\n"; gsInfo << "web: http://github.com/gismo\n"; } -std::string gsCmdLine::getGismoVersion() -{ - return util::to_string(GISMO_VERSION); -} - -std::string gsCmdLine::getEigenVersion() -{ - return util::to_string(EIGEN_WORLD_VERSION)+"." - + util::to_string(EIGEN_MAJOR_VERSION)+"." - + util::to_string(EIGEN_MINOR_VERSION); -} - -std::string gsCmdLine::getCompilerVersion() -{ - // This code is copied from the CMakeCXXCompilerId.cpp file that was - // automatically generated with CMake 3.21.4 - - // The following two macros have been modified as we do not want to - // return the compiler version in the specific CMake format -#define DEC(n) n -#define HEX(n) n - -/* Version number components: V=Version, R=Revision, P=Patch - Version date components: YYYY=Year, MM=Month, DD=Day */ - -#if defined(__COMO__) -# define COMPILER_ID "Comeau" - /* __COMO_VERSION__ = VRR */ -# define COMPILER_VERSION_MAJOR DEC(__COMO_VERSION__ / 100) -# define COMPILER_VERSION_MINOR DEC(__COMO_VERSION__ % 100) - -#elif defined(__INTEL_COMPILER) || defined(__ICC) -# define COMPILER_ID "Intel" -# if defined(_MSC_VER) -# define SIMULATE_ID "MSVC" -# endif -# if defined(__GNUC__) -# define SIMULATE_ID "GNU" -# endif - /* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later, - except that a few beta releases use the old format with V=2021. 
*/ -# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111 -# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100) -# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10) -# if defined(__INTEL_COMPILER_UPDATE) -# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE) -# else -# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER % 10) -# endif -# else -# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER) -# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE) - /* The third version component from --version is an update index, - but no macro is provided for it. */ -# define COMPILER_VERSION_PATCH DEC(0) -# endif -# if defined(__INTEL_COMPILER_BUILD_DATE) - /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */ -# define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE) -# endif -# if defined(_MSC_VER) - /* _MSC_VER = VVRR */ -# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) -# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) -# endif -# if defined(__GNUC__) -# define SIMULATE_VERSION_MAJOR DEC(__GNUC__) -# elif defined(__GNUG__) -# define SIMULATE_VERSION_MAJOR DEC(__GNUG__) -# endif -# if defined(__GNUC_MINOR__) -# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__) -# endif -# if defined(__GNUC_PATCHLEVEL__) -# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__) -# endif - -#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER) -# define COMPILER_ID "IntelLLVM" -#if defined(_MSC_VER) -# define SIMULATE_ID "MSVC" -#endif -#if defined(__GNUC__) -# define SIMULATE_ID "GNU" -#endif -/* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and - * later. Look for 6 digit vs. 8 digit version number to decide encoding. - * VVVV is no smaller than the current year when a version is released. 
- */ -#if __INTEL_LLVM_COMPILER < 1000000L -# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100) -# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10) -# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 10) -#else -# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000) -# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100) -# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 100) -#endif -#if defined(_MSC_VER) - /* _MSC_VER = VVRR */ -# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) -# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) -#endif -#if defined(__GNUC__) -# define SIMULATE_VERSION_MAJOR DEC(__GNUC__) -#elif defined(__GNUG__) -# define SIMULATE_VERSION_MAJOR DEC(__GNUG__) -#endif -#if defined(__GNUC_MINOR__) -# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__) -#endif -#if defined(__GNUC_PATCHLEVEL__) -# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__) -#endif - -#elif defined(__PATHCC__) -# define COMPILER_ID "PathScale" -# define COMPILER_VERSION_MAJOR DEC(__PATHCC__) -# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__) -# if defined(__PATHCC_PATCHLEVEL__) -# define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__) -# endif - -#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__) -# define COMPILER_ID "Embarcadero" -# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF) -# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF) -# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__ & 0xFFFF) - -#elif defined(__BORLANDC__) -# define COMPILER_ID "Borland" - /* __BORLANDC__ = 0xVRR */ -# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8) -# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF) - -#elif defined(__WATCOMC__) && __WATCOMC__ < 1200 -# define COMPILER_ID "Watcom" - /* __WATCOMC__ = VVRR */ -# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100) -# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10) -# if (__WATCOMC__ % 
10) > 0 -# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10) -# endif - -#elif defined(__WATCOMC__) -# define COMPILER_ID "OpenWatcom" - /* __WATCOMC__ = VVRP + 1100 */ -# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100) -# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10) -# if (__WATCOMC__ % 10) > 0 -# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10) -# endif - -#elif defined(__SUNPRO_CC) -# define COMPILER_ID "SunPro" -# if __SUNPRO_CC >= 0x5100 - /* __SUNPRO_CC = 0xVRRP */ -# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>12) -# define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xFF) -# define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC & 0xF) -# else - /* __SUNPRO_CC = 0xVRP */ -# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>8) -# define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xF) -# define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC & 0xF) -# endif - -#elif defined(__HP_aCC) -# define COMPILER_ID "HP" - /* __HP_aCC = VVRRPP */ -# define COMPILER_VERSION_MAJOR DEC(__HP_aCC/10000) -# define COMPILER_VERSION_MINOR DEC(__HP_aCC/100 % 100) -# define COMPILER_VERSION_PATCH DEC(__HP_aCC % 100) - -#elif defined(__DECCXX) -# define COMPILER_ID "Compaq" - /* __DECCXX_VER = VVRRTPPPP */ -# define COMPILER_VERSION_MAJOR DEC(__DECCXX_VER/10000000) -# define COMPILER_VERSION_MINOR DEC(__DECCXX_VER/100000 % 100) -# define COMPILER_VERSION_PATCH DEC(__DECCXX_VER % 10000) - -#elif defined(__IBMCPP__) && defined(__COMPILER_VER__) -# define COMPILER_ID "zOS" - /* __IBMCPP__ = VRP */ -# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100) -# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10) -# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10) - -#elif defined(__ibmxl__) && defined(__clang__) -# define COMPILER_ID "XLClang" -# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__) -# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__) -# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__) -# define COMPILER_VERSION_TWEAK 
DEC(__ibmxl_ptf_fix_level__) - - -#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ >= 800 -# define COMPILER_ID "XL" - /* __IBMCPP__ = VRP */ -# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100) -# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10) -# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10) - -#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ < 800 -# define COMPILER_ID "VisualAge" - /* __IBMCPP__ = VRP */ -# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100) -# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10) -# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10) - -#elif defined(__NVCOMPILER) -# define COMPILER_ID "NVHPC" -# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__) -# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__) -# if defined(__NVCOMPILER_PATCHLEVEL__) -# define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__) -# endif - -#elif defined(__PGI) -# define COMPILER_ID "PGI" -# define COMPILER_VERSION_MAJOR DEC(__PGIC__) -# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__) -# if defined(__PGIC_PATCHLEVEL__) -# define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__) -# endif - -#elif defined(_CRAYC) -# define COMPILER_ID "Cray" -# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR) -# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR) - -#elif defined(__TI_COMPILER_VERSION__) -# define COMPILER_ID "TI" - /* __TI_COMPILER_VERSION__ = VVVRRRPPP */ -# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000) -# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000 % 1000) -# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__ % 1000) - -#elif defined(__CLANG_FUJITSU) -# define COMPILER_ID "FujitsuClang" -# define COMPILER_VERSION_MAJOR DEC(__FCC_major__) -# define COMPILER_VERSION_MINOR DEC(__FCC_minor__) -# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__) -# define COMPILER_VERSION_INTERNAL_STR __clang_version__ - - -#elif defined(__FUJITSU) -# define 
COMPILER_ID "Fujitsu" -# if defined(__FCC_version__) -# define COMPILER_VERSION __FCC_version__ -# elif defined(__FCC_major__) -# define COMPILER_VERSION_MAJOR DEC(__FCC_major__) -# define COMPILER_VERSION_MINOR DEC(__FCC_minor__) -# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__) -# endif -# if defined(__fcc_version) -# define COMPILER_VERSION_INTERNAL DEC(__fcc_version) -# elif defined(__FCC_VERSION) -# define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION) -# endif - - -#elif defined(__ghs__) -# define COMPILER_ID "GHS" -/* __GHS_VERSION_NUMBER = VVVVRP */ -# ifdef __GHS_VERSION_NUMBER -# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100) -# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10) -# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER % 10) -# endif - -#elif defined(__SCO_VERSION__) -# define COMPILER_ID "SCO" - -#elif defined(__ARMCC_VERSION) && !defined(__clang__) -# define COMPILER_ID "ARMCC" -#if __ARMCC_VERSION >= 1000000 - /* __ARMCC_VERSION = VRRPPPP */ - # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000) - # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100) - # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000) -#else - /* __ARMCC_VERSION = VRPPPP */ - # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000) - # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10) - # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000) -#endif - - -#elif defined(__clang__) && defined(__apple_build_version__) -# define COMPILER_ID "AppleClang" -# if defined(_MSC_VER) -# define SIMULATE_ID "MSVC" -# endif -# define COMPILER_VERSION_MAJOR DEC(__clang_major__) -# define COMPILER_VERSION_MINOR DEC(__clang_minor__) -# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__) -# if defined(_MSC_VER) - /* _MSC_VER = VVRR */ -# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) -# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) -# endif -# define COMPILER_VERSION_TWEAK 
DEC(__apple_build_version__) - -#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION) -# define COMPILER_ID "ARMClang" - # define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000) - # define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100) - # define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION % 10000) -# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION) - -#elif defined(__clang__) -# define COMPILER_ID "Clang" -# if defined(_MSC_VER) -# define SIMULATE_ID "MSVC" -# endif -# define COMPILER_VERSION_MAJOR DEC(__clang_major__) -# define COMPILER_VERSION_MINOR DEC(__clang_minor__) -# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__) -# if defined(_MSC_VER) - /* _MSC_VER = VVRR */ -# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100) -# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100) -# endif - -#elif defined(__GNUC__) || defined(__GNUG__) -# define COMPILER_ID "GNU" -# if defined(__GNUC__) -# define COMPILER_VERSION_MAJOR DEC(__GNUC__) -# else -# define COMPILER_VERSION_MAJOR DEC(__GNUG__) -# endif -# if defined(__GNUC_MINOR__) -# define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__) -# endif -# if defined(__GNUC_PATCHLEVEL__) -# define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__) -# endif - -#elif defined(_MSC_VER) -# define COMPILER_ID "MSVC" - /* _MSC_VER = VVRR */ -# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100) -# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100) -# if defined(_MSC_FULL_VER) -# if _MSC_VER >= 1400 - /* _MSC_FULL_VER = VVRRPPPPP */ -# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000) -# else - /* _MSC_FULL_VER = VVRRPPPP */ -# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000) -# endif -# endif -# if defined(_MSC_BUILD) -# define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD) -# endif - -#elif defined(__VISUALDSPVERSION__) || defined(__ADSPBLACKFIN__) || defined(__ADSPTS__) || defined(__ADSP21000__) -# define COMPILER_ID "ADSP" -#if defined(__VISUALDSPVERSION__) - /* __VISUALDSPVERSION__ = 
0xVVRRPP00 */ -# define COMPILER_VERSION_MAJOR HEX(__VISUALDSPVERSION__>>24) -# define COMPILER_VERSION_MINOR HEX(__VISUALDSPVERSION__>>16 & 0xFF) -# define COMPILER_VERSION_PATCH HEX(__VISUALDSPVERSION__>>8 & 0xFF) -#endif - -#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC) -# define COMPILER_ID "IAR" -# if defined(__VER__) && defined(__ICCARM__) -# define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000) -# define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000) -# define COMPILER_VERSION_PATCH DEC((__VER__) % 1000) -# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__) -# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__)) -# define COMPILER_VERSION_MAJOR DEC((__VER__) / 100) -# define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100)) -# define COMPILER_VERSION_PATCH DEC(__SUBVERSION__) -# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__) -# endif - - -/* These compilers are either not known or too old to define an - identification macro. Try to identify the platform and guess that - it is the native compiler. 
*/ -#elif defined(__hpux) || defined(__hpua) -# define COMPILER_ID "HP" - -#else /* unknown compiler */ -# define COMPILER_ID "Unknown-Compiler" -#endif - - return util::to_string(COMPILER_ID) -#ifdef COMPILER_VERSION - +" "+util::to_string(COMPILER_VERSION); -#elif defined(COMPILER_VERSION_MAJOR) - +" "+util::to_string(COMPILER_VERSION_MAJOR) -# ifdef COMPILER_VERSION_MINOR - +"."+util::to_string(COMPILER_VERSION_MINOR) -# ifdef COMPILER_VERSION_PATCH - +"."+util::to_string(COMPILER_VERSION_PATCH) -# ifdef COMPILER_VERSION_TWEAK - +"."+util::to_string(COMPILER_VERSION_TWEAK) -# endif -# endif -# endif - ; -#endif - -#undef DEC -#undef HEX -#undef COMPILER_ID -#undef COMPILER_VERSION -#undef COMPILER_VERSION_MAJOR -#undef COMPILER_VERSION_MINOR -#undef COMPILER_VERSION_PATCH -#undef COMPILER_VERSION_TWEAK -#undef SIMULATE_VERSION_MAJOR -#undef SIMULATE_VERSION_MINOR -#undef SIMULATE_VERSION_PATCH -#undef SIMULATE_VERSION_TWEAK -} - -std::string gsCmdLine::getCppVersion() -{ -#if defined(_MSC_VER) && _MSC_VER < 1600 - return "C++ 199711L"; -#elsif _MSC_VER >= 1900 - return "C++ "+util::to_string(_MSVC_LANG); -#elsif _MSC_VER >= 1600 - return "C++ 201103L"; -#else - return "C++ "+util::to_string(__cplusplus); -#endif -} - -std::string gsCmdLine::getStdLibVersion() -{ -#ifdef _LIBCPP_VERSION - return "libc++ "+util::to_string(_LIBCPP_VERSION); -# elif defined(__GLIBCXX__) - return "glibc++ "+util::to_string(__GLIBCXX__); -# elif defined(__GLIBCPP__) - return "glibc++ "+util::to_string(__GLIBCPP__); -#elif defined(__LIBCOMO__) - return "Comeau STL "+util::to_string(__LIBCOMO__); -# elif defined(__STL_CONFIG_H) - return "SGI STL"; -# elif defined(__MSL_CPP__) - return "MSL standard lib"; -# elif defined(__IBMCPP__) - return "VACPP STL"; -# elif defined(MSIPL_COMPILE_H) - return "Modena C++ STL"; -# elif (defined(_YVALS) && !defined(__IBMCPP__)) || defined(_CPPLIB_VER) - return "Dinkumware STL "+util::to_string(_CPPLIB_VER); -# elif defined(__STD_RWCOMPILER_H__) || 
defined(_RWSTD_VER) - return "Rogue Wave lib "+util::to_string(_RWSTD_VER); -#else - return "Unknown-STD"; -#endif -} - -std::string gsCmdLine::getExtraLibsVersion() -{ - std::string s(""); - -#ifdef __INTEL_MKL__ - s += "MKL "+INTEL_MKL_VERSION; -#endif - - return s; -} - -std::string gsCmdLine::getCpuInfo() -{ -#if defined(_WIN32) || defined(_WIN64) - - int CPUInfo[4] = {-1}; - unsigned nExIds, i = 0; - char CPUBrandString[0x40]; - - __cpuid(CPUInfo, 0x80000000); - nExIds = CPUInfo[0]; - - for (i=0x80000000; i<=nExIds; ++i) { - __cpuid(CPUInfo, i); - if (i == 0x80000002) - memcpy(CPUBrandString, CPUInfo, sizeof(CPUInfo)); - else if (i == 0x80000003) - memcpy(CPUBrandString + 16, CPUInfo, sizeof(CPUInfo)); - else if (i == 0x80000004) - memcpy(CPUBrandString + 32, CPUInfo, sizeof(CPUInfo)); - } - - return CPUBrandString; - -#elif __APPLE__ - - std::string CPUBrandString; - std::size_t size = 32; - - // Supply an oversized buffer, and avoid an extra call to sysctlbyname. - CPUBrandString.resize(size); - if (sysctlbyname("machdep.cpu.brand_string", &CPUBrandString[0], &size, NULL, 0) == 0 && size > 0) { - if (CPUBrandString[size-1] == '\0') - size--; - CPUBrandString.resize(size); - return CPUBrandString; - } - -#elif __linux__ -# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) ) - - char CPUBrandString[0x40]; - unsigned int CPUInfo[4] = {0,0,0,0}; - - __cpuid(0x80000000, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); - unsigned int nExIds = CPUInfo[0]; - - memset(CPUBrandString, 0, sizeof(CPUBrandString)); - - for (unsigned int i = 0x80000000; i <= nExIds; ++i) - { - __cpuid(i, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); - - if (i == 0x80000002) - memcpy(CPUBrandString, CPUInfo, sizeof(CPUInfo)); - else if (i == 0x80000003) - memcpy(CPUBrandString + 16, CPUInfo, sizeof(CPUInfo)); - else if (i == 0x80000004) - memcpy(CPUBrandString + 32, CPUInfo, sizeof(CPUInfo)); - } - - return CPUBrandString; - -# endif -#elif 
__unix__ -#endif - - return "Unknown-CPU"; -} - -std::string gsCmdLine::getMemoryInfo() -{ - -#if defined(_WIN32) || defined(_WIN64) - - -#elif __APPLE__ - - int64_t memsize; - std::size_t size = sizeof(memsize); - - if (sysctlbyname("hw.memsize", &memsize, &size, NULL, 0) == 0) { - return util::to_string(memsize / 1024 / 1024)+" MB"; - } - -#elif __linux__ -# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) ) - - long pages = sysconf(_SC_PHYS_PAGES); - long page_size = sysconf(_SC_PAGE_SIZE); - return util::to_string(pages * page_size / 1024 / 1024)+" MB"; - -# endif -#elif __unix__ -#endif - - return "Unknown-Memory"; -} - std::string & gsCmdLine::getMessage() { return my->cmd.getMessage(); diff --git a/src/gsIO/gsCmdLine.h b/src/gsIO/gsCmdLine.h index cfe1bcb011..051328b8df 100644 --- a/src/gsIO/gsCmdLine.h +++ b/src/gsIO/gsCmdLine.h @@ -13,21 +13,7 @@ #pragma once -#if defined(_WIN32) || defined(_WIN64) -# include -#elif __APPLE__ -# include -# include -#elif __linux__ -# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) ) -# include -# include -# endif -#elif __unix__ -#endif - #include -#include namespace gismo { @@ -260,30 +246,6 @@ class GISMO_EXPORT gsCmdLine /// Prints the version information static void printVersion(); - - /// Returns the version of G+Smo - static std::string getGismoVersion(); - - /// Returns the version of Eigen - static std::string getEigenVersion(); - - /// Returns the version of the compiler - static std::string getCompilerVersion(); - - /// Returns the version of the C++ standard - static std::string getCppVersion(); - - /// Returns the version of the standard library - static std::string getStdLibVersion(); - - /// Returns the version of extra libraries - static std::string getExtraLibsVersion(); - - /// Returns CPU information - static std::string getCpuInfo(); - - /// Returns memory information - static std::string getMemoryInfo(); /// 
Returns the program's description (as specified in the constructor) std::string& getMessage(); From ae149d1cfabec1a5a30d8bafff10acb25b995610 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Sun, 5 Dec 2021 11:22:11 +0100 Subject: [PATCH 055/174] New general benchmarking framework --- src/gsIO/gsBenchmark.cpp | 150 +++++++++++++++++++++++++++ src/gsIO/gsBenchmark.h | 217 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 367 insertions(+) create mode 100644 src/gsIO/gsBenchmark.cpp create mode 100644 src/gsIO/gsBenchmark.h diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp new file mode 100644 index 0000000000..45007d6a8a --- /dev/null +++ b/src/gsIO/gsBenchmark.cpp @@ -0,0 +1,150 @@ +/** @file gsBenchmark.cpp + + @brief Provides implemementation of generic benchmarking framework. + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. 
Moller +*/ + +#include + +namespace gismo +{ + + std::ostream &gsBenchmark::gsBenchmarkResultSet::print(std::ostream &os) const + { + os << "\\pgfplotstableread[row sep=\\\\,col sep=&]{\n" + << "threads & " << label << " \\\\\n"; + + for (auto it=results.cbegin(); it!=results.cend(); ++it) + os << (*it)[0] << "&" << (*it)[2] << "\\\\\n"; + + os << "}\\data" << label << "\n"; + + return os; + } + + std::ostream &gsBenchmark::gsBenchmarkSet::print(std::ostream &os) const + { + for (auto it=results.cbegin(); it!=results.cend(); ++it) + (*it)->print(os); + + os << "\\begin{tikzpicture}\n" + << "\\begin{semilogyaxis}[\n" + << "name=MyAxis,\n" + << "width=\\textwidth,\n" + << "height=.5\\textwidth,\n" + << "legend pos=outer north east,\n" + + << "symbolic x coords={"; + + for (auto it=(*results.cbegin())->get().cbegin(); + it!=(*results.cbegin())->get().cend(); ++it) + os << (*it)[0] << (it!=(*results.cbegin())->get().cend()-1 ? "," : ""); + os << "},\n" + + << "xlabel={OpenMP threads},\n"; + + switch((metric)(*(*results.cbegin())->get().cbegin())[4]) { + case metric::bandwidth_kb_sec: + os << "ylabel={Bandwidth in KB/s},\n"; + break; + case metric::bandwidth_mb_sec: + os << "ylabel={Bandwidth in MB/s},\n"; + break; + case metric::bandwidth_gb_sec: + os << "ylabel={Bandwidth in GB/s},\n"; + break; + case metric::bandwidth_tb_sec: + os << "ylabel={Bandwidth in TB/s},\n"; + break; + case metric::perf_kflop_sec: + os << "ylabel={Berformance in kFLOP/s},\n"; + break; + case metric::perf_mflop_sec: + os << "ylabel={Berformance in mFLOP/s},\n"; + break; + case metric::perf_gflop_sec: + os << "ylabel={Berformance in gFLOP/s},\n"; + break; + case metric::perf_tflop_sec: + os << "ylabel={Berformance in tFLOP/s},\n"; + break; + case metric::runtime_sec: + os << "ylabel={Runtime in seconds},\n"; + break; + default: + throw std::runtime_error("Unsupported metric"); + } + + os << "title={" << title << "},\n" + << "]"; + + for (auto it=results.cbegin(); it!=results.cend(); ++it) + os 
<< "\\addplot table[x=threads,y=" + << (*it)->get_label() + << "]{\\data" + << (*it)->get_label() + << "};\n"; + + os << "\\legend{"; + for (auto it=results.cbegin(); it!=results.cend(); ++it) + os << (*it)->get_title() << (it!=results.cend()-1 ? "," : ""); + os << "}\n" + + << "\\end{semilogyaxis}\n" + + << "\\path let \\p1=(MyAxis.west), \\p2=(MyAxis.east) in " + << "node[below right, align=left, text=black, text width=\\x2-\\x1]\n" + << "at ($(MyAxis.south west)+(0,-30pt)$) {%\n" + << "G+Smo " << gsSysInfo::getGismoVersion() + << ", Eigen " << gsSysInfo::getEigenVersion() + << " (" << gsSysInfo::getCompilerVersion() + << ", " << gsSysInfo::getCppVersion() + << ", " << gsSysInfo::getStdLibVersion() + << (gsSysInfo::getExtraLibsVersion().empty() + ? "), \n" + : gsSysInfo::getExtraLibsVersion()+"), \n") + + << "CPU " << gsSysInfo::getCpuInfo() << ", " + << "Memory " << gsSysInfo::getMemoryInfo() << ", "; + + gsJITCompilerConfig jit; jit.load("config/jit.xml"); + std::string flags = jit.getFlags(); + os << "Compiler flags "; + + for (auto token=strtok(&flags[0], " "); token!=NULL; token=strtok(NULL, " ")) { + if (token[0]=='-') { + if (token[1]=='I' || token[1]=='L' || token[1]=='l' || token[1]=='W') + continue; + os << "\\verb!" << token << "! "; + } + } + + os << "};\n" + << "\\end{tikzpicture}\n"; + + return os; + } + + std::ostream &gsBenchmark::print(std::ostream &os) const + { + os << "\\documentclass[tikz]{standalone}\n" + << "\\usepackage{pgfplots}\n" + << "\\usepackage{verbatim}\n" + << "\\begin{document}\n" + << "\\usetikzlibrary{calc}\n"; + + for (auto it=benchmarks.cbegin(); it!=benchmarks.cend(); ++it) + (*it)->print(os); + + os << "\\end{document}\n"; + return os; + } + +} // namespace gismo diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h new file mode 100644 index 0000000000..dcf94cce0c --- /dev/null +++ b/src/gsIO/gsBenchmark.h @@ -0,0 +1,217 @@ +/** @file gsBenchmark.h + + @brief Provides a generic benchmarking framework. 
+ + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace gismo +{ + +/** + * Benchmark metrics + */ +enum metric { + bandwidth_kb_sec, + bandwidth_mb_sec, + bandwidth_gb_sec, + bandwidth_tb_sec, + perf_kflop_sec, + perf_mflop_sec, + perf_gflop_sec, + perf_tflop_sec, + runtime_sec, +}; + +/** + * Benchmark result + */ +typedef std::array gsBenchmarkResult; + +/** + * Benchmark: driver function + */ + + +/** + * Benchmark class + */ +class gsBenchmark +{ +public: + /** + * Benchmark result set class + */ + class gsBenchmarkResultSet + { + public: + gsBenchmarkResultSet(const std::string& label, + const std::string& title, + const std::vector& results) + : label(label), + title(title), + results(results) + { + } + + const std::string& get_label() const + { return label; } + + const std::string& get_title() const + { return title; } + + const std::vector& get() const + { return results; } + + std::ostream &print(std::ostream &os) const; + + private: + const std::string label, title; + std::vector results; + }; + + /** + * Benchmark set class + */ + class gsBenchmarkSet + { + public: + gsBenchmarkSet(const std::string& label, + const std::string& title) + : id('A'), + label(label), + title(title) + {} + + ~gsBenchmarkSet() + { + for (auto it=results.begin(); it!=results.end(); ++it) + delete (*it); + } + + void add(const std::string& label, + const std::string& title, + const std::vector& results) + { + this->results.emplace_back(new gsBenchmarkResultSet(label+std::string(1,id++), + title, results)); + } + + const std::string& get_label() const + { return label; } + + const std::string& get_title() const + { return title; } + + const std::vector& get() const + { 
return results; } + + std::ostream &print(std::ostream &os) const; + + private: + char id; + const std::string label,title; + std::vector results; + }; + +public: + ~gsBenchmark() + { + for (auto it=benchmarks.begin(); it!=benchmarks.end(); ++it) + delete (*it); + } + + gsBenchmarkSet* add(const std::string& label, + const std::string& title) + { + benchmarks.emplace_back(new gsBenchmarkSet(label, title)); + return benchmarks.back(); + } + + const std::vector& get() const + { return benchmarks; } + + std::ostream &print(std::ostream &os) const; + + template + static std::vector + run(const std::vector& nthreads, int nruns, T& benchmark, metric metric) + { + gsStopwatch stopwatch; + std::size_t benchmark_result; + double benchmark_metric, benchmark_runtime; + + std::vector results; + + try { + for (auto it=nthreads.cbegin(); it!=nthreads.cend(); ++it) { + + omp_set_num_threads(*it); + benchmark_runtime = 0.0; + benchmark_metric = 0.0; + + for (int run=0; run(*it) /* number of OpenMP threads */, + benchmark_runtime/(double)nruns /* averaged elapsed time in seconds */, + benchmark_metric/(double)nruns /* averaged benchmark metric */, + (double)metric /* benchmark metric */ + }); + } + } catch(...) 
{} + + return results; + } + +private: + std::vector benchmarks; +}; + +/// Print (as string) operator +std::ostream &operator<<(std::ostream &os, const gsBenchmark& obj) +{ return obj.print(os); } + +} // namespace gismo From 3c4c249898253c621c3142de40864baf45d43515 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Sun, 5 Dec 2021 11:22:28 +0100 Subject: [PATCH 056/174] Included gsBenchmark.h and gsSysInfo.h in gismo.h --- src/gismo.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gismo.h b/src/gismo.h index 454e68fc50..4f460cbbf8 100644 --- a/src/gismo.h +++ b/src/gismo.h @@ -76,6 +76,8 @@ namespace internal #include +#include + // #include // included by gsForwardDeclarations -> gsMemory // Tensors @@ -174,6 +176,7 @@ namespace internal /* ----------- IO ----------- */ #include +#include #include #include #include From 63f03af08fc33ccd452d8a8b21889c6dd8ef961b Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Sun, 5 Dec 2021 11:23:19 +0100 Subject: [PATCH 057/174] Updated handling of C++ standard --- CMakeLists.txt | 27 +- cmake/AddCXXCompileOptions.cmake | 680 ++++++++++++++++++++++--------- cmake/gsConfig.cmake | 16 +- 3 files changed, 512 insertions(+), 211 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bada0df755..a321f543a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,15 +39,22 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) if(NOT CMAKE_BUILD_TYPE) - # Set default build type to Release - set(CMAKE_BUILD_TYPE Release CACHE STRING - "Type of build (None Debug Release RelWithDebInfo MinSizeRel)" FORCE) - if(NOT CMAKE_CONFIGURATION_TYPES) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" - "RelWithDebInfo" "MinSizeRel") - endif() + # Set default build type to Release + set(CMAKE_BUILD_TYPE Release CACHE STRING + "Type of build (None Debug Release RelWithDebInfo MinSizeRel)" FORCE) + if(NOT CMAKE_CONFIGURATION_TYPES) + 
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" + "RelWithDebInfo" "MinSizeRel") + endif() endif() +if(NOT CMAKE_CXX_STANDARD) + # Set default C++ standard to 11 + set(CMAKE_CXX_STANDARD 11 CACHE STRING + "C++ standard (11 14 17 20)" FORCE) + set_property(CACHE CMAKE_CXX_STANDARD PROPERTY STRINGS "11" "14" "17" "20") +endif() + set(gismo_VERSION_MAJOR 21) #year set(gismo_VERSION_MINOR 12) #month set(gismo_VERSION_PATCH 0 ) #patch @@ -59,9 +66,9 @@ if(CMAKE_QUIET) function(message) list(GET ARGV 0 MessageType) if(MessageType STREQUAL FATAL_ERROR OR - MessageType STREQUAL SEND_ERROR OR - MessageType STREQUAL WARNING OR - MessageType STREQUAL AUTHOR_WARNING) + MessageType STREQUAL SEND_ERROR OR + MessageType STREQUAL WARNING OR + MessageType STREQUAL AUTHOR_WARNING) list(REMOVE_AT ARGV 0) _message(${MessageType} "${ARGV}") endif() diff --git a/cmake/AddCXXCompileOptions.cmake b/cmake/AddCXXCompileOptions.cmake index f152b176e9..51c5aa21de 100644 --- a/cmake/AddCXXCompileOptions.cmake +++ b/cmake/AddCXXCompileOptions.cmake @@ -1,188 +1,492 @@ -###################################################################### -## AddCXXConpileOptions.cmake -## This file is part of the G+Smo library. -## -## Authors: M. Moeller and A. 
Mantzaflaris -###################################################################### - -set(CMAKE_CXX_STANDARD_DEFAULT 14) - -if (CMAKE_CXX_COMPILER_ID STREQUAL "PGI") - - # CMake does not yet provide flags for the Portland Group compiler - - # The Portland Group - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) - set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "$std=c++98") - set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "$std=c++98") - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=c++11") - set(CMAKE_CXX_STANDARD_DEFAULT 11) - else() - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++0x") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=c++0x") - set(CMAKE_CXX_STANDARD_DEFAULT 98) - endif() - -endif() - -if (CMAKE_VERSION VERSION_LESS "3.1") - -if ((CMAKE_SYSTEM_NAME STREQUAL "Darwin") AND (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")) - - #also: -stdlib=libc++ - - # Apple Clang - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0) - set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++98") - set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") - set(CMAKE_CXX_STANDARD_DEFAULT 11) - endif() - - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.1) - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") - set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") - set(CMAKE_CXX_STANDARD_DEFAULT 14) - elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1) - # AppleClang 5.0 knows this flag, but does not set a __cplusplus macro greater than 201103L - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") - set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") - set(CMAKE_CXX_STANDARD_DEFAULT 14) - endif() - -elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - - # LLVM Clang - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2.1) - set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "") - 
set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") - set(CMAKE_CXX_STANDARD_DEFAULT 98) - endif() - - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.1) - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") - set(CMAKE_CXX_STANDARD_DEFAULT 11) - elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2.1) - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++0x") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++0x") - set(CMAKE_CXX_STANDARD_DEFAULT 11) - endif() - - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.5) - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") - set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") - set(CMAKE_CXX_STANDARD_DEFAULT 14) - elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.4) - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") - set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") - # .. additionally requires gnu libstdc++ greater than 4.6 - # set(CMAKE_CXX_STANDARD_DEFAULT 14) - set(CMAKE_CXX_STANDARD_DEFAULT 11) - endif() - -elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - - # GNU Compiler Collection - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.3) - # Flag supported since 4.3 - set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++98") - set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") - set(CMAKE_CXX_STANDARD_DEFAULT 98) - endif() - - if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") - set(CMAKE_CXX_STANDARD_DEFAULT 11) - elseif (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.4) - # 4.3 supports 0x variants, but compliance is very low - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++0x") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++0x") - set(CMAKE_CXX_STANDARD_DEFAULT 11) - endif() - - if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9) - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") - 
set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") - set(CMAKE_CXX_STANDARD_DEFAULT 14) - elseif (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8) - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") - set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") - set(CMAKE_CXX_STANDARD_DEFAULT 14) - endif() - -elseif ( "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xIntel") - - # Intel compiler - if("x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") - set(_std -Qstd) - set(_ext c++) - else() - set(_std -std) - set(_ext gnu++) - endif() - - if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13.1) - set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "${_std}=c++98") - set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "${_std}=${_ext}98") - set(CMAKE_CXX_STANDARD_DEFAULT 98) - endif() - - if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.2) - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "${_std}=c++11") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "${_std}=${_ext}11") - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "${_std}=c++14") - # todo: there is no gnu++14 value supported; figure out what to do - set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "${_std}=c++14") - set(CMAKE_CXX_STANDARD_DEFAULT 14) - elseif (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "${_std}=c++0x") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "${_std}=${_ext}0x") - set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "${_std}=c++1y") - # todo: there is no gnu++14 value supported; figure out what to do - set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "${_std}=c++1y") - set(CMAKE_CXX_STANDARD_DEFAULT 14) - endif() - - unset(_std) - unset(_ext) - -elseif (CMAKE_CXX_COMPILER_ID STREQUAL "SunPro") - - # Oracle Solaris Studio - if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.13) - set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") - set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=c++11") - set(CMAKE_CXX_STANDARD_DEFAULT 11) - endif() - -endif() - -endif() # cmake 3.1 - -if (NOT DEFINED CMAKE_CXX_STANDARD) - 
set(CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_DEFAULT} CACHE INTERNAL "") -endif() - -# Apply for Cmake less than 3.1 -if (CMAKE_VERSION VERSION_LESS "3.1") - - if ( NOT "x${CMAKE_CXX_STANDARD}" STREQUAL "x98" AND - ${CMAKE_CXX_STANDARD_DEFAULT} LESS ${CMAKE_CXX_STANDARD}) - #message(FATAL_ERROR "The compiler ${CMAKE_CXX_COMPILER} supports at most C++${CMAKE_CXX_STANDARD_DEFAULT} (requested: ${CMAKE_CXX_STANDARD}).") - message(STATUS "The compiler ${CMAKE_CXX_COMPILER} supports at most C++${CMAKE_CXX_STANDARD_DEFAULT}, CXX_STANDARD choice is changed.") - set(CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_DEFAULT} CACHE INTERNAL "") - endif() - - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION}") -endif()#cmake<3.1 - - -# Bugfix for windows/msvc systems -if(NOT DEFINED CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION) - set(CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION "") - set(CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION "") -endif() +###################################################################### +## AddCXXConpileOptions.cmake +## This file is part of the G+Smo library. +## +## Authors: M. Moeller and A. Mantzaflaris +###################################################################### + +# We strongly recommend to use an up-to-date cmake version which +# provides support for the most recent compiler version. We provide a +# subset of compiler options copied from cmake 3.17.5. +# +# The options below are only used if +# CMAKE_CXXvv_STANDARD_COMPILE_OPTIONS and +# CMAKE_CXXvv_EXTENSION_COMPILE_OPTIONS are not yet set by the regular +# cmake routines, where vv is the value of CMAKE_CXX_STANDARD. + +if(NOT CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION OR + NOT CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION) + + message(WARNING "Update your CMake installation! 
We fall back to + compiler options back ported from CMake 3.17.5") + + if("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xAppleClang") + + # AppleClang + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++98") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.1) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1) + # AppleClang 5.0 knows this flag, but does not set a __cplusplus macro greater than 201103L + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10.0) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++17") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.1) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++1z") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++1z") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.0) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10.0) + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-std=c++2a") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-std=gnu++2a") + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xARMClang" OR + "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xClang" OR + "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xFujitsuClang") + + # ARMClang/Clang + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2.1) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++98") + 
set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.1) + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2.1) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++0x") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++0x") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.5) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.4) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++17") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.5) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++1z") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++1z") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-std=c++2a") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-std=gnu++2a") + endif() + + if("x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + # The MSVC standard library requires C++14, and MSVC itself has no + # notion of operating in a mode not aware of at least that standard. 
+ set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++14") + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++14") + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xCray") + + # Cray + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION -h conform) + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION -h gnu) + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.4) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION -h std=c++11) + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION -h std=c++11,gnu) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + endif() + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.6) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION -h std=c++14) + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION -h std=c++14,gnu) + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + endif() + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xFujitsu") + + # Configuration taken from CMake 3.22.0 + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION -std=c++03) + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION -std=gnu++03) + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION -std=c++11) + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION -std=gnu++11) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION -std=c++14) + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION -std=gnu++14) + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION -std=c++17) + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION -std=gnu++17) + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xGNU") + + # GNU + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.4) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++98") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") + endif() + 
+ if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7) + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.4) + # 4.3 supports 0x variants + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++0x") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++0x") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8.1) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.0) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++17") + elseif (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++1z") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++1z") + endif() + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 11.1) + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-std=c++20") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-std=gnu++20") + set(CMAKE_CXX23_STANDARD_COMPILE_OPTION "-std=c++23") + set(CMAKE_CXX23_EXTENSION_COMPILE_OPTION "-std=gnu++23") + elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.0) + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-std=c++2a") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-std=gnu++2a") + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xIntel") + + # Intel classical + + if("x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + + 
set(CMAKE_CXX_CLANG_TIDY_DRIVER_MODE "cl") + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0.0) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-Qstd=c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-Qstd=c++17") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 16.0) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-Qstd=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-Qstd=c++14") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13.0) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-Qstd=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-Qstd=c++11") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.1) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-Qstd=c++0x") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-Qstd=c++0x") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.1) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + else() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0.0) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++17") + endif() + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 17.0) + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.2) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") + endif() + + # Intel 15.0.2 accepts c++14 instead of c++1y, but not gnu++14 + # instead of gnu++1y. Intel 17.0.0 accepts gnu++14 too. 
+ if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 17.0) + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13.0) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") + elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.1) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++0x") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++0x") + endif() + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.1) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++98") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + endif() + + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xIntelLLVM") + + # Intel Clang-based + + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + + if(NOT "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++98") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") + + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") + + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") + + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std=c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std=gnu++17") + + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-std=c++20") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-std=gnu++20") + else() + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "") + + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-Qstd=c++11") + 
set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-Qstd=c++11") + + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-Qstd=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-Qstd=c++14") + + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-Qstd=c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-Qstd=c++17") + + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-Qstd=c++20") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-Qstd=c++20") + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xMSVC") + + if((CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 19.0.24215.1 AND + CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.10) OR + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 19.10.25017) + + # VS 2015 Update 3 and above support language standard level flags, + # with the default and minimum level being C++14. + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std:c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std:c++14") + + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 19.11.25505) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std:c++17") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std:c++17") + else() + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "-std:c++latest") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "-std:c++latest") + endif() + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 19.29.30129) + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-std:c++20") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-std:c++20") + set(CMAKE_CXX23_STANDARD_COMPILE_OPTION "-std:c++latest") + set(CMAKE_CXX23_EXTENSION_COMPILE_OPTION "-std:c++latest") + elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 19.12.25835) + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-std:c++latest") + 
set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-std:c++latest") + endif() + + elseif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0) + # MSVC has no specific options to set language standards, but set them as + # empty strings anyways so the feature test infrastructure can at least check + # to see if they are defined. + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "") + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xPGI" OR + "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xNVHPC") + + # PGI/NVHPCV + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.10) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION -A) + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION --gnu_extensions) + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13.10) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION --c++11 -A) + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION --c++11 --gnu_extensions) + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 15.7) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14 -A) + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION --c++14 --gnu_extensions) + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 17.1) + set(CMAKE_CXX17_STANDARD_COMPILE_OPTION --c++17 -A) + set(CMAKE_CXX17_EXTENSION_COMPILE_OPTION --c++17 --gnu_extensions) + set(CMAKE_CXX17_STANDARD__HAS_FULL_SUPPORT ON) + endif() + endif() + endif() + endif() + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 20.11) + set(CMAKE_CXX20_STANDARD_COMPILE_OPTION -std=c++20) + 
set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION -std=gnu++20) + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xSunPro") + + # Oracle Solaris Studio + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.13) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++03") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=c++03") + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX_LINK_WITH_STANDARD_COMPILE_OPTION 1) + + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.14) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=c++14") + endif() + else() + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-library=stlport4") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-library=stlport4") + set(CMAKE_CXX_LINK_WITH_STANDARD_COMPILE_OPTION 1) + endif() + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xXL") + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10.1) + if(CMAKE_SYSTEM MATCHES "Linux") + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "") + else() + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-qlanglvl=strict98") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-qlanglvl=extended") + endif() + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-qlanglvl=extended0x") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-qlanglvl=extended0x") + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + + # XL does not really have full C++11 or C++14 support, but since we do not + # have a granular XL-CXX-FeatureTests table for it just pretend it does. + # This way projects that specify granular features will at least get a + # compiler mode for the corresponding standard. 
+ set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.1.0 AND CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-qlanglvl=extended1y") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-qlanglvl=extended1y") + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + endif() + endif () + + elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xXLClang") + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13.1.1) + set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX98_STANDARD__HAS_FULL_SUPPORT ON) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-qlanglvl=extended0x") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-qlanglvl=extended0x") + set(CMAKE_CXX11_STANDARD__HAS_FULL_SUPPORT ON) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13.1.2) + set(CMAKE_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") + set(CMAKE_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") + set(CMAKE_CXX14_STANDARD__HAS_FULL_SUPPORT ON) + endif () + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.1.0) + set(CMAKE_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") + set(CMAKE_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") + endif() + endif() + + else() + + message(FATAL_ERROR "Unsupported compiler ${CMAKE_CXX_COMPILER_ID}") + + endif() + +endif() + +if (NOT DEFINED CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION OR + NOT DEFINED CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION) + message(FATAL_ERROR "Unsupported C++ standard") +endif() diff --git a/cmake/gsConfig.cmake b/cmake/gsConfig.cmake index 9350926170..85b9c3c697 100644 --- a/cmake/gsConfig.cmake +++ b/cmake/gsConfig.cmake @@ -55,15 +55,6 @@ if(NOT GISMO_INDEX_TYPE) "int" "int32_t" "int64_t" "long" "long long" ) endif() -# Set a default build type if none was specified -if(NOT CMAKE_BUILD_TYPE AND NOT 
CMAKE_CONFIGURATION_TYPES) - set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING - "Type of build (None Debug Release RelWithDebInfo MinSizeRel)" FORCE) - # Set the possible values of build type for cmake-gui - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" - "RelWithDebInfo" "MinSizeRel") -endif() - set(${PROJECT_NAME}_ARCHIVE_OUTPUT_DIRECTORY lib) set(${PROJECT_NAME}_RUNTIME_OUTPUT_DIRECTORY bin) set(${PROJECT_NAME}_LIBRARY_OUTPUT_DIRECTORY lib) @@ -103,7 +94,7 @@ if(GISMO_BUILD_COVERAGE AND CMAKE_COMPILER_IS_GNUCXX) APPEND_COVERAGE_COMPILER_FLAGS() #set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -ftest-coverage -fprofile-arcs") #set(CMAKE_EXE_LINKER_FLAGS "-fprofile-arcs -ftest-coverage") -endif(GISMO_BUILD_COVERAGE AND CMAKE_COMPILER_IS_GNUCXX) +endif() if("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xMSVC") @@ -139,7 +130,7 @@ endif() if(GISMO_EXTRA_DEBUG) include(gsDebugExtra) -endif(GISMO_EXTRA_DEBUG) +endif() if("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xMSVC") # Force to always compile with W4 @@ -249,11 +240,10 @@ endif() #message(STATUS "Using compilation flags: ${CMAKE_CXX_FLAGS}, ${CMAKE_CXX_FLAGS_${TEMP}}") if("x${CMAKE_BUILD_TYPE}" STREQUAL "xRelease") - #https://github.com/VcDevel/Vc/blob/master/cmake/OptimizeForArchitecture.cmake include( OptimizeForArchitecture ) OptimizeForArchitecture() foreach (flag ${OFA_ARCHITECTURE_FLAGS}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${flag}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${flag}") endforeach() -endif("x${CMAKE_BUILD_TYPE}" STREQUAL "xRelease") +endif() From 84406e6a4a32d24095be64736bc20c8fef93f310 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Sun, 5 Dec 2021 11:23:37 +0100 Subject: [PATCH 058/174] Migrated benchmark application to new gsBenchmark class --- examples/performance_benchmark.cpp | 387 +++-------------------------- 1 file changed, 39 insertions(+), 348 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 
6fbb0cecd1..b2c339b2e7 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -314,319 +314,10 @@ class benchmark_eigen_vector_dense_matmul }; //! [Implement benchmarks] -//! [Implement benchmark infrastructure] - -/** - * Benchmark metrics - */ -enum class benchmark_metric { - bandwidth_kb_sec, - bandwidth_mb_sec, - bandwidth_gb_sec, - bandwidth_tb_sec, - perf_kflop_sec, - perf_mflop_sec, - perf_gflop_sec, - perf_tflop_sec, - runtime_sec, -}; - -/** - * Benchmark: driver function - */ -template -std::vector< std::array > -benchmark_driver(const std::vector& nthreads, int nruns, T& benchmark, benchmark_metric metric) -{ - gsStopwatch stopwatch; - std::size_t benchmark_result; - double benchmark_metric, benchmark_runtime; - - std::vector< std::array > results; - - try { - for (auto it=nthreads.cbegin(); it!=nthreads.cend(); ++it) { - - omp_set_num_threads(*it); - benchmark_runtime = 0.0; - benchmark_metric = 0.0; - - for (int run=0; run(*it) /* number of OpenMP threads */, - benchmark_runtime/(double)nruns /* averaged elapsed time in seconds */, - benchmark_metric/(double)nruns /* averaged benchmark metric */, - (double)metric} /* benchmark metric */ ); - } - } catch(...) 
{} - - return results; -} - -/** - * Benchmark LaTeX output - */ -class benchmark_latex -{ -public: - /** - * Result set class - */ - class result_set - { - public: - result_set(const std::string& label, - const std::string& title, - const std::vector< std::array >& results) - : label(label), - title(title), - results(results) - { - } - - const std::string& get_label() const - { return label; } - - const std::string& get_title() const - { return title; } - - const std::vector< std::array >& get_results() const - { return results; } - - std::ostream &print(std::ostream &os) const - { - os << "\\pgfplotstableread[row sep=\\\\,col sep=&]{\n" - << "threads & " << label << " \\\\\n"; - - for (auto it=results.cbegin(); it!=results.cend(); ++it) - os << (*it)[0] << "&" << (*it)[2] << "\\\\\n"; - - os << "}\\data" << label << "\n"; - - return os; - } - - private: - const std::string label, title; - std::vector< std::array > results; - }; - - /** - * Benchmark set class - */ - class benchmark_set - { - public: - benchmark_set(const std::string& label, - const std::string& title) - : id('A'), - label(label), - title(title) - {} - - ~benchmark_set() - { - for (auto it=results.begin(); it!=results.end(); ++it) - delete (*it); - } - - void add_results(const std::string& label, - const std::string& title, - const std::vector< std::array >& results) - { - this->results.emplace_back(new result_set(label+std::string(1,id++), title, results)); - } - - const std::string& get_label() const - { return label; } - - const std::string& get_title() const - { return title; } - - const std::vector& get_results() const - { return results; } - - std::ostream &print(std::ostream &os) const - { - for (auto it=results.cbegin(); it!=results.cend(); ++it) - (*it)->print(os); - - os << "\\begin{tikzpicture}\n" - << "\\begin{semilogyaxis}[\n" - << "name=MyAxis,\n" - << "width=\\textwidth,\n" - << "height=.5\\textwidth,\n" - << "legend pos=outer north east,\n" - - << "symbolic x coords={"; - - for 
(auto it=(*results.cbegin())->get_results().cbegin(); - it!=(*results.cbegin())->get_results().cend(); ++it) - os << (*it)[0] << (it!=(*results.cbegin())->get_results().cend()-1 ? "," : ""); - os << "},\n" - - << "xlabel={OpenMP threads},\n"; - - switch((benchmark_metric)(*(*results.cbegin())->get_results().cbegin())[4]) { - case benchmark_metric::bandwidth_kb_sec: - os << "ylabel={Bandwidth in KB/s},\n"; - break; - case benchmark_metric::bandwidth_mb_sec: - os << "ylabel={Bandwidth in MB/s},\n"; - break; - case benchmark_metric::bandwidth_gb_sec: - os << "ylabel={Bandwidth in GB/s},\n"; - break; - case benchmark_metric::bandwidth_tb_sec: - os << "ylabel={Bandwidth in TB/s},\n"; - break; - case benchmark_metric::perf_kflop_sec: - os << "ylabel={Berformance in kFLOP/s},\n"; - break; - case benchmark_metric::perf_mflop_sec: - os << "ylabel={Berformance in mFLOP/s},\n"; - break; - case benchmark_metric::perf_gflop_sec: - os << "ylabel={Berformance in gFLOP/s},\n"; - break; - case benchmark_metric::perf_tflop_sec: - os << "ylabel={Berformance in tFLOP/s},\n"; - break; - case benchmark_metric::runtime_sec: - os << "ylabel={Runtime in seconds},\n"; - break; - default: - throw std::runtime_error("Unsupported metric"); - } - - os << "title={" << title << "},\n" - << "]"; - - for (auto it=results.cbegin(); it!=results.cend(); ++it) - os << "\\addplot table[x=threads,y=" - << (*it)->get_label() - << "]{\\data" - << (*it)->get_label() - << "};\n"; - - os << "\\legend{"; - for (auto it=results.cbegin(); it!=results.cend(); ++it) - os << (*it)->get_title() << (it!=results.cend()-1 ? 
"," : ""); - os << "}\n" - - << "\\end{semilogyaxis}\n" - - << "\\path let \\p1=(MyAxis.west), \\p2=(MyAxis.east) in " - << "node[below right, align=left, text=black, text width=\\x2-\\x1]\n" - << "at ($(MyAxis.south west)+(0,-30pt)$) {%\n" - << "G+Smo " << gsCmdLine::getGismoVersion() - << ", Eigen " << gsCmdLine::getEigenVersion() - << " (" << gsCmdLine::getCompilerVersion() - << ", " << gsCmdLine::getCppVersion() - << ", " << gsCmdLine::getStdLibVersion() - << (gsCmdLine::getExtraLibsVersion().empty() - ? "), \n" - : gsCmdLine::getExtraLibsVersion()+"), \n") - - << "CPU " << gsCmdLine::getCpuInfo() << ", " - << "Memory " << gsCmdLine::getMemoryInfo() << ", "; - - gsJITCompilerConfig jit; jit.load("config/jit.xml"); - std::string flags = jit.getFlags(); - os << "Compiler flags "; - - for (auto token=strtok(&flags[0], " "); token!=NULL; token=strtok(NULL, " ")) { - if (token[0]=='-') { - if (token[1]=='I' || token[1]=='L' || token[1]=='l' || token[1]=='W') - continue; - os << "\\verb!" << token << "! 
"; - } - } - - os << "};\n" - << "\\end{tikzpicture}\n"; - - return os; - } - - private: - char id; - const std::string label,title; - std::vector< result_set* > results; - }; - -public: - ~benchmark_latex() - { - for (auto it=benchmarks.begin(); it!=benchmarks.end(); ++it) - delete (*it); - } - - benchmark_set* add_benchmark(const std::string& label, - const std::string& title) - { - benchmarks.emplace_back(new benchmark_set(label, title)); - return benchmarks.back(); - } - - const std::vector< benchmark_set* >& get_benchmarks() const - { return benchmarks; } - - std::ostream &print(std::ostream &os) const - { - os << "\\documentclass[tikz]{standalone}\n" - << "\\usepackage{pgfplots}\n" - << "\\usepackage{verbatim}\n" - << "\\begin{document}\n" - << "\\usetikzlibrary{calc}\n"; - - for (auto it=benchmarks.cbegin(); it!=benchmarks.cend(); ++it) - (*it)->print(os); - - os << "\\end{document}\n"; - return os; - } - -private: - std::vector< benchmark_set* > benchmarks; -}; - -/// Print (as string) operator -std::ostream &operator<<(std::ostream &os, const benchmark_latex& obj) -{ return obj.print(os); } -//! [Implement benchmark infrastructure] - - int main(int argc, char *argv[]) { //! [Parse command line] - benchmark_latex latex; + gsBenchmark benchmark; std::vector nthreads, ssizes, dsizes, vsizes; std::string fn; int nruns=1; @@ -682,17 +373,17 @@ int main(int argc, char *argv[]) //! [Execute benchmarks] { - auto bm = latex.add_benchmark("memcopy", "memory copy"); + auto bm = benchmark.add("memcopy", "memory copy"); { gsInfo << "=== Native C array memcopy\n"; for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." 
: "\n") << std::flush; try { benchmark_c_array_memcopy benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); - bm->add_results("nativememcopy", - "native("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", - results); + auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); + bm->add("nativememcopy", + "native("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", + results); } catch(...) { gsInfo << "failed!"; } } } @@ -703,27 +394,27 @@ int main(int argc, char *argv[]) gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." : "\n") << std::flush; try { benchmark_eigen_vector_memcopy benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); - bm->add_results("eigenmemcopy", - "eigen("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", - results); + auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); + bm->add("eigenmemcopy", + "eigen("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", + results); } catch(...) { gsInfo << "failed!"; } } } } { - auto bm = latex.add_benchmark("dotprod", "dot-product"); + auto bm = benchmark.add("dotprod", "dot-product"); { gsInfo << "=== Native C array dot-product\n"; for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." : "\n") << std::flush; try { benchmark_c_array_dotproduct benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); - bm->add_results("nativedotproduct", - "native("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", - results); + auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); + bm->add("nativedotproduct", + "native("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", + results); } catch(...) 
{ gsInfo << "failed!"; } } } @@ -734,27 +425,27 @@ int main(int argc, char *argv[]) gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." : "\n") << std::flush; try { benchmark_eigen_vector_dotproduct benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); - bm->add_results("eigendotproduct", - "eigen("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", - results); + auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); + bm->add("eigendotproduct", + "eigen("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", + results); } catch(...) { gsInfo << "failed!"; } } } } { - auto bm = latex.add_benchmark("axpy", "axpy"); + auto bm = benchmark.add("axpy", "axpy"); { gsInfo << "=== Native C array AXPY\n"; for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." : "\n") << std::flush; try { benchmark_c_array_axpy benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); - bm->add_results("nativeaxpy", - "native("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", - results); + auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); + bm->add("nativeaxpy", + "native("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", + results); } catch(...) { gsInfo << "failed!"; } } } @@ -765,27 +456,27 @@ int main(int argc, char *argv[]) gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." 
: "\n") << std::flush; try { benchmark_eigen_vector_axpy benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); - bm->add_results("eigenaxpy", - "eigen("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", - results); + auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); + bm->add("eigenaxpy", + "eigen("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", + results); } catch(...) { gsInfo << "failed!"; } } } } { - auto bm = latex.add_benchmark("densemvmul", "Dense matrix-vector multiply"); + auto bm = benchmark.add("densemvmul", "Dense matrix-vector multiply"); { gsInfo << "=== Native C array dense matrix-vector multiplication\n"; for (auto it=dsizes.cbegin(); it!=dsizes.cend(); ++it) { gsInfo << (*it) << (it!=dsizes.cend()-1 ? "." : "\n") << std::flush; try { benchmark_c_array_dense_matmul benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); - bm->add_results("nativdensemvmul", - "native("+util::to_string(std::pow(sizeof(double)*(double)*it / 1024 / 1024, 2), 0)+" MB)", - results); + auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); + bm->add("nativdensemvmul", + "native("+util::to_string(std::pow(sizeof(double)*(double)*it / 1024 / 1024, 2), 0)+" MB)", + results); } catch(...) { gsInfo << "failed!"; } } } @@ -796,21 +487,21 @@ int main(int argc, char *argv[]) gsInfo << (*it) << (it!=dsizes.cend()-1 ? "." 
: "\n") << std::flush; try { benchmark_eigen_vector_dense_matmul benchmark(*it); - auto results = benchmark_driver(nthreads, nruns, benchmark, benchmark_metric::bandwidth_gb_sec); - bm->add_results("eigenmvmul", - "eigen("+util::to_string(std::pow(sizeof(double)*(double)*it / 1024 / 1024, 2), 0)+" MB)", - results); + auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); + bm->add("eigenmvmul", + "eigen("+util::to_string(std::pow(sizeof(double)*(double)*it / 1024 / 1024, 2), 0)+" MB)", + results); } catch(...) { gsInfo << "failed!"; } } } } if (fn.empty()) - gsInfo << latex << "\n"; + gsInfo << benchmark << "\n"; else { std::ofstream file; file.open(fn); - file << latex << "\n"; + file << benchmark << "\n"; file.close(); } //! [Execute benchmarks] From ea4ddf831db9f79cd5aa9b70d9b608f66a96337f Mon Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Sun, 5 Dec 2021 14:09:04 +0100 Subject: [PATCH 059/174] fix compilation, moved gsOpenMp to the gsParallel folder --- src/gismo.h | 4 +- src/gsCore/gsOpenMP.cpp | 447 ------------------ src/gsCore/gsOpenMP.h | 201 -------- src/gsCore/gsSysInfo.cpp | 15 + src/gsCore/gsSysInfo.h | 18 +- src/gsIO/gsBenchmark.cpp | 1 + src/gsIO/gsBenchmark.h | 5 +- src/gsIO/gsCmdLine.h | 2 +- src/{gsMpi => gsParallel}/gsBinaryFunctions.h | 0 src/{gsMpi => gsParallel}/gsMpi.cpp | 2 +- src/{gsMpi => gsParallel}/gsMpi.h | 6 +- src/{gsMpi => gsParallel}/gsMpiComm.h | 0 src/{gsMpi => gsParallel}/gsMpiTraits.h | 0 src/gsParallel/gsOpenMP.cpp | 440 +++++++++++++++++ src/gsParallel/gsOpenMP.h | 197 ++++++++ src/gsUtils/gsUtils.h | 2 +- 16 files changed, 665 insertions(+), 675 deletions(-) delete mode 100644 src/gsCore/gsOpenMP.cpp delete mode 100644 src/gsCore/gsOpenMP.h rename src/{gsMpi => gsParallel}/gsBinaryFunctions.h (100%) rename src/{gsMpi => gsParallel}/gsMpi.cpp (90%) rename src/{gsMpi => gsParallel}/gsMpi.h (98%) rename src/{gsMpi => gsParallel}/gsMpiComm.h (100%) rename src/{gsMpi => 
gsParallel}/gsMpiTraits.h (100%) create mode 100644 src/gsParallel/gsOpenMP.cpp create mode 100644 src/gsParallel/gsOpenMP.h diff --git a/src/gismo.h b/src/gismo.h index 4f460cbbf8..33c3a6a9bc 100644 --- a/src/gismo.h +++ b/src/gismo.h @@ -186,8 +186,8 @@ namespace internal #include #include -/* ----------- MPI ----------- */ -#include +/* ----------- Parallel ----------- */ +#include /* ----------- Utilities ----------- */ //#include - in gsForwardDeclarations.h diff --git a/src/gsCore/gsOpenMP.cpp b/src/gsCore/gsOpenMP.cpp deleted file mode 100644 index e06641668f..0000000000 --- a/src/gsCore/gsOpenMP.cpp +++ /dev/null @@ -1,447 +0,0 @@ -/** @file gsOpenMP.cpp - - @brief Implementation of OpenMP stub routines to be used when libomp is not available - - This file is part of the G+Smo library. - - This Source Code Form is subject to the terms of the Mozilla Public - License, v. 2.0. If a copy of the MPL was not distributed with this - file, You can obtain one at http://mozilla.org/MPL/2.0/. - - Author(s): M. 
Moller -*/ - -#if !defined(_OPENMP) - -#include - -void omp_set_num_threads(int num_threads) -{} - -int omp_get_num_threads(void) -{ - return 1; -} - -int omp_get_max_threads(void) -{ - return 1; -} - -int omp_get_thread_num(void) -{ - return 0; -} - -int omp_get_num_procs(void) -{ - return 1; -} - -int omp_in_parallel(void) -{ - return 0; -} - -void omp_set_dynamic(int dynamic_threads) -{} - -int omp_get_dynamic(void) -{ - return 0; -} - -int omp_get_cancellation(void) -{ - return 0; -} - -void omp_set_nested(int nested) -{} - -int omp_get_nested(void) -{ - return 0; -} - -void omp_set_schedule(omp_sched_t kind, int chunk_size) -{} - -void omp_get_schedule(omp_sched_t *kind, int *chunk_size) -{ - *kind = omp_sched_static; - *chunk_size = 0; -} - -int omp_get_thread_limit(void) -{ - return 1; -} - -void omp_set_max_active_levels(int max_active_levels) -{} - -int omp_get_max_active_levels(void) -{ - return 0; -} - -int omp_get_level(void) -{ - return 0; -} - -int omp_get_ancestor_thread_num(int level) -{ - if (level == 0) - { - return 0; - } - else - { - return -1; - } -} - -int omp_get_team_size(int level) -{ - if (level == 0) - { - return 1; - } - else - { - return -1; - } -} - -int omp_get_active_level(void) -{ - return 0; -} - -int omp_in_final(void) -{ - return 1; -} - -omp_proc_bind_t omp_get_proc_bind(void) -{ - return omp_proc_bind_false; -} - -int omp_get_num_places(void) -{ - return 0; -} - -int omp_get_place_num_procs(int place_num) -{ - return 0; -} - -void omp_get_place_proc_ids(int place_num, int *ids) -{} - -int omp_get_place_num(void) -{ - return -1; -} - -int omp_get_partition_num_places(void) -{ - return 0; -} - -void omp_get_partition_place_nums(int *place_nums) -{} - -void omp_set_default_device(int device_num) -{} - -int omp_get_default_device(void) -{ - return 0; -} - -int omp_get_num_devices(void) -{ - return 0; -} - -int omp_get_num_teams(void) -{ - return 1; -} - -int omp_get_team_num(void) -{ - return 0; -} - -int 
omp_is_initial_device(void) -{ - return 1; -} - -int omp_get_initial_device(void) -{ - return -10; -} - -int omp_get_max_task_priority(void) -{ - return 0; -} - -void omp_init_lock(omp_lock_t *arg) -{ - arg->lock = UNLOCKED; -} - -void omp_init_lock_with_hint(omp_lock_t *arg, omp_lock_hint_t hint) -{ - omp_init_lock(arg); -} - -void omp_destroy_lock(omp_lock_t *arg) -{ - arg->lock = INIT; -} - -void omp_set_lock(omp_lock_t *arg) -{ - if (arg->lock == UNLOCKED) - { - arg->lock = LOCKED; - } - else if (arg->lock == LOCKED) - { - fprintf(stderr, "error: deadlock in using lock variable\n"); - exit(1); - } - else - { - exit(1); - } -} - -void omp_unset_lock(omp_lock_t *arg) -{ - if (arg->lock == LOCKED) - { - arg->lock = UNLOCKED; - } - else if (arg->lock == UNLOCKED) - { - fprintf(stderr, "error: lock not set\n"); - exit(1); - } - else - { - fprintf(stderr, "error: lock not initialized\n"); - exit(1); - } -} - -int omp_test_lock(omp_lock_t *arg) -{ - if (arg->lock == UNLOCKED) - { - arg->lock = LOCKED; - return 1; - } - else if (arg->lock == LOCKED) - { - return 0; - } - else { - fprintf(stderr, "error: lock not initialized\n"); - exit(1); - } -} - -void omp_init_nest_lock(omp_nest_lock_t *arg) -{ - arg->owner = NOOWNER; - arg->count = 0; -} - -void omp_init_nest_lock_with_hint(omp_nest_lock_t *arg, - omp_lock_hint_t hint) -{ - omp_init_nest_lock(arg); -} - -void omp_destroy_nest_lock(omp_nest_lock_t *arg) -{ - arg->owner = NOOWNER; - arg->count = UNLOCKED; -} - -void omp_set_nest_lock(omp_nest_lock_t *arg) -{ - if (arg->owner == MASTER && arg->count >= 1) - { - arg->count++; - } - else if (arg->owner == NOOWNER && arg->count == 0) - { - arg->owner = MASTER; - arg->count = 1; - } - else - { - fprintf(stderr, "error: lock corrupted or not initialized\n"); - exit(1); - } -} - -void omp_unset_nest_lock(omp_nest_lock_t *arg) -{ - if (arg->owner == MASTER && arg->count >= 1) - { - arg->count--; - if (arg->count == 0) - { - arg->owner = NOOWNER; - } - } - else if (arg->owner 
== NOOWNER && arg->count == 0) - { - fprintf(stderr, "error: lock not set\n"); - exit(1); - } - else - { - fprintf(stderr, "error: lock corrupted or not initialized\n"); - exit(1); - } -} - -int omp_test_nest_lock(omp_nest_lock_t *arg) -{ - omp_set_nest_lock(arg); - return arg->count; -} - -double omp_get_wtime(void) -{ - /* This function does not provide a working - * wallclock timer. Replace it with a version - * customized for the target machine. - */ - return 0.0; -} - -double omp_get_wtick(void) -{ - /* This function does not provide a working - * clock tick function. Replace it with - * a version customized for the target machine. - */ - return 365. * 86400.; -} - -void * omp_target_alloc(size_t size, int device_num) -{ - if (device_num != -10) - return NULL; - return malloc(size); -} - -void omp_target_free(void *device_ptr, int device_num) -{ - free(device_ptr); -} - -int omp_target_is_present(void *ptr, int device_num) -{ - return 1; -} - -int omp_target_memcpy(void *dst, void *src, size_t length, - size_t dst_offset, size_t src_offset, - int dst_device, int src_device) -{ - // only the default device is valid in a stub - if (dst_device != -10 || src_device != -10 - || ! dst || ! 
src ) - return EINVAL; - memcpy((char *)dst + dst_offset, - (char *)src + src_offset, - length); - return 0; -} - -int omp_target_memcpy_rect(void *dst, void *src, - size_t element_size, - int num_dims, - const size_t *volume, - const size_t *dst_offsets, - const size_t *src_offsets, - const size_t *dst_dimensions, - const size_t *src_dimensions, - int dst_device_num, int src_device_num) -{ - int ret=0; - // Both null, return number of dimensions supported, - // this stub supports an arbitrary number - if (dst == NULL && src == NULL) return INT_MAX; - - if (!volume || !dst_offsets || !src_offsets - || !dst_dimensions || !src_dimensions - || num_dims < 1 ) { - ret = EINVAL; - goto done; - } - if (num_dims == 1) { - ret = omp_target_memcpy(dst, src, - element_size * volume[0], - dst_offsets[0] * element_size, - src_offsets[0] * element_size, - dst_device_num, src_device_num); - if(ret) goto done; - } else { - size_t dst_slice_size = element_size; - size_t src_slice_size = element_size; - for (int i=1; i < num_dims; i++) { - dst_slice_size *= dst_dimensions[i]; - src_slice_size *= src_dimensions[i]; - } - size_t dst_off = dst_offsets[0] * dst_slice_size; - size_t src_off = src_offsets[0] * src_slice_size; - for (size_t i=0; i < volume[0]; i++) { - ret = omp_target_memcpy_rect( - (char *)dst + dst_off + dst_slice_size*i, - (char *)src + src_off + src_slice_size*i, - element_size, - num_dims - 1, - volume + 1, - dst_offsets + 1, - src_offsets + 1, - dst_dimensions + 1, - src_dimensions + 1, - dst_device_num, - src_device_num); - if (ret) goto done; - } - } - done: - return ret; -} - -int omp_target_associate_ptr(void *host_ptr, void *device_ptr, - size_t size, size_t device_offset, - int device_num) -{ - // No association is possible because all host pointers - // are considered present - return EINVAL; -} - -int omp_target_disassociate_ptr(void *ptr, int device_num) -{ - return EINVAL; -} -#endif // !defined(_OPENMP) diff --git a/src/gsCore/gsOpenMP.h 
b/src/gsCore/gsOpenMP.h deleted file mode 100644 index cccedeb338..0000000000 --- a/src/gsCore/gsOpenMP.h +++ /dev/null @@ -1,201 +0,0 @@ -/** @file gsOpenMP.h - - @brief OpenMP stub routines to be used when omp.h is not available - - This file is part of the G+Smo library. - - This Source Code Form is subject to the terms of the Mozilla Public - License, v. 2.0. If a copy of the MPL was not distributed with this - file, You can obtain one at http://mozilla.org/MPL/2.0/. - - Author(s): M. Moller -*/ - -#pragma once - -#ifdef _OPENMP - -#include - -#else - -#include -#include -#include -#include -#include - -void omp_set_num_threads(int num_threads); - -int omp_get_num_threads(void); - -int omp_get_max_threads(void); - -int omp_get_thread_num(void); - -int omp_get_num_procs(void); - -int omp_in_parallel(void); - -void omp_set_dynamic(int dynamic_threads); - -int omp_get_dynamic(void); - -int omp_get_cancellation(void); - -void omp_set_nested(int nested); - -int omp_get_nested(void); - -typedef enum omp_sched_t { - omp_sched_static = 1, - omp_sched_dynamic = 2, - omp_sched_guided = 3, - omp_sched_auto = 4, - omp_sched_monotonic = 0x80000000 -} omp_sched_t; - -void omp_set_schedule(omp_sched_t kind, int chunk_size); - -void omp_get_schedule(omp_sched_t *kind, int *chunk_size); - -int omp_get_thread_limit(void); - -void omp_set_max_active_levels(int max_active_levels); - -int omp_get_max_active_levels(void); - -int omp_get_level(void); - -int omp_get_ancestor_thread_num(int level); - -int omp_get_team_size(int level); - -int omp_get_active_level(void); - -int omp_in_final(void); - -typedef enum omp_proc_bind_t { - omp_proc_bind_false = 0, - omp_proc_bind_true = 1, - omp_proc_bind_master = 2, - omp_proc_bind_close = 3, - omp_proc_bind_spread = 4 -} omp_proc_bind_t; - -omp_proc_bind_t omp_get_proc_bind(void); - -int omp_get_num_places(void); - -int omp_get_place_num_procs(int place_num); - -void omp_get_place_proc_ids(int place_num, int *ids); - -int 
omp_get_place_num(void); - -int omp_get_partition_num_places(void); - -void omp_get_partition_place_nums(int *place_nums); - -void omp_set_default_device(int device_num); - -int omp_get_default_device(void); - -int omp_get_num_devices(void); - -int omp_get_num_teams(void); - -int omp_get_team_num(void); - -int omp_is_initial_device(void); - -int omp_get_initial_device(void); - -int omp_get_max_task_priority(void); - -typedef struct omp_lock_t { - int lock; -} omp_lock_t; - -enum { UNLOCKED = -1, INIT, LOCKED }; - -void omp_init_lock(omp_lock_t *arg); - -typedef enum omp_sync_hint_t { - omp_sync_hint_none = 0, - omp_lock_hint_none = omp_sync_hint_none, - omp_sync_hint_uncontended = 1, - omp_lock_hint_uncontended = omp_sync_hint_uncontended, - omp_sync_hint_contended = (1<<1), - omp_lock_hint_contended = omp_sync_hint_contended, - omp_sync_hint_nonspeculative = (1<<2), - omp_lock_hint_nonspeculative = omp_sync_hint_nonspeculative, - omp_sync_hint_speculative = (1<<3), - omp_lock_hint_speculative = omp_sync_hint_speculative, - kmp_lock_hint_hle = (1<<16), - kmp_lock_hint_rtm = (1<<17), - kmp_lock_hint_adaptive = (1<<18) -} omp_sync_hint_t; - -typedef omp_sync_hint_t omp_lock_hint_t; - -void omp_init_lock_with_hint(omp_lock_t *arg, omp_lock_hint_t hint); - -void omp_destroy_lock(omp_lock_t *arg); - -void omp_set_lock(omp_lock_t *arg); - -void omp_unset_lock(omp_lock_t *arg); - -int omp_test_lock(omp_lock_t *arg); - -typedef struct omp_nest_lock_t { - int owner; - int count; -} omp_nest_lock_t; - -enum { NOOWNER = -1, MASTER = 0 }; - -void omp_init_nest_lock(omp_nest_lock_t *arg); - -void omp_init_nest_lock_with_hint(omp_nest_lock_t *arg, - omp_lock_hint_t hint); - -void omp_destroy_nest_lock(omp_nest_lock_t *arg); - -void omp_set_nest_lock(omp_nest_lock_t *arg); - -void omp_unset_nest_lock(omp_nest_lock_t *arg); - -int omp_test_nest_lock(omp_nest_lock_t *arg); - -double omp_get_wtime(void); - -double omp_get_wtick(void); - -void * omp_target_alloc(size_t size, int 
device_num); - -void omp_target_free(void *device_ptr, int device_num); - -int omp_target_is_present(void *ptr, int device_num); - -int omp_target_memcpy(void *dst, void *src, size_t length, - size_t dst_offset, size_t src_offset, - int dst_device, int src_device); - -int omp_target_memcpy_rect(void *dst, void *src, - size_t element_size, - int num_dims, - const size_t *volume, - const size_t *dst_offsets, - const size_t *src_offsets, - const size_t *dst_dimensions, - const size_t *src_dimensions, - int dst_device_num, int src_device_num); - -int omp_target_associate_ptr(void *host_ptr, void *device_ptr, - size_t size, size_t device_offset, - int device_num); - -int omp_target_disassociate_ptr(void *ptr, int device_num); -#endif // _OPENMP diff --git a/src/gsCore/gsSysInfo.cpp b/src/gsCore/gsSysInfo.cpp index 5bd2384459..dcdfc92939 100644 --- a/src/gsCore/gsSysInfo.cpp +++ b/src/gsCore/gsSysInfo.cpp @@ -12,6 +12,21 @@ */ #include +#include +#include + +#if defined(_WIN32) || defined(_WIN64) +# include +#elif __APPLE__ +# include +# include +#elif __linux__ +# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) ) +# include +# include +# endif +#elif __unix__ +#endif namespace gismo { diff --git a/src/gsCore/gsSysInfo.h b/src/gsCore/gsSysInfo.h index 68a33b6bd7..b2ac9366ab 100644 --- a/src/gsCore/gsSysInfo.h +++ b/src/gsCore/gsSysInfo.h @@ -13,21 +13,9 @@ #pragma once -#include -#include - -#if defined(_WIN32) || defined(_WIN64) -# include -#elif __APPLE__ -# include -# include -#elif __linux__ -# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) ) -# include -# include -# endif -#elif __unix__ -#endif +#include + +#include namespace gismo { diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index 45007d6a8a..251216e883 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -12,6 +12,7 @@ */ #include +#include namespace gismo { diff --git 
a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index dcf94cce0c..8f58d135c7 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -18,9 +18,6 @@ #include #include -#include -#include - namespace gismo { @@ -52,7 +49,7 @@ typedef std::array gsBenchmarkResult; /** * Benchmark class */ -class gsBenchmark +class GISMO_EXPORT gsBenchmark { public: /** diff --git a/src/gsIO/gsCmdLine.h b/src/gsIO/gsCmdLine.h index 051328b8df..a8cde98054 100644 --- a/src/gsIO/gsCmdLine.h +++ b/src/gsIO/gsCmdLine.h @@ -246,7 +246,7 @@ class GISMO_EXPORT gsCmdLine /// Prints the version information static void printVersion(); - + /// Returns the program's description (as specified in the constructor) std::string& getMessage(); diff --git a/src/gsMpi/gsBinaryFunctions.h b/src/gsParallel/gsBinaryFunctions.h similarity index 100% rename from src/gsMpi/gsBinaryFunctions.h rename to src/gsParallel/gsBinaryFunctions.h diff --git a/src/gsMpi/gsMpi.cpp b/src/gsParallel/gsMpi.cpp similarity index 90% rename from src/gsMpi/gsMpi.cpp rename to src/gsParallel/gsMpi.cpp index 3862926e50..f3d635e1ed 100644 --- a/src/gsMpi/gsMpi.cpp +++ b/src/gsParallel/gsMpi.cpp @@ -1,6 +1,6 @@ -#include +#include namespace gismo { diff --git a/src/gsMpi/gsMpi.h b/src/gsParallel/gsMpi.h similarity index 98% rename from src/gsMpi/gsMpi.h rename to src/gsParallel/gsMpi.h index 029fc74ec5..eb3d1173e3 100644 --- a/src/gsMpi/gsMpi.h +++ b/src/gsParallel/gsMpi.h @@ -25,11 +25,11 @@ // # warning "The MPI version is older than MPI-2." 
// # endif //#endif -#include -#include +#include +#include #endif -#include +#include namespace gismo { diff --git a/src/gsMpi/gsMpiComm.h b/src/gsParallel/gsMpiComm.h similarity index 100% rename from src/gsMpi/gsMpiComm.h rename to src/gsParallel/gsMpiComm.h diff --git a/src/gsMpi/gsMpiTraits.h b/src/gsParallel/gsMpiTraits.h similarity index 100% rename from src/gsMpi/gsMpiTraits.h rename to src/gsParallel/gsMpiTraits.h diff --git a/src/gsParallel/gsOpenMP.cpp b/src/gsParallel/gsOpenMP.cpp new file mode 100644 index 0000000000..ac8ff244aa --- /dev/null +++ b/src/gsParallel/gsOpenMP.cpp @@ -0,0 +1,440 @@ +/** @file gsOpenMP.cpp + + @brief Implementation of OpenMP stub routines to be used when libomp is not available + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. Moller +*/ + +#if !defined(_OPENMP) + +#include +#include + +#include +#include +#include +#include +#include + +void omp_set_num_threads(int num_threads) +{} + +int omp_get_num_threads(void) +{ + return 1; +} + +int omp_get_max_threads(void) +{ + return 1; +} + +int omp_get_thread_num(void) +{ + return 0; +} + +int omp_get_num_procs(void) +{ + return 1; +} + +int omp_in_parallel(void) +{ + return 0; +} + +void omp_set_dynamic(int dynamic_threads) +{} + +int omp_get_dynamic(void) +{ + return 0; +} + +int omp_get_cancellation(void) +{ + return 0; +} + +void omp_set_nested(int nested) +{} + +int omp_get_nested(void) +{ + return 0; +} + +void omp_set_schedule(omp_sched_t kind, int chunk_size) +{} + +void omp_get_schedule(omp_sched_t *kind, int *chunk_size) +{ + *kind = omp_sched_static; + *chunk_size = 0; +} + +int omp_get_thread_limit(void) +{ + return 1; +} + +void omp_set_max_active_levels(int max_active_levels) +{} + +int omp_get_max_active_levels(void) +{ + return 0; +} + +int 
omp_get_level(void) +{ + return 0; +} + +int omp_get_ancestor_thread_num(int level) +{ + return level == 0 ? 0 : -1; +} + +int omp_get_team_size(int level) +{ + return level == 0 ? 1 : -1; +} + +int omp_get_active_level(void) +{ + return 0; +} + +int omp_in_final(void) +{ + return 1; +} + +omp_proc_bind_t omp_get_proc_bind(void) +{ + return omp_proc_bind_false; +} + +int omp_get_num_places(void) +{ + return 0; +} + +int omp_get_place_num_procs(int place_num) +{ + return 0; +} + +void omp_get_place_proc_ids(int place_num, int *ids) +{} + +int omp_get_place_num(void) +{ + return -1; +} + +int omp_get_partition_num_places(void) +{ + return 0; +} + +void omp_get_partition_place_nums(int *place_nums) +{} + +void omp_set_default_device(int device_num) +{} + +int omp_get_default_device(void) +{ + return 0; +} + +int omp_get_num_devices(void) +{ + return 0; +} + +int omp_get_num_teams(void) +{ + return 1; +} + +int omp_get_team_num(void) +{ + return 0; +} + +int omp_is_initial_device(void) +{ + return 1; +} + +int omp_get_initial_device(void) +{ + return -10; +} + +int omp_get_max_task_priority(void) +{ + return 0; +} + +void omp_init_lock(omp_lock_t *arg) +{ + arg->lock = UNLOCKED; +} + +void omp_init_lock_with_hint(omp_lock_t *arg, omp_lock_hint_t hint) +{ + omp_init_lock(arg); +} + +void omp_destroy_lock(omp_lock_t *arg) +{ + arg->lock = INIT; +} + +void omp_set_lock(omp_lock_t *arg) +{ + if (arg->lock == UNLOCKED) + { + arg->lock = LOCKED; + } + else if (arg->lock == LOCKED) + { + fprintf(stderr, "error: deadlock in using lock variable\n"); + exit(1); + } + else + { + exit(1); + } +} + +void omp_unset_lock(omp_lock_t *arg) +{ + if (arg->lock == LOCKED) + { + arg->lock = UNLOCKED; + } + else if (arg->lock == UNLOCKED) + { + fprintf(stderr, "error: lock not set\n"); + exit(1); + } + else + { + fprintf(stderr, "error: lock not initialized\n"); + exit(1); + } +} + +int omp_test_lock(omp_lock_t *arg) +{ + if (arg->lock == UNLOCKED) + { + arg->lock = LOCKED; + return 1; + } 
+ else if (arg->lock == LOCKED) + { + return 0; + } + else { + fprintf(stderr, "error: lock not initialized\n"); + exit(1); + } +} + +void omp_init_nest_lock(omp_nest_lock_t *arg) +{ + arg->owner = NOOWNER; + arg->count = 0; +} + +void omp_init_nest_lock_with_hint(omp_nest_lock_t *arg, + omp_lock_hint_t hint) +{ + omp_init_nest_lock(arg); +} + +void omp_destroy_nest_lock(omp_nest_lock_t *arg) +{ + arg->owner = NOOWNER; + arg->count = UNLOCKED; +} + +void omp_set_nest_lock(omp_nest_lock_t *arg) +{ + if (arg->owner == MASTER && arg->count >= 1) + { + arg->count++; + } + else if (arg->owner == NOOWNER && arg->count == 0) + { + arg->owner = MASTER; + arg->count = 1; + } + else + { + fprintf(stderr, "error: lock corrupted or not initialized\n"); + exit(1); + } +} + +void omp_unset_nest_lock(omp_nest_lock_t *arg) +{ + if (arg->owner == MASTER && arg->count >= 1) + { + arg->count--; + if (arg->count == 0) + { + arg->owner = NOOWNER; + } + } + else if (arg->owner == NOOWNER && arg->count == 0) + { + fprintf(stderr, "error: lock not set\n"); + exit(1); + } + else + { + fprintf(stderr, "error: lock corrupted or not initialized\n"); + exit(1); + } +} + +int omp_test_nest_lock(omp_nest_lock_t *arg) +{ + omp_set_nest_lock(arg); + return arg->count; +} + +double omp_get_wtime(void) +{ + /* This function does not provide a working + * wallclock timer. Replace it with a version + * customized for the target machine. + */ + return 0.0; +} + +double omp_get_wtick(void) +{ + /* This function does not provide a working + * clock tick function. Replace it with + * a version customized for the target machine. + */ + return 365. 
* 86400.; +} + +void * omp_target_alloc(size_t size, int device_num) +{ + if (device_num != -10) + return NULL; + return malloc(size); +} + +void omp_target_free(void *device_ptr, int device_num) +{ + free(device_ptr); +} + +int omp_target_is_present(void *ptr, int device_num) +{ + return 1; +} + +int omp_target_memcpy(void *dst, void *src, size_t length, + size_t dst_offset, size_t src_offset, + int dst_device, int src_device) +{ + // only the default device is valid in a stub + if (dst_device != -10 || src_device != -10 + || ! dst || ! src ) + return EINVAL; + memcpy((char *)dst + dst_offset, + (char *)src + src_offset, + length); + return 0; +} + +int omp_target_memcpy_rect(void *dst, void *src, + size_t element_size, + int num_dims, + const size_t *volume, + const size_t *dst_offsets, + const size_t *src_offsets, + const size_t *dst_dimensions, + const size_t *src_dimensions, + int dst_device_num, int src_device_num) +{ + int ret=0; + // Both null, return number of dimensions supported, + // this stub supports an arbitrary number + if (dst == NULL && src == NULL) return INT_MAX; + + if (!volume || !dst_offsets || !src_offsets + || !dst_dimensions || !src_dimensions + || num_dims < 1 ) { + ret = EINVAL; + goto done; + } + if (num_dims == 1) { + ret = omp_target_memcpy(dst, src, + element_size * volume[0], + dst_offsets[0] * element_size, + src_offsets[0] * element_size, + dst_device_num, src_device_num); + if(ret) goto done; + } else { + size_t dst_slice_size = element_size; + size_t src_slice_size = element_size; + for (int i=1; i < num_dims; i++) { + dst_slice_size *= dst_dimensions[i]; + src_slice_size *= src_dimensions[i]; + } + size_t dst_off = dst_offsets[0] * dst_slice_size; + size_t src_off = src_offsets[0] * src_slice_size; + for (size_t i=0; i < volume[0]; i++) { + ret = omp_target_memcpy_rect( + (char *)dst + dst_off + dst_slice_size*i, + (char *)src + src_off + src_slice_size*i, + element_size, + num_dims - 1, + volume + 1, + dst_offsets + 1, + 
src_offsets + 1, + dst_dimensions + 1, + src_dimensions + 1, + dst_device_num, + src_device_num); + if (ret) goto done; + } + } +done: + return ret; +} + +int omp_target_associate_ptr(void *host_ptr, void *device_ptr, + size_t size, size_t device_offset, + int device_num) +{ + // No association is possible because all host pointers + // are considered present + return EINVAL; +} + +int omp_target_disassociate_ptr(void *ptr, int device_num) +{ + return EINVAL; +} +#endif // !defined(_OPENMP) diff --git a/src/gsParallel/gsOpenMP.h b/src/gsParallel/gsOpenMP.h new file mode 100644 index 0000000000..b584de760c --- /dev/null +++ b/src/gsParallel/gsOpenMP.h @@ -0,0 +1,197 @@ +/** @file gsOpenMP.h + + @brief OpenMP stub routines to be used when omp.h is not available + + This file is part of the G+Smo library. + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + + Author(s): M. 
Moller +*/ + +#pragma once + +#ifdef _OPENMP + +#include + +#else + +#include + +void GISMO_EXPORT omp_set_num_threads(int num_threads); + +int GISMO_EXPORT omp_get_num_threads(void); + +int GISMO_EXPORT omp_get_max_threads(void); + +int GISMO_EXPORT omp_get_thread_num(void); + +int GISMO_EXPORT omp_get_num_procs(void); + +int GISMO_EXPORT omp_in_parallel(void); + +void GISMO_EXPORT omp_set_dynamic(int dynamic_threads); + +int GISMO_EXPORT omp_get_dynamic(void); + +int GISMO_EXPORT omp_get_cancellation(void); + +void GISMO_EXPORT omp_set_nested(int nested); + +int GISMO_EXPORT omp_get_nested(void); + +typedef enum omp_sched_t { + omp_sched_static = 1, + omp_sched_dynamic = 2, + omp_sched_guided = 3, + omp_sched_auto = 4, + omp_sched_monotonic = 0x80000000 +} omp_sched_t; + +void GISMO_EXPORT omp_set_schedule(omp_sched_t kind, int chunk_size); + +void GISMO_EXPORT omp_get_schedule(omp_sched_t *kind, int *chunk_size); + +int GISMO_EXPORT omp_get_thread_limit(void); + +void GISMO_EXPORT omp_set_max_active_levels(int max_active_levels); + +int GISMO_EXPORT omp_get_max_active_levels(void); + +int GISMO_EXPORT omp_get_level(void); + +int GISMO_EXPORT omp_get_ancestor_thread_num(int level); + +int GISMO_EXPORT omp_get_team_size(int level); + +int GISMO_EXPORT omp_get_active_level(void); + +int GISMO_EXPORT omp_in_final(void); + +typedef enum omp_proc_bind_t { + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +} omp_proc_bind_t; + +omp_proc_bind_t omp_get_proc_bind(void); + +int GISMO_EXPORT omp_get_num_places(void); + +int GISMO_EXPORT omp_get_place_num_procs(int place_num); + +void GISMO_EXPORT omp_get_place_proc_ids(int place_num, int *ids); + +int GISMO_EXPORT omp_get_place_num(void); + +int GISMO_EXPORT omp_get_partition_num_places(void); + +void GISMO_EXPORT omp_get_partition_place_nums(int *place_nums); + +void GISMO_EXPORT omp_set_default_device(int device_num); + +int GISMO_EXPORT 
omp_get_default_device(void); + +int GISMO_EXPORT omp_get_num_devices(void); + +int GISMO_EXPORT omp_get_num_teams(void); + +int GISMO_EXPORT omp_get_team_num(void); + +int GISMO_EXPORT omp_is_initial_device(void); + +int GISMO_EXPORT omp_get_initial_device(void); + +int GISMO_EXPORT omp_get_max_task_priority(void); + +typedef struct omp_lock_t { + int lock; +} omp_lock_t; + +enum { UNLOCKED = -1, INIT, LOCKED }; + +void GISMO_EXPORT omp_init_lock(omp_lock_t *arg); + +typedef enum omp_sync_hint_t { + omp_sync_hint_none = 0, + omp_lock_hint_none = omp_sync_hint_none, + omp_sync_hint_uncontended = 1, + omp_lock_hint_uncontended = omp_sync_hint_uncontended, + omp_sync_hint_contended = (1<<1), + omp_lock_hint_contended = omp_sync_hint_contended, + omp_sync_hint_nonspeculative = (1<<2), + omp_lock_hint_nonspeculative = omp_sync_hint_nonspeculative, + omp_sync_hint_speculative = (1<<3), + omp_lock_hint_speculative = omp_sync_hint_speculative, + kmp_lock_hint_hle = (1<<16), + kmp_lock_hint_rtm = (1<<17), + kmp_lock_hint_adaptive = (1<<18) +} omp_sync_hint_t; + +typedef omp_sync_hint_t omp_lock_hint_t; + +void GISMO_EXPORT omp_init_lock_with_hint(omp_lock_t *arg, omp_lock_hint_t hint); + +void GISMO_EXPORT omp_destroy_lock(omp_lock_t *arg); + +void GISMO_EXPORT omp_set_lock(omp_lock_t *arg); + +void GISMO_EXPORT omp_unset_lock(omp_lock_t *arg); + +int GISMO_EXPORT omp_test_lock(omp_lock_t *arg); + +typedef struct omp_nest_lock_t { + int owner; + int count; +} omp_nest_lock_t; + +enum { NOOWNER = -1, MASTER = 0 }; + +void GISMO_EXPORT omp_init_nest_lock(omp_nest_lock_t *arg); + +void GISMO_EXPORT omp_init_nest_lock_with_hint(omp_nest_lock_t *arg, + omp_lock_hint_t hint); + +void GISMO_EXPORT omp_destroy_nest_lock(omp_nest_lock_t *arg); + +void GISMO_EXPORT omp_set_nest_lock(omp_nest_lock_t *arg); + +void GISMO_EXPORT omp_unset_nest_lock(omp_nest_lock_t *arg); + +int GISMO_EXPORT omp_test_nest_lock(omp_nest_lock_t *arg); + +double omp_get_wtime(void); + +double 
omp_get_wtick(void); + +void * omp_target_alloc(size_t size, int device_num); + +void GISMO_EXPORT omp_target_free(void *device_ptr, int device_num); + +int GISMO_EXPORT omp_target_is_present(void *ptr, int device_num); + +int GISMO_EXPORT omp_target_memcpy(void *dst, void *src, size_t length, + size_t dst_offset, size_t src_offset, + int dst_device, int src_device); + +int GISMO_EXPORT omp_target_memcpy_rect(void *dst, void *src, + size_t element_size, + int num_dims, + const size_t *volume, + const size_t *dst_offsets, + const size_t *src_offsets, + const size_t *dst_dimensions, + const size_t *src_dimensions, + int dst_device_num, int src_device_num); + +int GISMO_EXPORT omp_target_associate_ptr(void *host_ptr, void *device_ptr, + size_t size, size_t device_offset, + int device_num); + +int GISMO_EXPORT omp_target_disassociate_ptr(void *ptr, int device_num); +#endif // _OPENMP diff --git a/src/gsUtils/gsUtils.h b/src/gsUtils/gsUtils.h index 01647cc89b..0997a55369 100644 --- a/src/gsUtils/gsUtils.h +++ b/src/gsUtils/gsUtils.h @@ -19,7 +19,7 @@ #include #include #include -#include +#include #ifdef __GNUC__ #include From bee9785bde122eda909f74402ca2495e4dce7d16 Mon Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Sun, 5 Dec 2021 14:34:52 +0100 Subject: [PATCH 060/174] add possibility to read/write string in gsFileData --- .../src/IterativeLinearSolvers/IncompleteLUT.h | 7 +------ src/gsIO/gsFileData.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h index edc9e556e5..24d4612e21 100644 --- a/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +++ b/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h @@ -196,19 +196,14 @@ class IncompleteLUT : public SparseSolverBase m_P; // Fill-reducing permutation PermutationMatrix m_Pinv; // Inverse permutation }; diff --git a/src/gsIO/gsFileData.h 
b/src/gsIO/gsFileData.h index b9b77418a5..019c9725db 100644 --- a/src/gsIO/gsFileData.h +++ b/src/gsIO/gsFileData.h @@ -251,6 +251,22 @@ class gsFileData } } + /// Add a string to the Xml tree + void addString (const std::string & s) + { + gsXmlNode* node = internal::makeNode("string",s,*data); + data->appendToRoot(node); + } + + std::string getString () const + { + + gsXmlNode * node = getFirstNode("string"); + //node = getNextSibling(node, "string"); + std::string res( node->value() ); + return res; + } + /// Returns the size of the data size_t bufferSize() const { return m_buffer.size(); }; From b3ed1225665de63eca117da723998e6b7a3a8d40 Mon Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Sun, 5 Dec 2021 14:42:42 +0100 Subject: [PATCH 061/174] update syntax --- examples/heatEquation2_example.cpp | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/examples/heatEquation2_example.cpp b/examples/heatEquation2_example.cpp index a86ccab184..05133d17fa 100644 --- a/examples/heatEquation2_example.cpp +++ b/examples/heatEquation2_example.cpp @@ -87,11 +87,6 @@ int main(int argc, char *argv[]) gsInfo<<"Active options:\n"<< K.options() <<"\n"; gsInfo<<"Active options:\n"<< M.options() <<"\n"; - typedef gsExprAssembler<>::geometryMap geometryMap; - typedef gsExprAssembler<>::variable variable; - typedef gsExprAssembler<>::space space; - typedef gsExprAssembler<>::solution solution; - K.setIntegrationElements(bases); M.setIntegrationElements(bases); @@ -99,32 +94,28 @@ int main(int argc, char *argv[]) gsExprEvaluator<> evM(M); // Set the geometry map - geometryMap G_K = K.getMap(patches); - geometryMap G_M = M.getMap(patches); + auto G_K = K.getMap(patches); + auto G_M = M.getMap(patches); // Set the discretization space - space u_K = K.getSpace(bases); - space u_M = M.getSpace(bases); - // u_K.setInterfaceCont(0); - // u_M.setInterfaceCont(0); - // u_K.addBc( bcInfo.get("Dirichlet") ); - // u_M.addBc( bcInfo.get("Dirichlet") 
); + auto u_K = K.getSpace(bases); + auto u_M = M.getSpace(bases); u_K.setup(bcInfo, dirichlet::interpolation, 0); u_M.setup(bcInfo, dirichlet::interpolation, 0); // Set the source term - variable ff_K = K.getCoeff(f, G_K); - variable ff_M = M.getCoeff(f, G_M); + auto ff_K = K.getCoeff(f, G_K); + auto ff_M = M.getCoeff(f, G_M); - K.initSystem(false); - M.initSystem(false); + K.initSystem(); + M.initSystem(); K.assemble( igrad(u_K, G_K) * igrad(u_K, G_K).tr() * meas(G_K), u_K * ff_K * meas(G_K) ); M.assemble( u_M * u_M.tr() * meas(G_M), u_M * ff_M * meas(G_M) ); // Enforce Neumann conditions to right-hand side - variable g_Neumann = K.getBdrFunction(); - K.assembleRhsBc(u_K * g_Neumann.val() * nv(G_K).norm(), bcInfo.neumannSides() ); + auto g_Neumann = K.getBdrFunction(G_K); + K.assembleBdr(bcInfo.get("Neumann"), u_K * g_Neumann.val() * nv(G_K).norm() ); // A Conjugate Gradient linear solver with a diagonal (Jacobi) preconditionner gsSparseSolver<>::CGDiagonal solver; From 662db95a3d7574ca9b266689a62b7fb799a8fca9 Mon Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Sun, 5 Dec 2021 15:06:39 +0100 Subject: [PATCH 062/174] small fix --- src/gsIO/gsBenchmark.h | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index 8f58d135c7..f7482cfd32 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -35,11 +35,6 @@ enum metric { perf_tflop_sec, runtime_sec, }; - -/** - * Benchmark result - */ -typedef std::array gsBenchmarkResult; /** * Benchmark: driver function @@ -52,7 +47,13 @@ typedef std::array gsBenchmarkResult; class GISMO_EXPORT gsBenchmark { public: - /** + +/** + * Benchmark result + */ +typedef std::array Result; + +/** * Benchmark result set class */ class gsBenchmarkResultSet @@ -60,7 +61,7 @@ class GISMO_EXPORT gsBenchmark public: gsBenchmarkResultSet(const std::string& label, const std::string& title, - const std::vector& results) + const 
std::vector& results) : label(label), title(title), results(results) @@ -73,14 +74,14 @@ class GISMO_EXPORT gsBenchmark const std::string& get_title() const { return title; } - const std::vector& get() const + const std::vector& get() const { return results; } std::ostream &print(std::ostream &os) const; private: const std::string label, title; - std::vector results; + std::vector results; }; /** @@ -104,7 +105,7 @@ class GISMO_EXPORT gsBenchmark void add(const std::string& label, const std::string& title, - const std::vector& results) + const std::vector& results) { this->results.emplace_back(new gsBenchmarkResultSet(label+std::string(1,id++), title, results)); @@ -147,14 +148,14 @@ class GISMO_EXPORT gsBenchmark std::ostream &print(std::ostream &os) const; template - static std::vector + static std::vector run(const std::vector& nthreads, int nruns, T& benchmark, metric metric) { gsStopwatch stopwatch; std::size_t benchmark_result; double benchmark_metric, benchmark_runtime; - std::vector results; + std::vector results; try { for (auto it=nthreads.cbegin(); it!=nthreads.cend(); ++it) { @@ -190,13 +191,13 @@ class GISMO_EXPORT gsBenchmark } } - - results.push_back( - { static_cast(*it) /* number of OpenMP threads */, - benchmark_runtime/(double)nruns /* averaged elapsed time in seconds */, - benchmark_metric/(double)nruns /* averaged benchmark metric */, - (double)metric /* benchmark metric */ - }); + + Result res; + res[0]= static_cast(*it); // number of OpenMP threads + res[0]= benchmark_runtime/(double)nruns; // averaged elapsed time in seconds + res[0]= benchmark_metric/(double)nruns; // averaged benchmark metric + res[0]= (double)metric; // benchmark metric + results.push_back( give(res) ); } } catch(...) 
{} From 90c137e9884ab92a0f93b442d5235e612c484c01 Mon Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Sun, 5 Dec 2021 15:33:29 +0100 Subject: [PATCH 063/174] update cmake --- CMakeLists.txt | 7 +---- cmake/AddCXXCompileOptions.cmake | 49 +++++++++++++++++--------------- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a321f543a3..026f422108 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,12 +48,7 @@ if(NOT CMAKE_BUILD_TYPE) endif() endif() -if(NOT CMAKE_CXX_STANDARD) - # Set default C++ standard to 11 - set(CMAKE_CXX_STANDARD 11 CACHE STRING - "C++ standard (11 14 17 20)" FORCE) - set_property(CACHE CMAKE_CXX_STANDARD PROPERTY STRINGS "11" "14" "17" "20") -endif() +set(CMAKE_CXX_STANDARD_DEFAULT 11) set(gismo_VERSION_MAJOR 21) #year set(gismo_VERSION_MINOR 12) #month diff --git a/cmake/AddCXXCompileOptions.cmake b/cmake/AddCXXCompileOptions.cmake index 51c5aa21de..23d7f02e97 100644 --- a/cmake/AddCXXCompileOptions.cmake +++ b/cmake/AddCXXCompileOptions.cmake @@ -5,21 +5,10 @@ ## Authors: M. Moeller and A. Mantzaflaris ###################################################################### -# We strongly recommend to use an up-to-date cmake version which -# provides support for the most recent compiler version. We provide a -# subset of compiler options copied from cmake 3.17.5. -# -# The options below are only used if -# CMAKE_CXXvv_STANDARD_COMPILE_OPTIONS and -# CMAKE_CXXvv_EXTENSION_COMPILE_OPTIONS are not yet set by the regular -# cmake routines, where vv is the value of CMAKE_CXX_STANDARD. - -if(NOT CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION OR - NOT CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION) - - message(WARNING "Update your CMake installation! 
We fall back to - compiler options back ported from CMake 3.17.5") - +set(CMAKE_CXX_STANDARD_DEFAULT 11) + +if (CMAKE_VERSION VERSION_LESS "3.1") + if("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xAppleClang") # AppleClang @@ -478,15 +467,29 @@ if(NOT CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION OR endif() endif() - else() + endif() - message(FATAL_ERROR "Unsupported compiler ${CMAKE_CXX_COMPILER_ID}") - - endif() - +endif() # cmake 3.1 + +if (NOT DEFINED CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_DEFAULT} CACHE INTERNAL "") endif() -if (NOT CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION OR - NOT CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION) - message(FATAL_ERROR "Unsupported C++ standard") +# Apply for Cmake less than 3.1 +if (CMAKE_VERSION VERSION_LESS "3.1") + + if ( NOT "x${CMAKE_CXX_STANDARD}" STREQUAL "x98" AND + ${CMAKE_CXX_STANDARD_DEFAULT} LESS ${CMAKE_CXX_STANDARD}) + message(STATUS "The compiler ${CMAKE_CXX_COMPILER} supports at most C++${CMAKE_CXX_STANDARD_DEFAULT}, CXX_STANDARD choice is changed.") + set(CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_DEFAULT} CACHE INTERNAL "") + endif() + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION}") +endif()#cmake<3.1 + + +# Bugfix for windows/msvc systems +if(NOT DEFINED CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION) + set(CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION "") endif() From ec9ce9f818a0406740db1fccf5cea0214d7f7368 Mon Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Sun, 5 Dec 2021 15:38:10 +0100 Subject: [PATCH 064/174] small fix --- .circleci/config.yml | 2 +- src/gsIO/gsBenchmark.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 785f029be4..ca43098248 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,4 +80,4 @@ workflows: - 
macos_x86_64_xcode10_cxx98_release - macos_x86_64_xcode11_cxx11_release - macos_x86_64_xcode12_cxx14_release - - macos_x86_64_xcode13_cxx17_release \ No newline at end of file + - macos_x86_64_xcode13_cxx17_release diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index f7482cfd32..609d1763bb 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -194,9 +194,9 @@ typedef std::array Result; Result res; res[0]= static_cast(*it); // number of OpenMP threads - res[0]= benchmark_runtime/(double)nruns; // averaged elapsed time in seconds - res[0]= benchmark_metric/(double)nruns; // averaged benchmark metric - res[0]= (double)metric; // benchmark metric + res[1]= benchmark_runtime/(double)nruns; // averaged elapsed time in seconds + res[2]= benchmark_metric/(double)nruns; // averaged benchmark metric + res[3]= (double)metric; // benchmark metric results.push_back( give(res) ); } } catch(...) {} From 8e39b85dcabf63878df8c4b71d8eca007636fed5 Mon Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Sun, 5 Dec 2021 18:18:18 +0100 Subject: [PATCH 065/174] fixes --- .github/workflows/gismo.yml | 2 +- cmake/AddCXXCompileOptions.cmake | 4 ++-- cmake/gsOptions.cmake | 6 +++--- src/gsIO/gsBenchmark.h | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/gismo.yml b/.github/workflows/gismo.yml index 29bfda99b3..718e8b7a3b 100644 --- a/.github/workflows/gismo.yml +++ b/.github/workflows/gismo.yml @@ -36,4 +36,4 @@ jobs: # Note the current convention is to use the -S and -B options here to specify source # and build directories, but this is only available with CMake 3.13 and higher. 
# The CMake binaries on the Github Actions machines are (as of this writing) 3.12 - run: ctest -S gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="actions_$GITHUB_RUN_NUMBER" -D CTEST_SITE="${{ matrix.os }}_[actions]" -D CMAKE_ARGS="-DCMAKE_BUILD_TYPE=$BUILD_TYPE;-DCMAKE_CXX_STANDARD=11;-DGISMO_EXTRA_DEBUG=ON;-DGISMO_WITH_ONURBS=ON;-DGISMO_BUILD_UNITTESTS=ON" -Q + run: ctest -S gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="actions_$GITHUB_RUN_NUMBER" -D CTEST_SITE="${{ matrix.os }}_[actions]" -D CMAKE_ARGS="-DCMAKE_BUILD_TYPE=$BUILD_TYPE;-DCMAKE_CXX_STANDARD=14;-DGISMO_EXTRA_DEBUG=ON;-DGISMO_WITH_ONURBS=ON;-DGISMO_BUILD_UNITTESTS=ON" -Q diff --git a/cmake/AddCXXCompileOptions.cmake b/cmake/AddCXXCompileOptions.cmake index 23d7f02e97..ff1ba44377 100644 --- a/cmake/AddCXXCompileOptions.cmake +++ b/cmake/AddCXXCompileOptions.cmake @@ -7,7 +7,7 @@ set(CMAKE_CXX_STANDARD_DEFAULT 11) -if (CMAKE_VERSION VERSION_LESS "3.1") +#if (CMAKE_VERSION VERSION_LESS "3.1") if("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xAppleClang") @@ -469,7 +469,7 @@ if (CMAKE_VERSION VERSION_LESS "3.1") endif() -endif() # cmake 3.1 +#endif() # cmake 3.1 if (NOT DEFINED CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_DEFAULT} CACHE INTERNAL "") diff --git a/cmake/gsOptions.cmake b/cmake/gsOptions.cmake index b492451bea..752616a0ea 100644 --- a/cmake/gsOptions.cmake +++ b/cmake/gsOptions.cmake @@ -19,11 +19,11 @@ if(EXISTS "${CMAKE_SOURCE_DIR}/.git") endif() endif() message (" CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}") -message (" CMAKE_C_COMPILER ${CMAKE_C_COMPILER}") -message (" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}") +#message (" CMAKE_C_COMPILER ${CMAKE_C_COMPILER}") +#message (" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}") message (" CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER}") message (" CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD}") -message (" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") +#message (" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") message (" GISMO_COEFF_TYPE ${GISMO_COEFF_TYPE}") message (" 
GISMO_INDEX_TYPE ${GISMO_INDEX_TYPE}") diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index 609d1763bb..d3a945f328 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -209,7 +209,7 @@ typedef std::array Result; }; /// Print (as string) operator -std::ostream &operator<<(std::ostream &os, const gsBenchmark& obj) +inline std::ostream &operator<<(std::ostream &os, const gsBenchmark& obj) { return obj.print(os); } } // namespace gismo From 4e47ad91fa1f5cc6761271ea6641da1102cdec9a Mon Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Sun, 5 Dec 2021 20:12:38 +0100 Subject: [PATCH 066/174] fix --- src/gsCore/gsForwardDeclarations.h | 1 + src/gsIO/gsBenchmark.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gsCore/gsForwardDeclarations.h b/src/gsCore/gsForwardDeclarations.h index 1631344bca..2358800e21 100644 --- a/src/gsCore/gsForwardDeclarations.h +++ b/src/gsCore/gsForwardDeclarations.h @@ -15,6 +15,7 @@ // STD includes #include +#include #include #include #include diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index 251216e883..b0ea152c44 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -23,7 +23,7 @@ namespace gismo << "threads & " << label << " \\\\\n"; for (auto it=results.cbegin(); it!=results.cend(); ++it) - os << (*it)[0] << "&" << (*it)[2] << "\\\\\n"; + os << it->at(0) << "&" << it->at(2) << "\\\\\n"; os << "}\\data" << label << "\n"; From bfcfcc9e016d1da318731a2d41a929d439f1e4ca Mon Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Sun, 5 Dec 2021 22:04:27 +0100 Subject: [PATCH 067/174] fixes --- examples/gsExpressions_test.cpp | 3 +-- examples/performance_benchmark.cpp | 2 +- src/gsIO/gsBenchmark.cpp | 22 +++++++++++++--------- src/gsIO/gsBenchmark.h | 2 -- src/gsParallel/gsMpiComm.h | 8 ++++---- 5 files changed, 19 insertions(+), 18 deletions(-) diff --git a/examples/gsExpressions_test.cpp b/examples/gsExpressions_test.cpp index e357748778..250811759b 
100644 --- a/examples/gsExpressions_test.cpp +++ b/examples/gsExpressions_test.cpp @@ -82,7 +82,6 @@ int main(int argc, char *argv[]) [V] symmetrize_expr */ -# define M_PI 3.14159265358979323846 # define M_R 1.0 bool verbose = false; @@ -297,7 +296,7 @@ int main(int argc, char *argv[]) /// NOTE: Tolerance is lower! gsInfo<< "* Area (integral):\t"; real_t num = ev.integral( meas(G) ); - real_t ref = 4*M_PI*M_R*M_R; + real_t ref = 4*EIGEN_PI*M_R*M_R; if (verbose) gsInfo <<"Result:\n"< -#include #include #include @@ -132,6 +131,7 @@ class benchmark_c_array_axpy { delete[] m_x; delete[] m_y; + delete[] m_z; } std::size_t operator()() diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index b0ea152c44..af9ea861c1 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -12,6 +12,8 @@ */ #include +#include +#include #include namespace gismo @@ -40,18 +42,20 @@ namespace gismo << "name=MyAxis,\n" << "width=\\textwidth,\n" << "height=.5\\textwidth,\n" - << "legend pos=outer north east,\n" - + << "legend pos=outer north east,\n" << "symbolic x coords={"; - - for (auto it=(*results.cbegin())->get().cbegin(); - it!=(*results.cbegin())->get().cend(); ++it) - os << (*it)[0] << (it!=(*results.cbegin())->get().cend()-1 ? "," : ""); + + //std::vector >::const_iterator + auto it = results.front()->get().cbegin(); + auto ite = results.front()->get().cend(); + for (;it!=ite; ++it) + os << it->at(0) << (it!=ite-1 ? 
"," : ""); os << "},\n" - << "xlabel={OpenMP threads},\n"; - - switch((metric)(*(*results.cbegin())->get().cbegin())[4]) { + + it = results.front()->get().cbegin(); + switch( (metric)it->at(3) ) + { case metric::bandwidth_kb_sec: os << "ylabel={Bandwidth in KB/s},\n"; break; diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index d3a945f328..eedd55c66e 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -14,8 +14,6 @@ #pragma once #include -#include -#include #include namespace gismo diff --git a/src/gsParallel/gsMpiComm.h b/src/gsParallel/gsMpiComm.h index bc63bef716..1c2dee0665 100644 --- a/src/gsParallel/gsMpiComm.h +++ b/src/gsParallel/gsMpiComm.h @@ -389,7 +389,7 @@ class gsSerialComm operator< */ template - static T min (T& in) + static T (min) (T& in) { return in; } @@ -399,7 +399,7 @@ class gsSerialComm in every process. Assumes that T has an operator< */ template - static int min (T* inout, int len) + static int (min) (T* inout, int len) { return 0; } @@ -409,7 +409,7 @@ class gsSerialComm operator< */ template - static T max (T& in) + static T (max) (T& in) { return in; } @@ -419,7 +419,7 @@ class gsSerialComm process. 
Assumes that T has an operator< */ template - static int max (T* inout, int len) + static int (max) (T* inout, int len) { return 0; } From f46c45817e9b5f348121e73977e8f4d115cf017f Mon Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Mon, 6 Dec 2021 07:51:19 +0100 Subject: [PATCH 068/174] some fixes --- src/gsAssembler/gsAdaptiveMeshing.hpp | 25 +++++++++---------------- src/gsIO/.#gsOptionList.h | 1 - src/gsIO/gsBenchmark.h | 24 ++++++++++++------------ src/gsIO/gsCmdLine.h | 11 +++++------ src/gsIO/gsOptionList.cpp | 19 ++++++++++++++++++- src/gsIO/gsOptionList.h | 23 ----------------------- 6 files changed, 44 insertions(+), 59 deletions(-) delete mode 120000 src/gsIO/.#gsOptionList.h diff --git a/src/gsAssembler/gsAdaptiveMeshing.hpp b/src/gsAssembler/gsAdaptiveMeshing.hpp index 1203c76f34..e84d9a3068 100644 --- a/src/gsAssembler/gsAdaptiveMeshing.hpp +++ b/src/gsAssembler/gsAdaptiveMeshing.hpp @@ -430,14 +430,12 @@ void gsAdaptiveMeshing::_refineMarkedElements( gsFunctionSet * input, } } - gsMultiPatch * mp; - gsMultiBasis * mb; - if ((mp = dynamic_cast*>(input))) + if (gsMultiPatch * mp = dynamic_cast*>(input) ) { std::vector elements = mp->patch(pn).basis().asElements(refBoxes, refExtension); mp->patch(pn).refineElements( elements ); } - else if ((mb = dynamic_cast*>(input))) + else if (gsMultiBasis * mb = dynamic_cast*>(input) ) { mb->refine( pn, refBoxes, refExtension ); } @@ -497,15 +495,13 @@ void gsAdaptiveMeshing::_unrefineMarkedElements( gsFunctionSet * input, } } - gsMultiPatch * mp; - gsMultiBasis * mb; - if ((mp = dynamic_cast*>(input))) + if (gsMultiPatch * mp = dynamic_cast*>(input)) { // Refine all of the found refBoxes in this patch std::vector elements = mp->patch(pn).basis().asElementsUnrefine(refBoxes, extension); mp->patch(pn).unrefineElements( elements ); } - else if ((mb = dynamic_cast*>(input))) + else if (gsMultiBasis * mb = dynamic_cast*>(input) ) { // Refine all of the found refBoxes in this patch mb->unrefine( pn, refBoxes, 
extension ); @@ -584,9 +580,7 @@ void gsAdaptiveMeshing::_processMarkedElements(gsFunctionSet * input, globalCount++; } - gsMultiPatch * mp; - gsMultiBasis * mb; - if ((mp = dynamic_cast*>(input))) + if (gsMultiPatch * mp = dynamic_cast*>(input)) { std::vector elements; // Unrefine all of the found refBoxes in this patch @@ -597,7 +591,7 @@ void gsAdaptiveMeshing::_processMarkedElements(gsFunctionSet * input, elements = mp->patch(pn).basis().asElements(refBoxes, refExtension); mp->patch(pn).refineElements( elements ); } - else if ((mb = dynamic_cast*>(input))) + else if (gsMultiBasis * mb = dynamic_cast*>(input) ) { // Refine all of the found refBoxes in this patch mb->unrefine( pn, crsBoxes, crsExtension); @@ -693,13 +687,12 @@ void gsAdaptiveMeshing::_flattenElements( gsFunctionSet * input, // } // gsDebug<<"\n"; } - gsMultiPatch * mp; - gsMultiBasis * mb; - if ((mp = dynamic_cast*>(input))) + + if (gsMultiPatch * mp = dynamic_cast*>(input) ) { mp->patch(pn).unrefineElements( elements ); } - else if ((mb = dynamic_cast*>(input))) + else if (gsMultiBasis * mb = dynamic_cast*>(input) ) { // Refine all of the found refBoxes in this patch mb->unrefineElements(pn, elements ); diff --git a/src/gsIO/.#gsOptionList.h b/src/gsIO/.#gsOptionList.h deleted file mode 120000 index 5dc4fea681..0000000000 --- a/src/gsIO/.#gsOptionList.h +++ /dev/null @@ -1 +0,0 @@ -amantzaf@tarragon.2106416:1634801095 \ No newline at end of file diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index eedd55c66e..7b5a895faa 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -88,11 +88,11 @@ typedef std::array Result; class gsBenchmarkSet { public: - gsBenchmarkSet(const std::string& label, - const std::string& title) + gsBenchmarkSet(const std::string& _label, + const std::string& _title) : id('A'), - label(label), - title(title) + label(_label), + title(_title) {} ~gsBenchmarkSet() @@ -101,12 +101,12 @@ typedef std::array Result; delete (*it); } - void add(const 
std::string& label, - const std::string& title, - const std::vector& results) + void add(const std::string& _label, + const std::string& _title, + const std::vector& _results) { - this->results.emplace_back(new gsBenchmarkResultSet(label+std::string(1,id++), - title, results)); + this->results.emplace_back(new gsBenchmarkResultSet(_label+std::string(1,id++), + _title, _results)); } const std::string& get_label() const @@ -133,10 +133,10 @@ typedef std::array Result; delete (*it); } - gsBenchmarkSet* add(const std::string& label, - const std::string& title) + gsBenchmarkSet* add(const std::string& _label, + const std::string& _title) { - benchmarks.emplace_back(new gsBenchmarkSet(label, title)); + benchmarks.emplace_back(new gsBenchmarkSet(_label, _title)); return benchmarks.back(); } diff --git a/src/gsIO/gsCmdLine.h b/src/gsIO/gsCmdLine.h index b6e972b457..a89417af6e 100644 --- a/src/gsIO/gsCmdLine.h +++ b/src/gsIO/gsCmdLine.h @@ -315,15 +315,14 @@ class GISMO_EXPORT gsXml private: gsXml(); public: - GSXML_COMMON_FUNCTIONS(gsOptionList) - GSXML_GET_POINTER(gsOptionList) + GSXML_COMMON_FUNCTIONS(gsCmdLine) static std::string tag () { return "OptionList"; } - static std::string type() { return ""; } + static gsCmdLine * get (gsXmlNode * node) {GISMO_ERROR("no get");} - static void get_into(gsXmlNode * node, gsOptionList & result) + static void get_into(gsXmlNode * node, gsCmdLine & result) { gsXml::get_into(node,result); } - static gsXmlNode * put (const gsOptionList & obj, gsXmlTree & data) - { return gsXml::put(obj,data); } + static gsXmlNode * put (const gsCmdLine & obj, gsXmlTree & data) + { return gsXml::put(obj,data); } }; } diff --git a/src/gsIO/gsOptionList.cpp b/src/gsIO/gsOptionList.cpp index 8a6cbb4df7..ce1a089cfe 100644 --- a/src/gsIO/gsOptionList.cpp +++ b/src/gsIO/gsOptionList.cpp @@ -505,6 +505,24 @@ bool gsOptionList::isSwitch(const std::string & label) const namespace internal { +/** \brief Read OptionList from XML data + \ingroup IO +*/ +template<> 
+class GISMO_EXPORT gsXml +{ +private: + gsXml(); +public: + GSXML_COMMON_FUNCTIONS(gsOptionList) + GSXML_GET_POINTER(gsOptionList) + static std::string tag () { return "OptionList"; } + static std::string type() { return ""; } + + static void get_into(gsXmlNode * node, gsOptionList & result); + static gsXmlNode * put (const gsOptionList & obj, gsXmlTree & data); +}; + void gsXml::get_into(gsXmlNode * node, gsOptionList & result) { // get all child-nodes @@ -629,7 +647,6 @@ gsXml::put (const gsOptionList & obj, gsXmlTree & data) return optionList; } - } // namespace internal #ifdef GISMO_BUILD_PYBIND11 diff --git a/src/gsIO/gsOptionList.h b/src/gsIO/gsOptionList.h index beb62a6faf..4f8095768c 100644 --- a/src/gsIO/gsOptionList.h +++ b/src/gsIO/gsOptionList.h @@ -284,29 +284,6 @@ inline bool operator< ( const gsOptionList::OptionListEntry& a, const gsOptionLi { return a.label < b.label; } -namespace internal -{ - -/** \brief Read OptionList from XML data - \ingroup IO -*/ -template<> -class GISMO_EXPORT gsXml -{ -private: - gsXml(); -public: - GSXML_COMMON_FUNCTIONS(gsOptionList) - GSXML_GET_POINTER(gsOptionList) - static std::string tag () { return "OptionList"; } - static std::string type() { return ""; } - - static void get_into(gsXmlNode * node, gsOptionList & result); - static gsXmlNode * put (const gsOptionList & obj, gsXmlTree & data); -}; - -} - #ifdef GISMO_BUILD_PYBIND11 /** From f0c90a5b50a129f7bf5d82f36b7c05d71dd3d14b Mon Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Mon, 6 Dec 2021 08:22:43 +0100 Subject: [PATCH 069/174] fix --- src/gsIO/gsOptionList.cpp | 18 ------------------ src/gsIO/gsOptionList.h | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/gsIO/gsOptionList.cpp b/src/gsIO/gsOptionList.cpp index ce1a089cfe..1282da95f8 100644 --- a/src/gsIO/gsOptionList.cpp +++ b/src/gsIO/gsOptionList.cpp @@ -505,24 +505,6 @@ bool gsOptionList::isSwitch(const std::string & label) const namespace internal { 
-/** \brief Read OptionList from XML data - \ingroup IO -*/ -template<> -class GISMO_EXPORT gsXml -{ -private: - gsXml(); -public: - GSXML_COMMON_FUNCTIONS(gsOptionList) - GSXML_GET_POINTER(gsOptionList) - static std::string tag () { return "OptionList"; } - static std::string type() { return ""; } - - static void get_into(gsXmlNode * node, gsOptionList & result); - static gsXmlNode * put (const gsOptionList & obj, gsXmlTree & data); -}; - void gsXml::get_into(gsXmlNode * node, gsOptionList & result) { // get all child-nodes diff --git a/src/gsIO/gsOptionList.h b/src/gsIO/gsOptionList.h index 4f8095768c..d42a30b81f 100644 --- a/src/gsIO/gsOptionList.h +++ b/src/gsIO/gsOptionList.h @@ -283,6 +283,26 @@ inline std::ostream &operator<<(std::ostream &os, const gsOptionList::OptionList inline bool operator< ( const gsOptionList::OptionListEntry& a, const gsOptionList::OptionListEntry& b ) { return a.label < b.label; } +namespace internal +{ +/** \brief Read OptionList from XML data + \ingroup IO +*/ +template<> +class GISMO_EXPORT gsXml +{ +private: + gsXml(); +public: + GSXML_COMMON_FUNCTIONS(gsOptionList) + GSXML_GET_POINTER(gsOptionList) + static std::string tag () { return "OptionList"; } + static std::string type() { return ""; } + + static void get_into(gsXmlNode * node, gsOptionList & result); + static gsXmlNode * put (const gsOptionList & obj, gsXmlTree & data); +}; +} #ifdef GISMO_BUILD_PYBIND11 From b1d086b1d455d9de79d209282b7d1b8f7194d9b5 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Mon, 6 Dec 2021 09:28:37 +0100 Subject: [PATCH 070/174] small fix --- src/gsIO/gsBenchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index af9ea861c1..15dfbcbac3 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -119,7 +119,7 @@ namespace gismo << "CPU " << gsSysInfo::getCpuInfo() << ", " << "Memory " << gsSysInfo::getMemoryInfo() << ", "; - gsJITCompilerConfig jit; 
jit.load("config/jit.xml"); + gsJITCompilerConfig jit; jit.load(GISMO_CONFIG_DIR "jit.xml"); std::string flags = jit.getFlags(); os << "Compiler flags "; From 94148c97c34250c375bda4f401e7ca01a0297e56 Mon Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Mon, 6 Dec 2021 11:42:12 +0100 Subject: [PATCH 071/174] small fix --- src/gsIO/gsBenchmark.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index 15dfbcbac3..c17d5409b9 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -54,7 +54,7 @@ namespace gismo << "xlabel={OpenMP threads},\n"; it = results.front()->get().cbegin(); - switch( (metric)it->at(3) ) + switch( (int)it->at(3) ) { case metric::bandwidth_kb_sec: os << "ylabel={Bandwidth in KB/s},\n"; @@ -90,16 +90,16 @@ namespace gismo os << "title={" << title << "},\n" << "]"; - for (auto it=results.cbegin(); it!=results.cend(); ++it) + for (auto rit=results.cbegin(); rit!=results.cend(); ++rit) os << "\\addplot table[x=threads,y=" - << (*it)->get_label() + << (*rit)->get_label() << "]{\\data" - << (*it)->get_label() + << (*rit)->get_label() << "};\n"; os << "\\legend{"; - for (auto it=results.cbegin(); it!=results.cend(); ++it) - os << (*it)->get_title() << (it!=results.cend()-1 ? "," : ""); + for (auto rit=results.cbegin(); rit!=results.cend(); ++rit) + os << (*rit)->get_title() << (rit!=results.cend()-1 ? 
"," : ""); os << "}\n" << "\\end{semilogyaxis}\n" From e944623cfba15db8f885b5a58d08ea76e14ff6dd Mon Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Mon, 6 Dec 2021 13:16:31 +0100 Subject: [PATCH 072/174] remove 98 build --- .circleci/config.yml | 19 ------------------- cmake/AddCXXCompileOptions.cmake | 10 ++++------ 2 files changed, 4 insertions(+), 25 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ca43098248..46d61f3020 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,24 +1,6 @@ version: 2.0 jobs: - macos_x86_64_xcode10_cxx98_release: - macos: - xcode: "10.3.0" - working_directory: ~/gismo - environment: - MAKEJOBS: 4 - steps: - - run: - name: Install dependencies - command: git -C /usr/local/Homebrew/Library/Taps/homebrew/homebrew-core fetch --unshallow; git -C /usr/local/Homebrew/Library/Taps/homebrew/homebrew-cask fetch --unshallow; brew update; brew install cmake - - checkout - - run: - name: Configure G+Smo on MacOS - command: cmake . 
-DCMAKE_QUIET=ON -DCMAKE_QUIET=ON -DBUILDNAME="macos_x86_64_xcode10_cxx98_release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=98 -DGISMO_WITH_ONURBS=ON - - run: - name: Build and test G+Smo on MacOS - command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS - macos_x86_64_xcode11_cxx11_release: macos: xcode: "11.7.0" @@ -77,7 +59,6 @@ workflows: version: 2 build: jobs: - - macos_x86_64_xcode10_cxx98_release - macos_x86_64_xcode11_cxx11_release - macos_x86_64_xcode12_cxx14_release - macos_x86_64_xcode13_cxx17_release diff --git a/cmake/AddCXXCompileOptions.cmake b/cmake/AddCXXCompileOptions.cmake index ff1ba44377..ae21172478 100644 --- a/cmake/AddCXXCompileOptions.cmake +++ b/cmake/AddCXXCompileOptions.cmake @@ -49,13 +49,11 @@ set(CMAKE_CXX_STANDARD_DEFAULT 11) set(CMAKE_CXX20_STANDARD_COMPILE_OPTION "-std=c++2a") set(CMAKE_CXX20_EXTENSION_COMPILE_OPTION "-std=gnu++2a") endif() - - elseif("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xARMClang" OR - "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xClang" OR - "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xFujitsuClang") - + + elseif("x${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang$") + # ARMClang/Clang - + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 2.1) set(CMAKE_CXX98_STANDARD_COMPILE_OPTION "-std=c++98") set(CMAKE_CXX98_EXTENSION_COMPILE_OPTION "-std=gnu++98") From b0cd373da0bc2e970f4db7dace96ca0d2a6a2109 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Mon, 6 Dec 2021 13:22:08 +0100 Subject: [PATCH 073/174] small fixes --- src/gsCore/gsSysInfo.cpp | 33 ++++++++++++--------------------- src/gsCore/gsSysInfo.h | 17 +++++++++++++++++ 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/src/gsCore/gsSysInfo.cpp b/src/gsCore/gsSysInfo.cpp index dcdfc92939..5f663f7918 100644 --- a/src/gsCore/gsSysInfo.cpp +++ b/src/gsCore/gsSysInfo.cpp @@ -12,21 +12,6 @@ */ #include -#include -#include - -#if defined(_WIN32) || 
defined(_WIN64) -# include -#elif __APPLE__ -# include -# include -#elif __linux__ -# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) ) -# include -# include -# endif -#elif __unix__ -#endif namespace gismo { @@ -542,7 +527,7 @@ namespace gismo } #elif __linux__ -# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) ) +# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__SUNCC_PRO)) char CPUBrandString[0x40]; unsigned int CPUInfo[4] = {0,0,0,0}; @@ -565,7 +550,14 @@ namespace gismo } return CPUBrandString; - + +# else + + char hostname[HOST_NAME_MAX + 1]; + gethostname(hostname, HOST_NAME_MAX + 1); + + return "Unknown-CPU ["+hostname+"]"; + # endif #elif __unix__ #endif @@ -589,16 +581,15 @@ namespace gismo } #elif __linux__ -# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) ) - + long pages = sysconf(_SC_PHYS_PAGES); long page_size = sysconf(_SC_PAGE_SIZE); return util::to_string(pages * page_size / 1024 / 1024)+" MB"; - + # endif #elif __unix__ #endif - + return "Unknown-Memory"; } diff --git a/src/gsCore/gsSysInfo.h b/src/gsCore/gsSysInfo.h index b2ac9366ab..1eff548c3b 100644 --- a/src/gsCore/gsSysInfo.h +++ b/src/gsCore/gsSysInfo.h @@ -14,9 +14,26 @@ #pragma once #include +#include +#include #include +#if defined(_WIN32) || defined(_WIN64) +# include +#elif __APPLE__ +# include +# include +#elif __linux__ +# include +# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) || define(__SUNCC_PRO)) +# include +# else +# include +# endif +#elif __unix__ +#endif + namespace gismo { From bb037f440fa1ec5129e6c519cae8f503f5f2388b Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Mon, 6 Dec 2021 13:29:48 +0100 Subject: [PATCH 074/174] small fixes --- examples/performance_benchmark.cpp | 16 ++++++++-------- 
src/gsCore/gsSysInfo.cpp | 1 - src/gsCore/gsSysInfo.h | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index f36903821a..5508b459df 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -377,7 +377,7 @@ int main(int argc, char *argv[]) { gsInfo << "=== Native C array memcopy\n"; for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { - gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." : "\n") << std::flush; + gsInfo << (*it) << (it!=vsizes.cend()-1 ? "..." : "\n") << std::flush; try { benchmark_c_array_memcopy benchmark(*it); auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); @@ -391,7 +391,7 @@ int main(int argc, char *argv[]) { gsInfo << "=== gsVector memcopy\n"; for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { - gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." : "\n") << std::flush; + gsInfo << (*it) << (it!=vsizes.cend()-1 ? "..." : "\n") << std::flush; try { benchmark_eigen_vector_memcopy benchmark(*it); auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); @@ -408,7 +408,7 @@ int main(int argc, char *argv[]) { gsInfo << "=== Native C array dot-product\n"; for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { - gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." : "\n") << std::flush; + gsInfo << (*it) << (it!=vsizes.cend()-1 ? "..." : "\n") << std::flush; try { benchmark_c_array_dotproduct benchmark(*it); auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); @@ -422,7 +422,7 @@ int main(int argc, char *argv[]) { gsInfo << "=== gsVector dot-product\n"; for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { - gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." : "\n") << std::flush; + gsInfo << (*it) << (it!=vsizes.cend()-1 ? "..." 
: "\n") << std::flush; try { benchmark_eigen_vector_dotproduct benchmark(*it); auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); @@ -439,7 +439,7 @@ int main(int argc, char *argv[]) { gsInfo << "=== Native C array AXPY\n"; for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { - gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." : "\n") << std::flush; + gsInfo << (*it) << (it!=vsizes.cend()-1 ? "..." : "\n") << std::flush; try { benchmark_c_array_axpy benchmark(*it); auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); @@ -453,7 +453,7 @@ int main(int argc, char *argv[]) { gsInfo << "=== gsVector AXPY\n"; for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { - gsInfo << (*it) << (it!=vsizes.cend()-1 ? "." : "\n") << std::flush; + gsInfo << (*it) << (it!=vsizes.cend()-1 ? "..." : "\n") << std::flush; try { benchmark_eigen_vector_axpy benchmark(*it); auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); @@ -470,7 +470,7 @@ int main(int argc, char *argv[]) { gsInfo << "=== Native C array dense matrix-vector multiplication\n"; for (auto it=dsizes.cbegin(); it!=dsizes.cend(); ++it) { - gsInfo << (*it) << (it!=dsizes.cend()-1 ? "." : "\n") << std::flush; + gsInfo << (*it) << (it!=dsizes.cend()-1 ? "..." : "\n") << std::flush; try { benchmark_c_array_dense_matmul benchmark(*it); auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); @@ -484,7 +484,7 @@ int main(int argc, char *argv[]) { gsInfo << "=== gsMatrix/gsVector dense matrix-vector multiplication\n"; for (auto it=dsizes.cbegin(); it!=dsizes.cend(); ++it) { - gsInfo << (*it) << (it!=dsizes.cend()-1 ? "." : "\n") << std::flush; + gsInfo << (*it) << (it!=dsizes.cend()-1 ? "..." 
: "\n") << std::flush; try { benchmark_eigen_vector_dense_matmul benchmark(*it); auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); diff --git a/src/gsCore/gsSysInfo.cpp b/src/gsCore/gsSysInfo.cpp index 5f663f7918..b444b91742 100644 --- a/src/gsCore/gsSysInfo.cpp +++ b/src/gsCore/gsSysInfo.cpp @@ -586,7 +586,6 @@ namespace gismo long page_size = sysconf(_SC_PAGE_SIZE); return util::to_string(pages * page_size / 1024 / 1024)+" MB"; -# endif #elif __unix__ #endif diff --git a/src/gsCore/gsSysInfo.h b/src/gsCore/gsSysInfo.h index 1eff548c3b..55984fca4a 100644 --- a/src/gsCore/gsSysInfo.h +++ b/src/gsCore/gsSysInfo.h @@ -26,7 +26,7 @@ # include #elif __linux__ # include -# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) || define(__SUNCC_PRO)) +# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__SUNCC_PRO)) # include # else # include From 1957f1709033f2c0d36bc7f117abfb66015a34f8 Mon Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Mon, 6 Dec 2021 14:26:04 +0100 Subject: [PATCH 075/174] fix for windows --- src/gsMatrix/gsSparseMatrix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gsMatrix/gsSparseMatrix.h b/src/gsMatrix/gsSparseMatrix.h index ded60c1305..2cdc19e33a 100644 --- a/src/gsMatrix/gsSparseMatrix.h +++ b/src/gsMatrix/gsSparseMatrix.h @@ -400,7 +400,7 @@ class gsSparseMatrix : public Eigen::SparseMatrix return result; } - gsVector nonZerosPerInner(index_t upto = std::numeric_limits::max()) const + gsVector nonZerosPerInner(index_t upto = 2000000000) const { upto = math::min(upto, this->cols()); gsVector nz(upto); From a82b82791fa94986947c2052c09e20813023431d Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 9 Dec 2021 11:32:34 +0100 Subject: [PATCH 076/174] fix for macos --- CMakeLists.txt | 13 ++++++++++- cmake/AddCXXCompileOptions.cmake | 38 ++++++++++++++------------------ 2 
files changed, 28 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 026f422108..691e309161 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,7 +48,18 @@ if(NOT CMAKE_BUILD_TYPE) endif() endif() -set(CMAKE_CXX_STANDARD_DEFAULT 11) +if(NOT CMAKE_CXX_STANDARD) + # Set default C++ standard + if (NOT CMAKE_CXX_STANDARD_DEFAULT) + set(CMAKE_CXX_STANDARD 11 CACHE STRING + "C++ standard (98, 11, 14, 17, 20)" FORCE) + else() + set(CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_DEFAULT} CACHE STRING + "C++ standard (98, 11, 14, 17, 20)" FORCE) + endif() + set_property(CACHE CMAKE_CXX_STANDARD PROPERTY STRINGS "98" "11" "14" + "17" "20") +endif() set(gismo_VERSION_MAJOR 21) #year set(gismo_VERSION_MINOR 12) #month diff --git a/cmake/AddCXXCompileOptions.cmake b/cmake/AddCXXCompileOptions.cmake index ae21172478..f4628a4f53 100644 --- a/cmake/AddCXXCompileOptions.cmake +++ b/cmake/AddCXXCompileOptions.cmake @@ -1,13 +1,24 @@ ###################################################################### -## AddCXXConpileOptions.cmake +## AddCXXCompileOptions.cmake ## This file is part of the G+Smo library. ## ## Authors: M. Moeller and A. Mantzaflaris ###################################################################### -set(CMAKE_CXX_STANDARD_DEFAULT 11) +# We strongly recommend to use an up-to-date cmake version which +# provides support for the most recent compiler version. We provide a +# subset of compiler options copied from cmake 3.17.5. +# +# The options below are only used if +# CMAKE_CXXvv_STANDARD_COMPILE_OPTIONS and +# CMAKE_CXXvv_EXTENSION_COMPILE_OPTIONS are not yet set by the regular +# cmake routines, where vv is the value of CMAKE_CXX_STANDARD. -#if (CMAKE_VERSION VERSION_LESS "3.1") +if(NOT CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION OR + NOT CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION) + + message(WARNING "Update your CMake installation! 
We fall back to + compiler options back ported from CMake 3.17.5") if("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xAppleClang") @@ -467,27 +478,10 @@ set(CMAKE_CXX_STANDARD_DEFAULT 11) endif() -#endif() # cmake 3.1 - -if (NOT DEFINED CMAKE_CXX_STANDARD) - set(CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_DEFAULT} CACHE INTERNAL "") endif() -# Apply for Cmake less than 3.1 -if (CMAKE_VERSION VERSION_LESS "3.1") - - if ( NOT "x${CMAKE_CXX_STANDARD}" STREQUAL "x98" AND - ${CMAKE_CXX_STANDARD_DEFAULT} LESS ${CMAKE_CXX_STANDARD}) - message(STATUS "The compiler ${CMAKE_CXX_COMPILER} supports at most C++${CMAKE_CXX_STANDARD_DEFAULT}, CXX_STANDARD choice is changed.") - set(CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_DEFAULT} CACHE INTERNAL "") - endif() - - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION}") -endif()#cmake<3.1 - - # Bugfix for windows/msvc systems if(NOT DEFINED CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION) - set(CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION "") - set(CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION "") + set(CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION "") + set(CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION "") endif() From c9283af369280e45ccca7691e5e4dbac11eb5731 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 9 Dec 2021 11:32:54 +0100 Subject: [PATCH 077/174] small fixes --- src/gsCore/gsSysInfo.cpp | 57 ++++++++++++++++++++++++++++---- src/gsCore/gsSysInfo.h | 22 ++----------- src/gsIO/gsBenchmark.cpp | 71 +++++++++++++++++++++++++--------------- src/gsIO/gsBenchmark.h | 15 ++++++--- 4 files changed, 109 insertions(+), 56 deletions(-) diff --git a/src/gsCore/gsSysInfo.cpp b/src/gsCore/gsSysInfo.cpp index b444b91742..cd8afb7951 100644 --- a/src/gsCore/gsSysInfo.cpp +++ b/src/gsCore/gsSysInfo.cpp @@ -12,6 +12,24 @@ */ #include +#include + +#include + +#if defined(_WIN32) || defined(_WIN64) +# include +#elif __APPLE__ +# include +# include +#elif 
__linux__ +# include +# if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__SUNCC_PRO)) +# include +# else +# include +# endif +#elif __unix__ +#endif namespace gismo { @@ -559,7 +577,11 @@ namespace gismo return "Unknown-CPU ["+hostname+"]"; # endif + #elif __unix__ + + // No generic implementation yet + #endif return "Unknown-CPU"; @@ -567,29 +589,52 @@ namespace gismo std::string gsSysInfo::getMemoryInfo() { - + uint64_t memsize = gsSysInfo::getMemoryInBytes(); + if (memsize>0) { + if (memsize<1024) + return util::to_string(memsize)+" B"; + else if (memsize<1024*1024) + return util::to_string(memsize/1024)+" KB"; + else if (memsize<1024*1024*1024) + return util::to_string(memsize/(1024*1024))+" MB"; + else + return util::to_string(memsize/(1024*1024*1024))+" GB"; + } + else + return "Unknown-Memory"; + } + + uint64_t gsSysInfo::getMemoryInBytes() + { #if defined(_WIN32) || defined(_WIN64) + MEMORYSTATUSEX status; + status.dwLength = sizeof(status); + GlobalMemoryStatusEx(&status); + return (uint64_t)status.ullTotalPhys; #elif __APPLE__ - + int64_t memsize; std::size_t size = sizeof(memsize); - + if (sysctlbyname("hw.memsize", &memsize, &size, NULL, 0) == 0) { - return util::to_string(memsize / 1024 / 1024)+" MB"; + return (uint64_t)memsize; } #elif __linux__ long pages = sysconf(_SC_PHYS_PAGES); long page_size = sysconf(_SC_PAGE_SIZE); - return util::to_string(pages * page_size / 1024 / 1024)+" MB"; + return (uint64)(pages * page_size); #elif __unix__ + + // No generic implementation yet + #endif - return "Unknown-Memory"; + return 0; } } // namespace gismo diff --git a/src/gsCore/gsSysInfo.h b/src/gsCore/gsSysInfo.h index 55984fca4a..b1db96819b 100644 --- a/src/gsCore/gsSysInfo.h +++ b/src/gsCore/gsSysInfo.h @@ -13,26 +13,7 @@ #pragma once -#include #include -#include - -#include - -#if defined(_WIN32) || defined(_WIN64) -# include -#elif __APPLE__ -# include -# include -#elif __linux__ -# include -# if 
defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__SUNCC_PRO)) -# include -# else -# include -# endif -#elif __unix__ -#endif namespace gismo { @@ -64,6 +45,9 @@ namespace gismo /// Returns memory information static std::string getMemoryInfo(); + + /// Returns total system memory in bytes + static uint64_t getMemoryInBytes(); }; // class gsSysInfo } // namespace gismo diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index c17d5409b9..a1296ea80b 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -11,9 +11,10 @@ Author(s): M. Moller */ -#include #include #include +#include + #include namespace gismo @@ -21,11 +22,11 @@ namespace gismo std::ostream &gsBenchmark::gsBenchmarkResultSet::print(std::ostream &os) const { - os << "\\pgfplotstableread[row sep=\\\\,col sep=&]{\n" - << "threads & " << label << " \\\\\n"; + os << "\\pgfplotstableread[col sep=space]{\n" + << label << "\n"; for (auto it=results.cbegin(); it!=results.cend(); ++it) - os << it->at(0) << "&" << it->at(2) << "\\\\\n"; + os << it->at(2) << "\n"; os << "}\\data" << label << "\n"; @@ -38,22 +39,24 @@ namespace gismo (*it)->print(os); os << "\\begin{tikzpicture}\n" - << "\\begin{semilogyaxis}[\n" + << "\\begin{axis}[\n" << "name=MyAxis,\n" - << "width=\\textwidth,\n" - << "height=.5\\textwidth,\n" - << "legend pos=outer north east,\n" - << "symbolic x coords={"; + << "width=2\\textwidth,\n" + << "height=.8\\textwidth,\n" + << "legend pos=outer north east,\n" + << "ybar = 0.05cm,\n" + << "bar width = 3pt,\n" + << "ymajorgrids=true,\n" + << "xticklabel style={rotate=45,anchor=east},\n" + << "xticklabels={"; - //std::vector >::const_iterator - auto it = results.front()->get().cbegin(); - auto ite = results.front()->get().cend(); - for (;it!=ite; ++it) - os << it->at(0) << (it!=ite-1 ? "," : ""); + for (auto rit=results.cbegin(); rit!=results.cend(); ++rit) + os << (*rit)->get_title() << (rit!=results.cend()-1 ? 
"," : ""); + os << "},\n" - << "xlabel={OpenMP threads},\n"; + << "xtick=data,\n"; - it = results.front()->get().cbegin(); + auto it = results.front()->get().cbegin(); switch( (int)it->at(3) ) { case metric::bandwidth_kb_sec: @@ -88,25 +91,39 @@ namespace gismo } os << "title={" << title << "},\n" - << "]"; + << "]\n"; - for (auto rit=results.cbegin(); rit!=results.cend(); ++rit) - os << "\\addplot table[x=threads,y=" + for (auto rit=results.cbegin()+1; rit!=results.cend(); ++rit) + os << "\\pgfplotstablecreatecol[copy column from " + << "table={\\data" << (*rit)->get_label() - << "]{\\data" + << "}{[index] 0}] {" << (*rit)->get_label() - << "};\n"; + << "} {\\data" + << (*results.cbegin())->get_label() + << "}\n"; + + os << "\\pgfplotstabletranspose[rows/threads/.style={string type}]\\mytable{" + << "\\data" + << (*results.cbegin())->get_label() + << "}\n"; + + for (std::size_t i=1; i<=results.front()->get().size(); ++i) + os << "\\addplot table[x expr=\\coordindex, y index=" + << util::to_string(i) << "]{\\mytable};\n"; os << "\\legend{"; - for (auto rit=results.cbegin(); rit!=results.cend(); ++rit) - os << (*rit)->get_title() << (rit!=results.cend()-1 ? "," : ""); + it = results.front()->get().cbegin(); + auto ite = results.front()->get().cend(); + for (;it!=ite; ++it) + os << "Threads=" << it->at(0) << (it!=ite-1 ? 
"," : ""); os << "}\n" - << "\\end{semilogyaxis}\n" + << "\\end{axis}\n" << "\\path let \\p1=(MyAxis.west), \\p2=(MyAxis.east) in " << "node[below right, align=left, text=black, text width=\\x2-\\x1]\n" - << "at ($(MyAxis.south west)+(0,-30pt)$) {%\n" + << "at ($(MyAxis.south west)+(0,-100pt)$) {%\n" << "G+Smo " << gsSysInfo::getGismoVersion() << ", Eigen " << gsSysInfo::getEigenVersion() << " (" << gsSysInfo::getCompilerVersion() @@ -117,7 +134,7 @@ namespace gismo : gsSysInfo::getExtraLibsVersion()+"), \n") << "CPU " << gsSysInfo::getCpuInfo() << ", " - << "Memory " << gsSysInfo::getMemoryInfo() << ", "; + << "Memory " << gsSysInfo::getMemoryInfo() << "\\\\\n"; gsJITCompilerConfig jit; jit.load(GISMO_CONFIG_DIR "jit.xml"); std::string flags = jit.getFlags(); @@ -141,7 +158,9 @@ namespace gismo { os << "\\documentclass[tikz]{standalone}\n" << "\\usepackage{pgfplots}\n" + << "\\usepackage{pgfplotstable}\n" << "\\usepackage{verbatim}\n" + << "\\pgfplotsset{compat=1.18}\n" << "\\begin{document}\n" << "\\usetikzlibrary{calc}\n"; diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index 7b5a895faa..14ed1849de 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -147,10 +147,10 @@ typedef std::array Result; template static std::vector - run(const std::vector& nthreads, int nruns, T& benchmark, metric metric) + run(const std::vector& nthreads, index_t nruns, T& benchmark, metric metric) { gsStopwatch stopwatch; - std::size_t benchmark_result; + uint64_t benchmark_result; double benchmark_metric, benchmark_runtime; std::vector results; @@ -162,7 +162,7 @@ typedef std::array Result; benchmark_runtime = 0.0; benchmark_metric = 0.0; - for (int run=0; run Result; break; default: throw std::runtime_error("Unsupported metric"); - } - + } } + if (std::isinf(benchmark_runtime)) + benchmark_runtime = 0.0; + + if (std::isinf(benchmark_metric)) + benchmark_metric = 0.0; + Result res; res[0]= static_cast(*it); // number of OpenMP threads res[1]= 
benchmark_runtime/(double)nruns; // averaged elapsed time in seconds From ed618f9e353a1a597d586fc1ba7a0ef3a2754f3b Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 9 Dec 2021 11:33:04 +0100 Subject: [PATCH 078/174] improved benchmark --- examples/performance_benchmark.cpp | 562 +++++++++++++++++++---------- 1 file changed, 367 insertions(+), 195 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 5508b459df..f3dfad7e0b 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -14,12 +14,46 @@ //! [Include namespace] #include -#include -#include - using namespace gismo; //! [Include namespace] +//! [Create benchmark macro] +#define CREATE_BENCHMARK(_benchmark, _label, _sizes, _metric) \ + gsInfo << "=== " << _benchmark::name() << "\n"; \ + auto bmark = benchmark.add(_label, _benchmark::name()); \ + for (auto it=_sizes.cbegin(); it!=_sizes.cend(); ++it) { \ + gsInfo << "... " << (*it) << std::flush; \ + try { \ + _benchmark benchmark(*it); \ + auto results = gsBenchmark::run(nthreads, nruns, benchmark, _metric); \ + std::string meminfo; \ + uint64_t memsize = benchmark.size(); \ + if (memsize<1024) \ + meminfo = util::to_string(memsize)+" B"; \ + else if (memsize<1024*1024) \ + meminfo = util::to_string(memsize/1024)+" KB"; \ + else if (memsize<1024*1024*1024) \ + meminfo = util::to_string(memsize/(1024*1024))+" MB"; \ + else \ + meminfo = util::to_string(memsize/(1024*1024*1024))+" GB"; \ + bmark->add(_label, meminfo, results); \ + } catch(...) { gsInfo << "[failed!]"; } \ + gsInfo << "\n"; \ + } +//! [Create benchmark macro] + +//! [Implement memory safeguard] +template +class memory_safeguard +{ +public: + memory_safeguard(index_t n) + { + if (T::size(n) > 0.8*gsSysInfo::getMemoryInBytes()) + throw std::runtime_error("Insufficient memory"); + } +}; +//! [Implement memory safeguard] //! 
[Implement benchmarks] /** @@ -29,15 +63,16 @@ template class benchmark_c_array_memcopy { private: - std::size_t n; + memory_safeguard _msg; + index_t n; T *m_x, *m_y; public: - benchmark_c_array_memcopy(std::size_t n) - : n(n), m_x(new T[n]), m_y(new T[n]) + benchmark_c_array_memcopy(index_t n) + : _msg(n), n(n), m_x(new T[n]), m_y(new T[n]) { #pragma omp parallel for simd - for (std::size_t i=0; i class benchmark_c_array_dotproduct { private: - std::size_t n; + memory_safeguard _msg; + index_t n; T *m_x, *m_y; public: - benchmark_c_array_dotproduct(std::size_t n) - : n(n), m_x(new T[n]), m_y(new T[n]) + benchmark_c_array_dotproduct(index_t n) + : _msg(n), n(n), m_x(new T[n]), m_y(new T[n]) { #pragma omp parallel for simd - for (std::size_t i=0; i class benchmark_c_array_axpy { private: - std::size_t n; + memory_safeguard _msg; + index_t n; T *m_x, *m_y, *m_z; public: - benchmark_c_array_axpy(std::size_t n) - : n(n), m_x(new T[n]), m_y(new T[n]), m_z(new T[n]) + benchmark_c_array_axpy(index_t n) + : _msg(n), n(n), m_x(new T[n]), m_y(new T[n]), m_z(new T[n]) { #pragma omp parallel for simd - for (std::size_t i=0; i class benchmark_c_array_dense_matmul { private: - std::size_t n; + memory_safeguard _msg; + index_t n; T *m_A, *m_x, *m_y; public: - benchmark_c_array_dense_matmul(std::size_t n) - : n(n), m_A(new T[n*n]), m_x(new T[n]), m_y(new T[n]) + benchmark_c_array_dense_matmul(index_t n) + : _msg(n), n(n), m_A(new T[n*n]), m_x(new T[n]), m_y(new T[n]) { #pragma omp parallel for simd - for (std::size_t i=0; i -class benchmark_eigen_vector_memcopy +class benchmark_eigen_memcopy { private: - std::size_t n; + memory_safeguard _msg; + index_t n; gsVector x,y; public: - benchmark_eigen_vector_memcopy(std::size_t n) - : n(n), x(n), y(n) + benchmark_eigen_memcopy(index_t n) + : _msg(n), n(n), x(n), y(n) { x.fill((T)0.0); } - std::size_t operator()() + index_t operator()() { y.noalias() = x; @@ -221,7 +321,22 @@ class benchmark_eigen_vector_memcopy T tmp = y[n-1]; 
GISMO_UNUSED(tmp); - return sizeof(T) * 2 * n; + return size(); + } + + constexpr uint64_t size() const + { + return size(n); + } + + static constexpr uint64_t size(index_t n) + { + return (2 * n * sizeof(T)); + } + + static std::string name() + { + return "Memory copy (gsVector)"; } }; @@ -229,26 +344,42 @@ class benchmark_eigen_vector_memcopy * Benchmark: Eigen vector dot-product */ template -class benchmark_eigen_vector_dotproduct +class benchmark_eigen_dotproduct { private: - std::size_t n; + memory_safeguard _msg; + index_t n; gsVector x, y; public: - benchmark_eigen_vector_dotproduct(std::size_t n) - : n(n), x(n), y(n) + benchmark_eigen_dotproduct(index_t n) + : _msg(n), n(n), x(n), y(n) { x.fill((T)0.0); y.fill((T)0.0); } - std::size_t operator()() + index_t operator()() { volatile T sum = y.dot(x); GISMO_UNUSED(sum); - return sizeof(T) * 2 * n; + return size(); + } + + constexpr uint64_t size() const + { + return size(n); + } + + static constexpr uint64_t size(index_t n) + { + return (2 * n * sizeof(T)); + } + + static std::string name() + { + return "Dot-product (gsVector)"; } }; @@ -256,21 +387,22 @@ class benchmark_eigen_vector_dotproduct * Benchmark: Eigen vector AXPY */ template -class benchmark_eigen_vector_axpy +class benchmark_eigen_axpy { private: - std::size_t n; + memory_safeguard _msg; + index_t n; gsVector x, y, z; public: - benchmark_eigen_vector_axpy(std::size_t n) - : n(n), x(n), y(n), z(n) + benchmark_eigen_axpy(index_t n) + : _msg(n), n(n), x(n), y(n), z(n) { x.fill((T)0.0); y.fill((T)0.0); } - std::size_t operator()() + index_t operator()() { z.noalias() = (T)3.141*x + y; @@ -278,7 +410,22 @@ class benchmark_eigen_vector_axpy T tmp = z[n-1]; GISMO_UNUSED(tmp); - return sizeof(T) * 3 * n; + return size(); + } + + constexpr uint64_t size() const + { + return size(n); + } + + static constexpr uint64_t size(index_t n) + { + return (3 * n * sizeof(T)); + } + + static std::string name() + { + return "AXPY (gsVector)"; } }; @@ -286,22 +433,23 @@ 
class benchmark_eigen_vector_axpy * Benchmark: Eigen dense matrix-vector multiplication */ template -class benchmark_eigen_vector_dense_matmul +class benchmark_eigen_dense_matmul { private: - std::size_t n; + memory_safeguard _msg; + index_t n; gsMatrix A; gsVector x, y; public: - benchmark_eigen_vector_dense_matmul(std::size_t n) - : n(n), A(n,n), x(n), y(n) + benchmark_eigen_dense_matmul(index_t n) + : _msg(n), n(n), A(n,n), x(n), y(n) { A.fill(0.0); x.fill(0.0); } - std::size_t operator()() + index_t operator()() { y.noalias() = A*x; @@ -309,25 +457,100 @@ class benchmark_eigen_vector_dense_matmul T tmp = y[n-1]; GISMO_UNUSED(tmp); - return sizeof(T) * (2*n*n + n); + return size(); + } + + constexpr uint64_t size() const + { + return size(n); + } + + static constexpr uint64_t size(index_t n) + { + return (2 * n * n + n) * sizeof(T); + } + + static std::string name() + { + return "Dense matrix-vector multiplication (gsMatrix/gsVector)"; + } +}; + +/** + * Benchmark: Poisson 2D + */ +template +class benchmark_poisson2d_visitor +{ +private: + memory_safeguard _msg; + gsMultiPatch geo; + gsMultiBasis bases; + gsConstantFunction f; + gsBoundaryConditions bcInfo; + gsPoissonAssembler assembler; + +public: + benchmark_poisson2d_visitor(int npatches, int refine=0, int degree=1) + : _msg(0), geo(gsNurbsCreator<>::BSplineSquareGrid(npatches, npatches, 1.0)), + bases(geo), f(0.0, 0.0, 2) + { + // h-refine each basis + for (int i = 0; i < refine; ++i) + bases.uniformRefine(); + + // k-refinement (set degree) + for (std::size_t i = 0; i < bases.nBases(); ++ i) + bases[i].setDegreePreservingMultiplicity(degree); + + assembler = gsPoissonAssembler(geo, bases, bcInfo, f, dirichlet::nitsche, iFace::glue); + } + + index_t operator()() + { + assembler.assemble(); + + return sizeof(T) * assembler.numDofs(); + } + + constexpr uint64_t size() const + { + return size(0); + } + + static constexpr uint64_t size(index_t n) + { + return sizeof(T); + } + + static std::string name() + { + 
return "Visitor-based Poisson2d"; } }; //! [Implement benchmarks] + + + int main(int argc, char *argv[]) { //! [Parse command line] gsBenchmark benchmark; - std::vector nthreads, ssizes, dsizes, vsizes; + std::vector benchmarks, nthreads, msizes, vsizes; std::string fn; - int nruns=1; + index_t nruns = 1, + msizesmax = (index_t) std::sqrt(real_t(0.8) * sizeof(real_t)*gsSysInfo::getMemoryInBytes()), + vsizesmax = (index_t) (real_t(0.8) * sizeof(real_t)*gsSysInfo::getMemoryInBytes()); gsCmdLine cmd("G+Smo performance benchmark."); cmd.printVersion(); cmd.addInt("r", "runs", "Number of runs over which the results are averaged", nruns); - cmd.addMultiInt("d", "dsizes", "Number of unknowns in dense matrix benchmarks", dsizes); - cmd.addMultiInt("s", "ssizes", "Number of unknowns in sparse matrix benchmarks", ssizes); + cmd.addInt("M", "msizesmax", "Maximum number of unknowns in matrix/vector benchmarks (automated generation of sequence)", msizesmax); + cmd.addInt("V", "vsizesmax", "Maximum number of unknowns in vector benchmarks (automated generation of sequence)", vsizesmax); + cmd.addMultiInt("b", "benchmarks", "List of benchmarks to be run", benchmarks); + cmd.addMultiInt("m", "msizes", "Number of unknowns in matrix/vector benchmarks", msizes); cmd.addMultiInt("t", "threads", "Number of OpenMP threads to be used for the benchmark", nthreads); cmd.addMultiInt("v", "vsizes", "Number of unknowns in vector benchmarks", vsizes); cmd.addString("o", "output", "Name of the output file", fn); @@ -336,166 +559,115 @@ int main(int argc, char *argv[]) //! [Parse command line] //! 
[Default configuration] + // If empty fill with all benchmarks 1, ..., 5 + if (benchmarks.empty()) { + for(index_t i=1; i<=9; ++i) + benchmarks.push_back(i); + } + // If empty fill with 1, 2, 4, ..., maximum number of OpenMP threads if (nthreads.empty()) { - for(int i=1; i<=omp_get_max_threads(); i*=2) + for(index_t i=1; i<=omp_get_max_threads(); i*=2) nthreads.push_back(i); } - // If empty fill with 10, 100, 1.000, 10.000 - if (dsizes.empty()) { - dsizes.push_back(1e1); - dsizes.push_back(1e2); - dsizes.push_back(1e3); - dsizes.push_back(1e4); - } - - // If empty fill with 100, 1.000, 10.000, 100.000, 1.000.000 - if (ssizes.empty()) { - ssizes.push_back(1e2); - ssizes.push_back(1e3); - ssizes.push_back(1e4); - ssizes.push_back(1e5); - ssizes.push_back(1e6); + // If empty fill with 10, 100, 1.000, ..., 80% of Sqrt(total system memory) + if (msizes.empty()) { + for(index_t i=10;;) { + msizes.push_back(i); + if (i<=std::min(msizesmax, std::numeric_limits::max()) / 64) + i*=8; + else + break; + } } - // If empty fill with 100, 1.000, 10.000, 100.000, 1.000.000 + // If empty fill with 100, 1.000, 10.000, ... 80% of total system memory if (vsizes.empty()) { - vsizes.push_back(1e2); - vsizes.push_back(1e3); - vsizes.push_back(1e4); - vsizes.push_back(1e5); - vsizes.push_back(1e6); - vsizes.push_back(1e7); - vsizes.push_back(1e8); + for(index_t i=100;;) { + vsizes.push_back(i); + if (i<=std::min(vsizesmax, std::numeric_limits::max()) / 8) + i*=8; + else + break; + } } + //! [Default configuration] //! [Execute benchmarks] - { - auto bm = benchmark.add("memcopy", "memory copy"); - { - gsInfo << "=== Native C array memcopy\n"; - for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { - gsInfo << (*it) << (it!=vsizes.cend()-1 ? "..." 
: "\n") << std::flush; - try { - benchmark_c_array_memcopy benchmark(*it); - auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); - bm->add("nativememcopy", - "native("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", - results); - } catch(...) { gsInfo << "failed!"; } - } + for (auto bit=benchmarks.cbegin(); bit!=benchmarks.cend(); ++bit) { + switch((index_t)(*bit)) { + + case (1): { + // Benchmark: memcopy native C arrays + CREATE_BENCHMARK(benchmark_c_array_memcopy, "memcopyCarray", + vsizes, metric::bandwidth_gb_sec); + break; } - { - gsInfo << "=== gsVector memcopy\n"; - for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { - gsInfo << (*it) << (it!=vsizes.cend()-1 ? "..." : "\n") << std::flush; - try { - benchmark_eigen_vector_memcopy benchmark(*it); - auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); - bm->add("eigenmemcopy", - "eigen("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", - results); - } catch(...) { gsInfo << "failed!"; } - } + case (2): { + // Benchmark: memcopy gsVector + CREATE_BENCHMARK(benchmark_eigen_memcopy, "memcopyEigen", + vsizes, metric::bandwidth_gb_sec); + break; } - } - { - auto bm = benchmark.add("dotprod", "dot-product"); - { - gsInfo << "=== Native C array dot-product\n"; - for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { - gsInfo << (*it) << (it!=vsizes.cend()-1 ? "..." : "\n") << std::flush; - try { - benchmark_c_array_dotproduct benchmark(*it); - auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); - bm->add("nativedotproduct", - "native("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", - results); - } catch(...) 
{ gsInfo << "failed!"; } - } + case (3): { + // Benchmark: dot-product native C array + CREATE_BENCHMARK(benchmark_c_array_dotproduct, "dotproductCarray", + vsizes, metric::bandwidth_gb_sec); + break; } - { - gsInfo << "=== gsVector dot-product\n"; - for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { - gsInfo << (*it) << (it!=vsizes.cend()-1 ? "..." : "\n") << std::flush; - try { - benchmark_eigen_vector_dotproduct benchmark(*it); - auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); - bm->add("eigendotproduct", - "eigen("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", - results); - } catch(...) { gsInfo << "failed!"; } - } + case (4): { + // Benchmark: dot-product gsVector + CREATE_BENCHMARK(benchmark_eigen_dotproduct, "dotproductEigen", + vsizes, metric::bandwidth_gb_sec); + break; } - } - { - auto bm = benchmark.add("axpy", "axpy"); - { - gsInfo << "=== Native C array AXPY\n"; - for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { - gsInfo << (*it) << (it!=vsizes.cend()-1 ? "..." : "\n") << std::flush; - try { - benchmark_c_array_axpy benchmark(*it); - auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); - bm->add("nativeaxpy", - "native("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", - results); - } catch(...) { gsInfo << "failed!"; } - } + case (5): { + // Benchmark: axpy native C array + CREATE_BENCHMARK(benchmark_c_array_axpy, "axpyCarray", + vsizes, metric::bandwidth_gb_sec); + break; } - { - gsInfo << "=== gsVector AXPY\n"; - for (auto it=vsizes.cbegin(); it!=vsizes.cend(); ++it) { - gsInfo << (*it) << (it!=vsizes.cend()-1 ? "..." : "\n") << std::flush; - try { - benchmark_eigen_vector_axpy benchmark(*it); - auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); - bm->add("eigenaxpy", - "eigen("+util::to_string(sizeof(double)*(double)*it / 1024 / 1024, 0)+" MB)", - results); - } catch(...) 
{ gsInfo << "failed!"; } - } + case (6): { + // Benchmark: axpy gsVector + CREATE_BENCHMARK(benchmark_eigen_axpy, "axpyEigen", + vsizes, metric::bandwidth_gb_sec); + break; } - } - { - auto bm = benchmark.add("densemvmul", "Dense matrix-vector multiply"); - { - gsInfo << "=== Native C array dense matrix-vector multiplication\n"; - for (auto it=dsizes.cbegin(); it!=dsizes.cend(); ++it) { - gsInfo << (*it) << (it!=dsizes.cend()-1 ? "..." : "\n") << std::flush; - try { - benchmark_c_array_dense_matmul benchmark(*it); - auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); - bm->add("nativdensemvmul", - "native("+util::to_string(std::pow(sizeof(double)*(double)*it / 1024 / 1024, 2), 0)+" MB)", - results); - } catch(...) { gsInfo << "failed!"; } - } + case (7): { + // Benchmark: dense matrix-vector multiplication native C array + CREATE_BENCHMARK(benchmark_c_array_dense_matmul, "densematmulCarray", + msizes, metric::bandwidth_gb_sec); + break; } - { - gsInfo << "=== gsMatrix/gsVector dense matrix-vector multiplication\n"; - for (auto it=dsizes.cbegin(); it!=dsizes.cend(); ++it) { - gsInfo << (*it) << (it!=dsizes.cend()-1 ? "..." : "\n") << std::flush; - try { - benchmark_eigen_vector_dense_matmul benchmark(*it); - auto results = gsBenchmark::run(nthreads, nruns, benchmark, metric::bandwidth_gb_sec); - bm->add("eigenmvmul", - "eigen("+util::to_string(std::pow(sizeof(double)*(double)*it / 1024 / 1024, 2), 0)+" MB)", - results); - } catch(...) 
{ gsInfo << "failed!"; } - } + case (8): { + // Benchmark: dense matrix-vector multiplication gsMatrix/gsVector + CREATE_BENCHMARK(benchmark_eigen_dense_matmul, "densematmulEigen", + msizes, metric::bandwidth_gb_sec); + break; } - } - + + case (9): { + // Benchmark: visitor-based Poisson 2D assembly + CREATE_BENCHMARK(benchmark_poisson2d_visitor, "assemblerVisitor", + vsizes, metric::bandwidth_gb_sec); + break; + } + + default: + throw std::runtime_error("Invalid benchmark"); + } + + } // benchmark loop + if (fn.empty()) gsInfo << benchmark << "\n"; else { From 2bb725756f0559a4c9e7166818f1182ab3bfcda6 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 9 Dec 2021 12:49:33 +0100 Subject: [PATCH 079/174] small fixes --- src/gsAssembler/gsPatchRule.hpp | 4 ++-- src/gsMatrix/gsAsMatrix.h | 4 ++-- src/gsNurbs/gsBoehm.hpp | 4 ++-- src/gsNurbs/gsDeboor.hpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/gsAssembler/gsPatchRule.hpp b/src/gsAssembler/gsPatchRule.hpp index 0c53c4e349..7f615989c1 100644 --- a/src/gsAssembler/gsPatchRule.hpp +++ b/src/gsAssembler/gsPatchRule.hpp @@ -37,7 +37,7 @@ gsPatchRule::gsPatchRule(const gsBasis & basis, // Initialize some stuff m_dim = m_basis->dim(); - GISMO_ASSERT( m_fixDir < short_t(m_dim) && m_fixDir>-2, "Invalid input fixDir = "<-2, "Invalid input fixDir = "<::gsPatchRule(const gsBasis & basis, for (size_t d = 0; d != m_dim; d++) { m_end = m_basis->support().col(1); - if (short_t(d)==m_fixDir && m_fixDir!=-1) + if ((short_t)(d)==m_fixDir && m_fixDir!=-1) { m_nodes[d].resize(2); m_nodes[d]<<0,1; diff --git a/src/gsMatrix/gsAsMatrix.h b/src/gsMatrix/gsAsMatrix.h index e468a0e52c..2cab74c58d 100644 --- a/src/gsMatrix/gsAsMatrix.h +++ b/src/gsMatrix/gsAsMatrix.h @@ -53,7 +53,7 @@ class gsAsMatrix : public Eigen::Map< Eigen::Matrix > : Base( v.data(), n, m) { //GISMO_ASSERT( v.size() != 0, "Tried to map an empty vector." 
); - GISMO_ASSERT( m*n <= index_t(v.size()), "Not enough coefficients in vector to map." ); + GISMO_ASSERT( m*n <= (index_t)(v.size()), "Not enough coefficients in vector to map." ); } gsAsMatrix( std::vector & v) @@ -162,7 +162,7 @@ class gsAsConstMatrix : public Eigen::Map< const Eigen::Matrix > gsAsConstMatrix( const std::vector & v, index_t n, index_t m) : Base( v.data(), n, m) { - GISMO_ASSERT( m*n <= index_t(v.size()), "Not enough coefficients in vector to map." ); + GISMO_ASSERT( m*n <= (index_t)(v.size()), "Not enough coefficients in vector to map." ); } gsAsConstMatrix( const std::vector & v) diff --git a/src/gsNurbs/gsBoehm.hpp b/src/gsNurbs/gsBoehm.hpp index 3351f9e427..637699025f 100644 --- a/src/gsNurbs/gsBoehm.hpp +++ b/src/gsNurbs/gsBoehm.hpp @@ -39,7 +39,7 @@ void gsBoehm( if (r==1) return gsBoehmSingle(knots, coefs, val, update_knots); - GISMO_ASSERT( coefs.rows() == index_t(knots.size() - knots.degree()-1), + GISMO_ASSERT( coefs.rows() == (index_t)(knots.size() - knots.degree()-1), "Incompatible coefficients("< Date: Thu, 9 Dec 2021 12:49:58 +0100 Subject: [PATCH 080/174] allow different values for index_t and short_t --- cmake/gsConfig.cmake | 13 +++++++++++-- cmake/gsOptions.cmake | 1 + src/gsCore/gsConfig.h.in | 10 ++++++++-- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/cmake/gsConfig.cmake b/cmake/gsConfig.cmake index 35ef1f2e8d..6ab54828a5 100644 --- a/cmake/gsConfig.cmake +++ b/cmake/gsConfig.cmake @@ -50,9 +50,18 @@ if(NOT GISMO_INDEX_TYPE) set (GISMO_INDEX_TYPE "int" CACHE STRING #math(EXPR BITSZ_VOID_P "8*${CMAKE_SIZEOF_VOID_P}") #set (GISMO_INDEX_TYPE "int${BITSZ_VOID_P}_t" CACHE STRING - "Index type(int, int32_t, int64_t, long, long long)" FORCE) + "Index type(int, int8_t, int16_t, int32_t, int64_t, long, long long)" FORCE) set_property(CACHE GISMO_INDEX_TYPE PROPERTY STRINGS - "int" "int32_t" "int64_t" "long" "long long" ) + "int" "int8_t" "int16_t" "int32_t" "int64_t" "long" "long long" ) +endif() + +if(NOT 
GISMO_SHORT_TYPE) + set (GISMO_SHORT_TYPE "int" CACHE STRING + #math(EXPR BITSZ_VOID_P "8*${CMAKE_SIZEOF_VOID_P}") + #set (GISMO_INDEX_TYPE "int${BITSZ_VOID_P}_t" CACHE STRING + "Short type(int, int8_t, int16_t, int32_t, int64_t, long, long long)" FORCE) + set_property(CACHE GISMO_SHORT_TYPE PROPERTY STRINGS + "int" "int8_t" "int16_t" "int32_t" "int64_t" "long" "long long" ) endif() set(${PROJECT_NAME}_ARCHIVE_OUTPUT_DIRECTORY lib) diff --git a/cmake/gsOptions.cmake b/cmake/gsOptions.cmake index 752616a0ea..ca4a563f88 100644 --- a/cmake/gsOptions.cmake +++ b/cmake/gsOptions.cmake @@ -27,6 +27,7 @@ message (" CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD}") message (" GISMO_COEFF_TYPE ${GISMO_COEFF_TYPE}") message (" GISMO_INDEX_TYPE ${GISMO_INDEX_TYPE}") +message (" GISMO_SHORT_TYPE ${GISMO_SHORT_TYPE}") ## ################################################################# ## Options list: Standard options diff --git a/src/gsCore/gsConfig.h.in b/src/gsCore/gsConfig.h.in index c935ce363f..ed4e21315f 100644 --- a/src/gsCore/gsConfig.h.in +++ b/src/gsCore/gsConfig.h.in @@ -29,10 +29,16 @@ #endif /** Define default index type. */ -#define index_t @GISMO_INDEX_TYPE@ +#define GISMO_INDEX_TYPE @GISMO_INDEX_TYPE@ +#ifndef index_t +#define index_t GISMO_INDEX_TYPE +#endif /** Define default dimension type. */ -#define short_t @GISMO_INDEX_TYPE@ //short +#define GISMO_SHORT_TYPE @GISMO_SHORT_TYPE@ +#ifndef short_t +#define short_t GISMO_SHORT_TYPE +#endif /** Define the file data directory. 
*/ #define GISMO_DATA_DIR "@GISMO_DATA_DIR@" From f85fe908e9bde69b54317f371bb7008e2b38e0d0 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 9 Dec 2021 12:50:18 +0100 Subject: [PATCH 081/174] small fixes --- src/gsCore/gsSysInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gsCore/gsSysInfo.cpp b/src/gsCore/gsSysInfo.cpp index cd8afb7951..9bfece3f07 100644 --- a/src/gsCore/gsSysInfo.cpp +++ b/src/gsCore/gsSysInfo.cpp @@ -626,7 +626,7 @@ namespace gismo long pages = sysconf(_SC_PHYS_PAGES); long page_size = sysconf(_SC_PAGE_SIZE); - return (uint64)(pages * page_size); + return (uint64_t)(pages * page_size); #elif __unix__ From 1f7db1ee5bf5d4a4ef464a1aa703f3852c4f06d5 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 9 Dec 2021 12:50:31 +0100 Subject: [PATCH 082/174] improved benchmark --- examples/performance_benchmark.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index f3dfad7e0b..6066e790cd 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -49,7 +49,7 @@ class memory_safeguard public: memory_safeguard(index_t n) { - if (T::size(n) > 0.8*gsSysInfo::getMemoryInBytes()) + if (T::size(n) > (real_t)(0.8)*gsSysInfo::getMemoryInBytes()) throw std::runtime_error("Insufficient memory"); } }; @@ -539,9 +539,11 @@ int main(int argc, char *argv[]) gsBenchmark benchmark; std::vector benchmarks, nthreads, msizes, vsizes; std::string fn; - index_t nruns = 1, - msizesmax = (index_t) std::sqrt(real_t(0.8) * sizeof(real_t)*gsSysInfo::getMemoryInBytes()), - vsizesmax = (index_t) (real_t(0.8) * sizeof(real_t)*gsSysInfo::getMemoryInBytes()); + index_t nruns = 1; + index_t msizesmax = (index_t) std::min((real_t)std::numeric_limits::max(), + std::sqrt((real_t)(0.8) * sizeof(real_t)*gsSysInfo::getMemoryInBytes())); + index_t vsizesmax = (index_t) std::min((real_t)std::numeric_limits::max(), + 
(real_t)(0.8) * sizeof(real_t)*gsSysInfo::getMemoryInBytes()); gsCmdLine cmd("G+Smo performance benchmark."); cmd.printVersion(); From fb4b1429cfb63bbfd86700dd64d0c4859008e78f Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 9 Dec 2021 12:51:12 +0100 Subject: [PATCH 083/174] small improvements of benchmark --- examples/performance_benchmark.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index f3dfad7e0b..79cbb03aa1 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -49,7 +49,7 @@ class memory_safeguard public: memory_safeguard(index_t n) { - if (T::size(n) > 0.8*gsSysInfo::getMemoryInBytes()) + if (T::size(n) > gsSysInfo::getMemoryInBytes()) throw std::runtime_error("Insufficient memory"); } }; @@ -119,7 +119,7 @@ class benchmark_c_array_dotproduct { private: memory_safeguard _msg; - index_t n; + constexpr index_t n; T *m_x, *m_y; public: @@ -178,7 +178,7 @@ class benchmark_c_array_axpy { private: memory_safeguard _msg; - index_t n; + constexpr index_t n; T *m_x, *m_y, *m_z; public: @@ -238,7 +238,7 @@ class benchmark_c_array_dense_matmul { private: memory_safeguard _msg; - index_t n; + constexpr index_t n; T *m_A, *m_x, *m_y; public: @@ -303,7 +303,7 @@ class benchmark_eigen_memcopy { private: memory_safeguard _msg; - index_t n; + constexpr index_t n; gsVector x,y; public: @@ -348,7 +348,7 @@ class benchmark_eigen_dotproduct { private: memory_safeguard _msg; - index_t n; + constexpr index_t n; gsVector x, y; public: @@ -391,7 +391,7 @@ class benchmark_eigen_axpy { private: memory_safeguard _msg; - index_t n; + constexpr index_t n; gsVector x, y, z; public: @@ -437,7 +437,7 @@ class benchmark_eigen_dense_matmul { private: memory_safeguard _msg; - index_t n; + constexpr index_t n; gsMatrix A; gsVector x, y; From bfafdff49f00ced68c3625e03b00d0689e2710b2 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: 
Thu, 9 Dec 2021 16:21:12 +0100 Subject: [PATCH 084/174] small fixes --- src/gsIO/gsBenchmark.cpp | 2 +- src/gsIO/gsBenchmark.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index a1296ea80b..0cf2a420e7 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -87,7 +87,7 @@ namespace gismo os << "ylabel={Runtime in seconds},\n"; break; default: - throw std::runtime_error("Unsupported metric"); + GISMO_ERROR("Unsupported metric"); } os << "title={" << title << "},\n" diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index 14ed1849de..ae63f4f90e 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -148,7 +148,7 @@ typedef std::array Result; template static std::vector run(const std::vector& nthreads, index_t nruns, T& benchmark, metric metric) - { +{ gsStopwatch stopwatch; uint64_t benchmark_result; double benchmark_metric, benchmark_runtime; @@ -185,7 +185,7 @@ typedef std::array Result; benchmark_metric += stopwatch.elapsed(); break; default: - throw std::runtime_error("Unsupported metric"); + GISMO_ERROR("Unsupported metric"); } } From 689abfcfbc337450f76e9174052868c59f55d975 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 9 Dec 2021 16:21:26 +0100 Subject: [PATCH 085/174] fix benchmark --- examples/performance_benchmark.cpp | 107 ++++++++++++++++++++--------- 1 file changed, 75 insertions(+), 32 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 48add74b04..c3b19abf5f 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -17,15 +17,16 @@ using namespace gismo; //! [Include namespace] -//! [Create benchmark macro] +//! 
[Implement benchmark macro] #define CREATE_BENCHMARK(_benchmark, _label, _sizes, _metric) \ gsInfo << "=== " << _benchmark::name() << "\n"; \ auto bmark = benchmark.add(_label, _benchmark::name()); \ + auto riter = nruns.cbegin(); \ for (auto it=_sizes.cbegin(); it!=_sizes.cend(); ++it) { \ - gsInfo << "... " << (*it) << std::flush; \ + gsInfo << "... " << (*it) << "(" << *riter << ")"<< std::flush; \ try { \ _benchmark benchmark(*it); \ - auto results = gsBenchmark::run(nthreads, nruns, benchmark, _metric); \ + auto results = gsBenchmark::run(nthreads, *riter++, benchmark, _metric); \ std::string meminfo; \ uint64_t memsize = benchmark.size(); \ if (memsize<1024) \ @@ -40,7 +41,7 @@ using namespace gismo; } catch(...) { gsInfo << "[failed!]"; } \ gsInfo << "\n"; \ } -//! [Create benchmark macro] +//! [Implement benchmark macro] //! [Implement memory safeguard] template @@ -50,7 +51,7 @@ class memory_safeguard memory_safeguard(index_t n) { if (T::size(n) > gsSysInfo::getMemoryInBytes()) - throw std::runtime_error("Insufficient memory"); + GISMO_ERROR("Insufficient memory"); } }; //! 
[Implement memory safeguard] @@ -119,7 +120,7 @@ class benchmark_c_array_dotproduct { private: memory_safeguard _msg; - constexpr index_t n; + const index_t n; T *m_x, *m_y; public: @@ -178,7 +179,7 @@ class benchmark_c_array_axpy { private: memory_safeguard _msg; - constexpr index_t n; + const index_t n; T *m_x, *m_y, *m_z; public: @@ -238,7 +239,7 @@ class benchmark_c_array_dense_matmul { private: memory_safeguard _msg; - constexpr index_t n; + const index_t n; T *m_A, *m_x, *m_y; public: @@ -303,7 +304,7 @@ class benchmark_eigen_memcopy { private: memory_safeguard _msg; - constexpr index_t n; + const index_t n; gsVector x,y; public: @@ -348,7 +349,7 @@ class benchmark_eigen_dotproduct { private: memory_safeguard _msg; - constexpr index_t n; + const index_t n; gsVector x, y; public: @@ -391,7 +392,7 @@ class benchmark_eigen_axpy { private: memory_safeguard _msg; - constexpr index_t n; + const index_t n; gsVector x, y, z; public: @@ -437,7 +438,7 @@ class benchmark_eigen_dense_matmul { private: memory_safeguard _msg; - constexpr index_t n; + const index_t n; gsMatrix A; gsVector x, y; @@ -537,9 +538,16 @@ int main(int argc, char *argv[]) { //! 
[Parse command line] gsBenchmark benchmark; - std::vector benchmarks, nthreads, msizes, vsizes; std::string fn; - index_t nruns = 1; + bool list=false; + std::vector benchmarks, nruns, nthreads, msizes, vsizes; + real_t nrunsfactor = 1.5; + real_t msizesfactor = 10; + real_t vsizesfactor = 10; + index_t nrunsmax = 50; + index_t nrunsmin = 1; + index_t msizesmin = 10; + index_t vsizesmin = 100; index_t msizesmax = (index_t) std::min((real_t)std::numeric_limits::max(), std::sqrt((real_t)(0.8) * sizeof(real_t)*gsSysInfo::getMemoryInBytes())); index_t vsizesmax = (index_t) std::min((real_t)std::numeric_limits::max(), @@ -548,53 +556,88 @@ int main(int argc, char *argv[]) gsCmdLine cmd("G+Smo performance benchmark."); cmd.printVersion(); - cmd.addInt("r", "runs", "Number of runs over which the results are averaged", nruns); - cmd.addInt("M", "msizesmax", "Maximum number of unknowns in matrix/vector benchmarks (automated generation of sequence)", msizesmax); - cmd.addInt("V", "vsizesmax", "Maximum number of unknowns in vector benchmarks (automated generation of sequence)", vsizesmax); + cmd.addReal("M", "msizesfactor", "Growth factor for the sequence of msizes (only used if '-m' is not given)", msizesfactor); + cmd.addReal("V", "vsizesfactor", "Growth factor for the sequence of vsizes (only used if '-v' is not given)", vsizesfactor); + cmd.addReal("R", "runsfactor", "Growth factor for the sequence of runs (only used if '-r' is not given)", nrunsfactor); + cmd.addInt("", "msizesmax", "Maximum number of unknowns in matrix/vector benchmarks (only used if '-m' is not given)", msizesmax); + cmd.addInt("", "msizesmin", "Minimum number of unknowns in matrix/vector benchmarks (only used if '-m'is not given)", msizesmin); + cmd.addInt("", "vsizesmax", "Maximum number of unknowns in vector benchmarks (only used if '-v' is not given)", vsizesmax); + cmd.addInt("", "vsizesmin", "Mminimum number of unknowns in vector benchmarks (only used if '-v' is not given)", vsizesmin); + 
cmd.addInt("", "runsmax", "Maximum number of runs (only used if '-r' is not given)", nrunsmax); + cmd.addInt("", "runsmin", "Mminimum number of runs (only used if '-r' is not given)", nrunsmin); cmd.addMultiInt("b", "benchmarks", "List of benchmarks to be run", benchmarks); - cmd.addMultiInt("m", "msizes", "Number of unknowns in matrix/vector benchmarks", msizes); - cmd.addMultiInt("t", "threads", "Number of OpenMP threads to be used for the benchmark", nthreads); - cmd.addMultiInt("v", "vsizes", "Number of unknowns in vector benchmarks", vsizes); + cmd.addMultiInt("m", "msizes", "Number of unknowns in matrix/vector benchmarks (auto-generated if not given)", msizes); + cmd.addMultiInt("r", "runs", "Number of runs over which the results are averaged (auto-generated if not given)", nruns); + cmd.addMultiInt("t", "threads", "Number of OpenMP threads to be used for the benchmark (auto-generated if not given)", nthreads); + cmd.addMultiInt("v", "vsizes", "Number of unknowns in vector benchmarks (auto-generated if not given)", vsizes); cmd.addString("o", "output", "Name of the output file", fn); - + cmd.addSwitch("list", "List all benchmarks and exit", list); + try { cmd.getValues(argc,argv); } catch (int rv) { return rv; } //! [Parse command line] + //! [List benchmarks and exit] + if (list) { + gsInfo << "\nThe following benchmarks are available:\n" + << "#1: " << benchmark_c_array_memcopy::name() << "\n" + << "#2: " << benchmark_eigen_memcopy::name() << "\n" + << "#3: " << benchmark_c_array_dotproduct::name() << "\n" + << "#4: " << benchmark_eigen_dotproduct::name() << "\n" + << "#5: " << benchmark_c_array_axpy::name() << "\n" + << "#6: " << benchmark_eigen_axpy::name() << "\n" + << "#7: " << benchmark_c_array_dense_matmul::name() << "\n" + << "#8: " << benchmark_eigen_dense_matmul::name() << "\n" + << "#9: " << benchmark_poisson2d_visitor::name() << "\n"; + return EXIT_SUCCESS; + } + //! [List benchmarks and exit] + //! 
[Default configuration] // If empty fill with all benchmarks 1, ..., 5 if (benchmarks.empty()) { for(index_t i=1; i<=9; ++i) benchmarks.push_back(i); } - + // If empty fill with 1, 2, 4, ..., maximum number of OpenMP threads if (nthreads.empty()) { for(index_t i=1; i<=omp_get_max_threads(); i*=2) nthreads.push_back(i); } - // If empty fill with 10, 100, 1.000, ..., 80% of Sqrt(total system memory) + // If empty fill with msizesmin*msizesfactor^k, k=0, 1, 2, ..., msizesmax if (msizes.empty()) { - for(index_t i=10;;) { + for(index_t i=msizesmin;;) { msizes.push_back(i); - if (i<=std::min(msizesmax, std::numeric_limits::max()) / 64) - i*=8; + if (i<=std::min(msizesmax, std::numeric_limits::max()) / (msizesfactor*msizesfactor)) + i*=msizesfactor; else break; } } - // If empty fill with 100, 1.000, 10.000, ... 80% of total system memory + // If empty fill with vsizesmin*vsizesfactor^k, k=0, 1, 2, ..., vsizesmax if (vsizes.empty()) { - for(index_t i=100;;) { + for(index_t i=vsizesmin;;) { vsizes.push_back(i); - if (i<=std::min(vsizesmax, std::numeric_limits::max()) / 8) - i*=8; + if (i<=std::min(vsizesmax, std::numeric_limits::max()) / vsizesfactor) + i*=vsizesfactor; else break; } } + // If empty fill with nrunsmax/nrunsfactor^k, k=0, 1, 2, ..., nrunsmin + if (nruns.empty()) { + index_t k = nrunsmax; + for(index_t i=0; i<(index_t)std::max(msizes.size(),vsizes.size()); ++i) { + nruns.push_back(k); + k = std::max(nrunsmin, (index_t)(k/nrunsfactor)); + } + } + + if (nruns.size() Date: Thu, 9 Dec 2021 19:23:34 +0100 Subject: [PATCH 086/174] dox documentation of performance benchmark --- doc/Examples.dox | 4 + doc/figs/performance_benchmark_memcopy1.pdf | Bin 0 -> 40795 bytes doc/figs/performance_benchmark_memcopy2.pdf | Bin 0 -> 40605 bytes doc/heatEquation2_example.dox | 14 ++ doc/performance_benchmark.dox | 196 ++++++++++++++++++++ examples/performance_benchmark.cpp | 23 ++- 6 files changed, 232 insertions(+), 5 deletions(-) create mode 100644 
doc/figs/performance_benchmark_memcopy1.pdf create mode 100644 doc/figs/performance_benchmark_memcopy2.pdf create mode 100644 doc/heatEquation2_example.dox create mode 100644 doc/performance_benchmark.dox diff --git a/doc/Examples.dox b/doc/Examples.dox index e0cd177a7f..2f65625fbf 100644 --- a/doc/Examples.dox +++ b/doc/Examples.dox @@ -32,6 +32,8 @@ In the gismo/examples sub-directory we find: - \subpage heatEquation_example +- \subpage heatEquation2_example + - \subpage inputOutput_example - \subpage gsInterpolateMap @@ -52,6 +54,8 @@ In the gismo/examples sub-directory we find: - \subpage multiGrid_example +- \subpage performance_benchmark + - \subpage poisson_example - \subpage poisson2_example diff --git a/doc/figs/performance_benchmark_memcopy1.pdf b/doc/figs/performance_benchmark_memcopy1.pdf new file mode 100644 index 0000000000000000000000000000000000000000..afd330a5b5df9e356b05c377d2f631d24550f8ee GIT binary patch literal 40795 zcma&LL$ELm%%yp4+qP{R@3n2)wr$(CZQHhOtH18*e-<;F$tI|?IN3dwkjjgQ(K6Dp zLXpldkE}zn5ik(g8CpW|@IcZ3GqE*uHYZ?WXJ#PyKLJHAW?}7Y;z&R*W^LeXB4T1> zXKVt+#|P!)>}X7nV<#{&UDG8K)y-e8Oc7k_LZT3Ze} zPAZC&cv3favGwom-yEGNk_3SO5oe2;<)yNUn%WD^%`EpOF8(}kAExhOtji#hiOd)U zd3NRP^W(k|_`CypJa{sMnaW$oli|0Q(zh`P)#Bgd-O?0{Cv#U>Cx6B~Irg`mY)qR$ z+@*KfQ845hrEu_ymrdNC7$vEghuyp07xehd;91U1DG)N@5ueo%cVl}s^;?8>^JW>8IlbaNI3 zUNOMmCTs*tFz0(J=U9?ckzJ3K&#@<6ILbc|=sgE&q!kpZEafgW1NudZda#i2q~YEEhQPT<6TE4p|`ap#IC;l=yfmUWFYUjM8*86Os1-qf=>y`WDn?XSJfm)%8WK7%8nFH?)1CqtHQnjmWF*xj9Ses?Ehk zn1?X^;W1Bgv;CkrFUp8j1M;z3DcGyUy58Pw>#L(xUJ8~-483B7au{eRy<8lgzkrTe z#IlZC*moVb@%Vonxjn1$NyxKJRe94klsQpN*EZ;?otsmQDH(>>Apa@Qu)`E=u|bU ztuxF)NTt(TN0?KP?t@H-|CvM6I`v}2t)xYNg*3Oc42PRSZc(^on}Lx^mr$a6$j&cY zVN<}sYz}`t7cC!zqYg*7#zVGmdPKl_wKWZ@4Vgq1z(g)z@EEBOU@bDf&^OPDB~%`d z#SVnrH*ogG7ERg1U!c7V@nH;LBAZuW=0jc%mSbtbx3OUoQYe&AK7ojneg%y7hQ8{e zkZv%drD*_}Oi%Z6maPa~NlvW!F~6phf}<+;#UV|$iB{*vB$cR)kq#CUh2<+M>3Y}y zM-bZ~j+7{2Pvy!il`arLKUyxovahV(m4uRMPxnqNiK%Q{>X!81E|#3KMU`k33!6E` 
z3|kZ*<0~k#aPPM($&3NX>e0#tm7-v!AvWG3LHux=>lK3))@X|4AJJJ`1l^l$>}rIXtc@Av!XhY zUKLDEYVQ?tkLG_0`h%FV0{Gy5aeh78U!iDYyR!B%9Eb58C(aHqPPU8P-XWgrt=~dB^^}cL9Ws zL6WJvLUE{;iPBbtt+Tq(x;}UH+Z}$#8EMf=qU6G{2`o{b>(X(IXv&8qGXY-Ra-BrL8VRK=Thh=fwtLQ#QbM8%Dzqa&e!(>X_i#r(Hh#K zO(p7`m^V;vQ^NQU)dBS5G1gbBB@wEwS|JIMi@YO(FhNrY7kOrTMQrid%9=V;VQE??BC#XAizfBtJ?yQa4pG5% zpMjTYSL(zb6DN8l2FCgwJ+|nfVj3Ta`3%eAM2;|c8c$5ah7+W#lZ8qJEZQ@2V^nM) zJC1Vd)}^t79|QXE%GtC!G1!^vX|qdUMXs%q;8W z#-%D%;+Z(=)BppN^pfj#E5zOO)&F(r!vViRFfw1BeufV+MKIt-?rN$ki$3Ap|eB7;{siOz1lB_$JEm~72*x_i2X?G{Yj+0a~@?Q z?01ilvf)4@$TMaKZ6h==x<>|s(9v+E?y1ENy#Cso`{L^6NAIl39;YK~my(oEsKPd`dbz$Aur+%ZVd=S8f#mvX)$)8-TGAMx;IB!U%nn3t zML-1ZA0L|Z=<5uLW8o(Itp4a;kLuXPukUeqxwyAS=RgrfVtu|J=K4mU<8g02U#>M2 z#TXry#Rni|5Mh)&AfS?aurDKEBrI{kUV}&iC|m>e|xQE}EuRW-h|e+tQ)?y1O>A zbLzl9%J?7i+S;sgZD0MOgkp6HI;reI?Y&lnaqH+&kM7vqs-m6*I^;&xhNaKXHg?L) zJ`94+ua96-c2vW6=SZ^N(yB5VuQ+d)x~625#;e*dGCuO?PTOqSvNo9cb@X)dZ11S> z1b+@qY&dI;9}40oD%*dLF1Z5i_^WMK%`S?zx|-0{k!Z#qf9Uypp+@NHt{=HkM<#Q8 z3kK2S8+`vst@hGS{-IX{+bTkt*c$&|X#KDH-@wbr{Qn_eMgk@#2KN8qU?u`a4mM`i z|7rh||4&K4$jQmZ{Qr}XquYS(i?3GMaJF2zT5YykmllRwSto{DZDg1l7#m_Dn31!R z=)bO?ayQO?{wnXPYA=0jPpo#VcH~$=BO0-S!aIT}#Tw6eYwP3^xAe~(sT0{s)*^4lfuHliz0HM zi-MX;2w-{phc;KB@}Zmoxw?R70R0fbC^FXp|4c_A!~x~o8eKoMYEShKE)5Q0K)fNW zYi9gk>G1JvUrzu6wF)XaUhyTLt_Ri+PHw%BXq!hj+i}g^{(;!{Ggp-H@%ckeA?t{8Vg#;!zX; z)4dVC*?G~K&8)=L(8bWLw)!ZTeo#+~rjTAw4dGl}1-#VI+xsq&+g!mieVTWoF7A?k zjPLdh?w;)Q4Q%XeAJ8G4*u}|Uy{yRD^P}JUWcWnv{ftuu)BwPlnVHeyu>sCO06a4_ z7=KW8ccu{E=}D*b4&lLk`C{Q>0ixiM1o*9>_s8I~y!JeN!$`1PwaiF35 zw>PIz^`V-ZTf@GPe}rM0-?9C0dYf9v1>!!8Uey7n{d|6(jPDtBPvKr$b^WIQTzdZ< zMm}mDIy(C?|HjEGFApH^5B1N&=8gVlet8f)Xj`~KmIZ47_znS2kajqVNt_CLA( z2YR1*kvo690Ly>6a9I6*V~Wo2>6!%opL6k7%uWnhKHd%A{BfUs+P{Bf-*ps!ZG?Y& zppu;%o4?XBKlDF;;x>k6HdilpL2(5xPCx1ZpkZ;W^?!S%n*Dugs(|LUH>PiWEljm! 
z_)!Fgb{1cGY?@sXnqELNDK$DX)xWu^eJ5CcHW@W7yz(0ZBcF9vp#Ap_EPv8(4VtET ze6?_CaS7jPQ11<#e>sYSyVC>Tb*M%s`ylBY+*$4eJ~)ryhXC*O-q(ek^G|xhF#3kg zZuUR+fZp>@!0H?vLq9Hs+1UVO3w-fXK`ibxGhlYR*l0Smg0Av^Z;jsNA zPv8y!>BqnMap_Qg2<-ln5B%X-nEl1y;6B63KS6wkl|O>l0Aw%x=fpvy_|FM~SMZPX){$L+8Yi5m)#pVHFqn&w^{${NX=C&qqK_Uq|0-4-6T9z`iUX@A&TvKBawT zNBk4Ni`*SP(a(O?d=wh$U4IUSAJX^mKjOeNINRBO2}eWykNEdR6jl7BzRm-aQwx_z zfxku%giXKT-}>fX@b7{PxA5QXWa+;LUk>z)oZk_@244@GpJAUxTx0x~@Ooo$YGFOJ z^M~+euBm>~Ie%5C*Z4CI-|DVm{5M8_`lsesr;l;(_{YD3IVV9A__Sks5a++}l{omP zbog$^en7t2%~3TGQ7imH-o-yk!(%5y$Cf8B-(wj*_VzFK?_opYUE6pnFBR+oaqlCn=GNO ze5+CWUJN+@Ag7gZ%uALS-mAIXI(Uf%%jTC&!0ovAqKczmimb0C<@X@#5c@v9KUZOJ z7jlai9}d4Plk>T}<^kK7f>?QUzNL#T!Vyya1$Ex%-)1J%n9q^l+2`>zBE8AkZAngq zp2gz?Dd$QHCSM}%OMp{`xMVrs*CshbF=FUH1x=t)KubBJIEv+KVS*W&&C_znq21n| z%srOwh8rQB?S0D_tx5ShxvLxGqIR=ZF01K!+K<~iIfD1y%#oXt`3lD>8rP@zPDrtb zsB`z;rNf^PE|%CZBwV3vyAq_F!2VLHu1+a z`|e&oEK(q0AraL*>*uA^FGOvspG*-;1yws`179owO&H|Q*S3|~^$2AK-+cn>MDiZg zU86EGTsjEB1;`<0IUjxM41X*&=}^1e(B#u>c;{h+c7-`m_hh^l>FA>zYSu}jn}#Z& z8%aL-(5*)lD(+coG>CyTebA-8fZi#m1%AO^U(bQmw`cpM!d=bKN5` z=2)#*xDGiZl_g%~iZIFIv;xIMg$KbrZfD}$?y4$9U>W6bSn}wsc!)&D4E@TTLeRW# zRbi*ETofkbks`pTeb%*b{8>r4mC#+t8?0i4R~6CSceAc(eO~hGk`c@DsWenD61flJ z?sZtz5$Q4eYv55vKfTwm#y8hXo+V%d<6{~zr4xMG;bcvhSywAPUa;j*^kjY-;-|g9 zjrBcj9pj9>UE{XLTE+UJed=NR#~w(5x7h*`jXmfYo=mm;Kc$!LqRLxoVc`=_Sp!3> zLk#v`FU~=NDKMSUv6(~nmn&HE+XJb*q~eTv?&yi$giglsLiByv&)nNB7Y=MeN9nrdtGox6dma%9%-_9s3BZ~bZyBl zZ*(OAU?y(9(+0AChbfGpKrrCF!(0p}NVG@Oqi@?dCpM3_jaLQNSjaovT3Z$NItc(Q~0g;=f%;@r`;4M%^6 zgUW7B!)+=K7x)fWZn)*J zbSA=aPdv?_zKm=(MwYE6-=>v@NH?qVM#OdGNF2|cS-#!l`hMIsnKSudqM{<$w)^lA z8^~TSQQF`867hRwdSE~nV^m*c;0`^%H@pO`^cOhlW6N7TlaUy)#$LN>6Ii;@L59q1 z4F-ZVC0fEMF*AZ&NFf)XT)(7=x*w4KR!z2;Zzzq9J;rBR>!sLwtji|Z#-IpldB z)+n8>NIi6TOql6G!uluA0@2;swwi}QTI?upN^?q8+J7$D%D1anF?NHMwU62kB?o%2 z;wHXsa6F+u>kBTrA|;LN2*lc;hctp%0n#UwLd0a2^DGhik*;!rdzPrzN-Vqz$I|N6 z%(>I^uxflovK9A`C=*fsW0ByJplFM`j1(ra+{@=;d+AMBB~7mwzXRaZ0|#{{elGc7 
zp1*?n^Gks74-{>x@vbDgNkgp$6{FEnT|NO5+N7>)K{}w|`qdqRc7Usw^0bw99O{gO|1tb$b`=Ev1fcAp9?6c>+J9<1l(ehL=8*R? zx{^N3QaP73jIml{NLX02jUORc6{VDq&FBIHbM~}Ro$U`>sbDa~wYq$)MR}eboh>dT zQl(<3Jh{_QdUt$Ut4RdKi3truzK%$ivh9(Ya<&{89Ei zD4>>-i|SN|D1Po`lr>zmr`kC*j*7Ax#z76`E><=E@Q^Sqq3u^DgQU?g;%{g-bqSh9 z2#rlvvEn3C5AJn8?a{bFu{s=L*6r=S*uREzOK7qvRXiu?#I$mS5}~FgBM4z~SLl?0 zv)@j8`X8B<`qc_2yZVxLw=W~dZGxa3;zi;J51Tv({is!s_nciURe2LD&|Q%dfUXBW$8~X=tF{N1;2i3e;rVQ_=9Zo6Fx6rnw>PXUU%7ZQ};$g*8<|=ZXq2t6w`PBE0gd=qWL$WA@P%B}9rFAmc5*FIohhsXI~|}wt@vWIE}R#!v^+7ng7%zy8ghBd>Pab zE8a=mobljFGX>J;xMZt!ogS9&B-74i+uuc;iz$tsnlV6=tzxwTz^Bdl87tC^&zq31 zSJ6*>Jnuy+X$}v-&W7yq&o(J&k>C62rmh=yHd~Q!{^3E_)J};oT4+VRT;LiK4@^Wl zhC?%aD<({ls{ifU5_F-kskPTTuF@ETH2R6NlVhmH&sS$f2#u77?-(9LGn=uNGsvFZ zug1*w*j%(xP=($FO{rLS&*YA9xE}^gA@=f_|v!x0b~{Nb&2V|A8IOis=|};6+x%7Vh)IOcIJ`n;J5QO z7HI}fbbZHC62? zSTo;+cM9GzvGk?#Xa;83A+cOZ5dn3Yb=diqOcXToyT`Jc^yL{URiGrkTB8$ZeZ!P35J2M8Z)wWS zeF?&nc4@eTa_y5SYtDmU(R56;mk8RBb?Zi|oMnX=JGq;7ATL7%uGBrZ9g!45Oyk6x zWnyKJf><^Nm{=h-T81R8?`MXvV-F)3$SZT%DN_+*);f+r;C-()2y3Lx1CV6+Td(CD zWWh%&s9@7e&!K)Z9+s3HrjZ9hszW704BxI}U3XA~Tl&P`$U{w)lp=wd8kgLA%G{@& zk;v-SBNC6~E|qp^9TeUj-w(3yuhd_*g0-!WhBT6!K>)Fm-7Fks!6$-c9f<@5Kj)Zs zXpF3#x`9L{sepb&`xxn1S0p@IDb|&Pn{V?dT9;KHXpR_#uKVeefJr$;L}57S<*V2X zlWC^g5hM4|lvg@$Na;2_jo|I&#W9GtvTvEXkRxdjpU%m|8I{%LQL6(X3- zfi7V(uR9T=-o`iR+1r>_)Yj^MDrw$i-VorMSmI*-EohqlurlP18oKH&a!<9p6swfm z%Z#^5CrWB$wUE8CEtU~@K7Bo;b6S?L(_VwDR^uo6=| z_JGX5vgWF}_quq3Cxhm1y;-i-J1QaeDpo|TC}$N{MuVbjnui|V)Wov$t$U-bvEd4$ zWemL!fSy8OOCly&+(BQETD)U0N@^GulhZ5uh$Y4iguVH@8cS1X`S zSgRPW4i4mCdKX@?--Y0w9liRh4NkJOchqN=smub#WMC2jnl2oav&L-}$U-fzJIc=% z)SMb$jY%`RcJa1ejoI+F8Q6xjirK0#wtm_Ja>bH>%hZFuJx%%Q%ljlj7m{2q(S!KM znTdlv2wtQ9qxcD8o4w?j+j92)9xXp~bJ`+!>z&i-k#q=Vc3^hOA~3dab|UxKYEhs5 z6#cjE6mL!AX6Tk~3nTOi8~voS1A z6Y9}A4=!}oV`6a-^G*&is z=zNO^F|f^;tFx=Wx#OBN zdD!4uwPS(T6T;NyhS5TdgV!OlEx#=2(gB5R^XLOV)lgg~Md;b3pS$obOH3%Li@>y8 zahJnJf%9mR!qj6*>)Wd;(sK;Cd)Rf3we<6Vhs(w2tYf#**{y^w9<($fyg|deM%xNl 
z)vx=FC~wpzZot5!7k;T^I9(Dmesnr*FvKeR?Nq8=w}EbyGzHYWs4z^?(^)}*a@27P z8L?gH3*XH*u7phF9Sn3Qd}Za4ByY{apLO>lZZt?QssQdQ4469}hOVhcSilT-O{{IW zlyPH`;2nQy9pX&gZC9w&TO)?76cQ2oq&M`#lF2$_vokVDZp9j6Ft4lm6>Ul_vS35a3eEC_*z~cCI>?n`*{um)H z_QkePSWe`H=KbdLc@qNX)1s_(*i3_JlKgN6pSYU=sHnN&!b(@X%Zgjbg%M~p4xQy9 zA~e~dl3=6((pfn)o@<03Vxh@8)NBy$yJ^=vFc!Xvb4KVHaQy?+$d05 zaVv)pmJ3AEFvD7MoOXT@Td&PfRpN_&hr!gM!odnJ5WN$m>4Pa$>+88lTKNixT4#?c zrtbw%XkvPTOI-~Cy*YnxrGa@z?JNe>OOGDg^YsUCn{5#}MfX|S(q={cHIo{_DIQ}8 zUlkV_jW?vV-yKkbPD)1o^&lvgFzYtCGnXVoT7K3>B+18fg@4e)i$O;cfP5G*^kmZi z`uVF28h!;2Ml97uYu{T1(=$`7bW&a)llkb8*fN6Dw1ZG3bFaFYUYJ{>)Z8&iCcq5o zOE2%qKS$=gx(#(X_QF#B2ev{f(d3n|wq}T6p7jJWdoUkU=%&m-JP{^TC!rx5VylB# zvX@`JNbW@2!{EU!yiHv^>qXX0)AOqwi2J_r{3#bduCx(i&ez?Di^SqA^+ zGV*f$W|xDHGwQIO_$#K7>%&O6kJZt;pu8DDON>~cI5>_aO_9xX|2d3VAddYOSl4Zo zDMqhv6c|;))66vOtNuj$SWwkq`W@P)7xPi8hTs30gHo0}OXF+mE+>sEZ z$12sB$7^m$PrFbw%9w-Z?lx^RF4enYqc5f@$Q{$4X6)9WaIWjs^&SfPg$g@K&sJ3+o^lVSsZ6)&lKK)=uQN=mQ#sHC2enT|bHSR? zR!h<|X?kVHVt}}Uply_qQ+oi?%SZSn6w~FpRX+`97J{0Ez*wJBUu35ke0^B0IBOwj+k0cvui!(-USX&HcFP)NU96ojc3}Eu2hTqmJhUWn-g?21foEwv5%)hI>eGWg z4hMF4KQbjy?_}$s_K&bV}$TNl`p*LtY5s%Iqp{7>G`Piq-4hw3txiZrs)(?kJY3U0(sQz|5~EOn48odAC2e6u-J=H)(9gkN+tVZ)+^`GSfa)Y@tx3kv0^ z2Sn2>p|VP2WkZ+7KoTq;1~zhr5^2(T4a+R%is4{Vq{>eXxbalbw&eA(n6Vbetqm2;B;JRWkr5a+~4VbwJXw(IolB zO4^pG>f+@U20$b-BdN`%ZN3^@&(hmg& zDo%9_+XU@LGA|Aj5Divu=w$H}EeJF5;dAY24aiK)-H^vEdjKCZRbn=%$sV1Oo(V<_ z-D0c=o&*cS8ykJ=!rU>%#P%>>v|RC<3Zr1` zG2bO9(YJ~&xht10eaAfU+q08T+sn6-!0n_UQzN(%S@KM?RNoGPAYh(n0fzR>JJUC- zmlDfs_=Cue5qvxB6p}Tfm?j1RkqI(!%Og=-;n#kwVuhs$(Ql+Sb$&Q#%x|QyrAExU2X5oS|$E09ZcBXz^pBS>FvVt%7iF@*@3wV4-22Z{3G9=v zkD`mCJQd8_u*YlOlV*{C5?yO307ep0=OG9vTWpR!HK`rmb-*HBdNE0nMe`9$4I069 zmV#0Dd+C7Id57I5Pt_$$y2({g{dab++*d3X}p&KIOU z#WEpmn6O%^Y>(L@mIq9ymb)yijd#MI;Y3Q$q|3QtzDW*xWInhy>_x2t1j1329p;O^ zMDrgen0M;Xj7nF*WBJFncDi{5G-c_8xhOIadFgA2UJD(sJ8>RaPI3rdZ*kJ>&Qhuaiuqx>xJV^z3sOCoF2C@v?X`7(GBi^r@HrXE?hn+&YL=4nm9@oATh<(wWA6`OD;3v`8wOKn4^h$65b 
zeBI>oI?bdA_7LjMxaL;wNgVSbQ^PGe5vEr%!p%YDSRFb49kuggYZ6IS@n*2zk5^|5 z%%1+Swog0Kf{*o48yOFC?z!jY(NTHFOg8w?$9p;EBN42`=af;QPL%fTQ5<=C%OXWQ zhER6KPj{zt)eKw6Z#h)HIude^hxC|C?yjaCk}T}p)>rA@kLW-Rcvi`9fke-q_2}Z>`cJI8#>-srVZKRYr;jz9??o;Q~97x^I_ZBJ?CSP>7nOdKF z!l`fsxlAsvp8*D#I@JjOM!+x>pQoZsI-;ldxO#{bgEs@-N3)QdKe{-6TYu#0RzDsU z%br8VqR@Vz7f0ZKcBO1iFZVneQHMot>+ND@a=D{UwHaoGk~%F;p(lRh_2Jp@%&hG@~F;QYXh62WvMhO7F@A!SM+T@9=X7Kmq>mtf%KND0t~9P^BNfp*NKh75bLk> z3aJgD02IeG3^rg`i1+%oFXXw&G!dF>)Be$}yh$&UWbx2u7JyUmL#;sC3W?1)3pT=X;e|!iOe~4!mrW1_D@u!Ocbfla*I&*IWKn?Jp1?)BEYQ@nNE1E%DvAbU_Mf zku!`*%3VBCB`x3AU=HXhj`W91C&70=VqlgF=-QeaWLT`Wa3 zn1xiqSErxrPrBD?F>^!UoQ$%#CF9Sc$^3hUDJk} zbv&a)xs+q%OmS16owQK2x_r||$NaYfSlOhl^gbV8kRHMK`KBNIItImZ*AvZF8m{K0 zSXGet>OQg8v`lWY3H(7=!}w)b&l zuMzT-h6n8j9PXc|(}`uf{rN9;F>Dt)`-tgeg+SJa^+tD#xNhM2p~BcYC_o0K_8St>6#3OZggdh(5Vn`vz9DP#|dtIKsmNVbs%YJpEpsT zbc`tLF`?~>;`yY!8`kdS0^S1|P~gTErwqM(JBM64ebwZTQj(}8%9S;Le=s9>b>52J z@Qt$cb7(Z)5Vr|ZV7{WWy2oaM#6aI;u2BiY_rZA>(++>CPHiQX)_8WZ_%(+Hb7=3$ zF=HcC#$$~YYvuxbC2gra9+4q0C}$8_$3#`T^6!vwXC_$s0uzb%^1+@<(?0Or!~P2y zaB^)RI6N2+b1q=}>Xn#bAcpg6n(__sCgcJ4iX-@Xgmd_Ygczrx#~kA!buoe z2X0_RJk>338Zw?vcJR_JhKd4G_{zx%BrsBX*x3a0{#&}Y5b{@m`>doIkjuz3R7`vl z!9}N}AiTEtbwJ5GZf1iS0IdS|igf^&g^-Ph=F?iq^A7x_q^(v>Kx0^;?pqL^N?BbHLJ>~dx)Tfdsi={5wF3@uGz zESYp%MSfE>Jw;V_0dM}VnM$fX|T`Mar6#WzB z(%7kDC%qqastZ#9?YQfX@FchVYKp3rb#QP}j?)egKxpb5Tx7T1HD!x+Uh+2~_iOf; zbyX`OS^uaO{^_>RN%>`Ii#-4%4k9bXWo%5+ERPcA%5jS^9oLgUW+GH><(3a6Mdj~J z%N`#vm(9TMx4Qztj4;;U7nT3&_z>a{k#RjyeUg#Zfy&aUyros9M-$KHQL4U z3N+-ox%RfTM7Q)^DDbx>cv$mYi(S7pnXrOzf+7HaPMRY_jb>kKl9` zmO2f@c`9NJ>SV?Wd+|x0a_-rR64TZ)D^=T06qFz zC~vLT8CUiI+_awIu?Py&n=u4=+^e%+K2eKeqyoMYm}x$Vpg)rhLJD?$uA5j(d38>D zmp2~|b#%uX_87_qP0|uy`vKxj?6&l|MmYKK~7zcmYQ>*gQ_W<9}7N#51q6) zTVyy$(HJBGuDdK{DTrV9S?i86*++|=|7}T{H@_kT;NP-dTWGB21))8ljpk6og8VW= zUZ0C)Y)2IeGd<5QjXeRmEKQf(nD7>9Wy`?{)Kd)>>Z+y8gRNTCoE1H|na5>&US!BT z29OwX2!tCXvS48eRpE4?*cPleSQQ&jc-WMOdzXh!cGDXn2HxH3OufO;obnFvqO)u5 zba={b*(k61SIJZjuT8+9rZm=JPKmD9ZXYJz8sYZP*l8YwAReMm&3On`^H59dar-$F 
zwNTw=z^;-4XrC@iCG?$iU0yr4kbSmKmSlUZhdlHzRH~TFo11#`c4+uQVS9ZXCKs@sii7Y9GY8Xiu}!H$<<7{c1=Bk!;}Y_Mr%QGapA& zxhSpT(qL_&N~gk|2s}#Xb+Z{ya{)QH7?>9LU(gB$N^ zDwX8QlSoO-;q5P(9>+C3{bjRYf2Jn)`}}(Q;Tq;2kpg9xu+vE{lZ5Fk{KZKImMBI%2_sN*H1= zM#_X#?`5&waTQ-;gz9NjH`IbGB*Cq zPuAbG7Efflo*S#TtJxi&aG|jQBzE?p`=n)xAH8b;jM|Ri=S`Wb%`oH5dp@gT4LsWN z;qPmx^Jf?dvXBR&#_0)bg4|^>?%r$a?y}KiO&s0^%IYjdDD$1<0pTdhuP$wQz-@fs zi}@Htr(NOm!rWN~kr~{hhv)VrCgwDF@=ZZsVYRx9?KC+O2MzW0qx%;vBh(k29rQs% zdpI#)G@mo_S^>GRcB3>Kw8f$salJ3q%Y=tBfKvlbFUD*2I%qbM39do&GoBY` znMhri4e}&t_eUvug?kt}Ui&p~^W;O~cxxHx66Vw1O&aZ5O`u#-R!CRh~)sBDnOWS=Tg3@~X*1dbADx7R^9(pbkPt zi_k@{=0;G#Wb%$5hN7%!eSN#y%8@=$=naCyF=BKd5v(g%5yP#prDIPFWKDJX1G%3D zS^UHyJd2~Wv=Fio7{9V59M3WxYu_5>w-mIkMD^*c@4Jvk(t7+XX$(;FkVlkcv6^P2 zNxv?3r}S{L)4!_EHU8n9vUWAu+X{Ti_GhK#+E)P~LjP9c9S%vLvu+duSM z7v1Y7gh42~)oIeNtF^b~skfcfuQekyJCRendkjMh@*h=V4?{V01Vs?Dr@Kgu`L%&4 zp5KN-3ozbYEI%22$3OqBq>nPlSdHp4S5ku3RCZNjd`uw--o}j9SpB?lB|2lSUL4pK z-#sm}QJ0R4c5ihz)FSE1%e0{$^v7zo`XrqBOelbm?gk|v>7&5HQdgvX$EGsET+hD4>Tq@*0{M(TG9_P#$D1^>W3DFqBRp^?9x1byx<0vT%J^vb?^)}9J)4zH z8gABOZ@dIq;*@ME9(iP;bCfS2-%5}r2Z5b+y#*)&Tj)Pq#`w}I&pQJcWZ4QG#Pu&! 
z;lF<%*CI8BF4VOp{+Xwib-cxU{ZwrZ;5fim2~ppdh`7{os6CckTV)#|qO_^^Bbm`i zi=dK++vboljJMvOCy#dsIAFY(*suV&&fL1jnsHWBh|ZN0w(5HL#Osa(9ZwyLd5*fa zP%D{|0f;H?)KN%4UJ~+$GhkX9F9^Lm4-h?=2`OsrJQTuP8D0*Q8NJ(g(Ou@rD~#`a zQ8=Vtpa*3aEp}hd)@okDbPZ_2Ld&}i#|~(DX7rzw2UUzo#AmNimLzA+*J9i+wYT_U zrsom!RClVS{wn&H;;Te07_qZkOtB)72sI_%l0dE8Kz?15(qRG3L0~!VS|q4Pe&1}V z8s3wE1`Pz6{JbBqu^)x_<5Z(H_JDgCu<(rat)_0xCW3%PQ0uwFkunb3^t^g*6H-Y} zn`396NQ$R}iEMV~-{s&A=1heTTzx#v2|9OmzYqmH^#Cb+S1a{p*eBf%k~FzglZ09LS@jCr#QOjrc%TmiQB|E<3NF z#Enh=2m0Qv=FFi1WZ?0uL)S?bZrs@b)-SOXzVPynh20HdMy#8e$nOW0fKipwo4;BL(qqe3BBcqH3H(QB^4F5o+#2V4!S6mo0rTNrn?2ScNwQpCkKH|&5eLeo=54~o9?qKCcn3{t4$W|x`) z+G9b$7CE@Cr}Svhel2lH$%Ib8pE2BTfZz>~N^C1SJC=xh&r^ zXYFiSU&&@z#kw*ay|2eH!bVlbZGI63eTC93`61!w~fR_F{2!#hmD zv61*XtKAeJUOvU6`eGDHh=fPrbFtD7y|o19XgCQZH{NILvl0k(+YR>+-va;8-1V}) zgyZPL$1Oy#@G)-MG!Qbq#Lvw+%cCP!Y zgEp5m;3Yg5Vnn46NlA?uu9}&-(Vq!;lDpmLI(zDcwe%>x)!Jk4KR(wreT8rAy(A!7 zz3gbFF6SyaS{gAS=zG6`pp~4mf#a?I{z01T(>#6vxw`1_4DytlP~s3`HN^U%Z&r=% zvIC1|CH7`Z4ew6lID-_8+^&_z|mql2dyZkT4?x{%=pvx9;*~VM8 zU0t?q+cvsvqsz8!+eVjd+qR~^iI@{{;>^X#pOAOC_S#RTJjxNnA=iubqO=cos-x=3{ zC*-N~F5J4x$x#GjT+kOhP-Yl)w2R`~M(k}>n|MYCFI5&CmrbiL6c*; z8x#e`RlbrTgetrxE30vdG#nj~>R9>4p>uSj%k}nd;Eh%3 zT_{oH`z|UKcs*NY1+tBIrU%7s-Yw}|YPHIPOG>E!Cd%*yQ{geb;Kr1$<`hS*8(*oc zn4$IkiQ%rV*Eegm#9N|pHFEZp{9cAT18pcr$5d;tc2qCC4{yf%DF;%uU+xhI^NR~s z)6Q30JTg+gDN0P>(e8a2 z3o11u8o5M!91}_eO;g*wwi&C(G7aMm^0H(tUUkeEGoJ1EFI}4qLUzr z59G}rOm~Wu1dE>>Jg3E8^##frSz6GJ7WyA^`m2L<^$Je1mSu;tq@AIeQYN1p8GM z4G)gF_LPHqK;9-hMTS6lgl)T;0lZ>|OgJTSDC=73{mqIi=H-i|jxIH)AfvQ<&mS^h z8{DAxI^ssUq2S$znjR;hzgkKVPM6>DXWYGrJV%_sIpkX+d+vPI-k1u(?~9Bgph5g} zijufbq_!Z$&IiexLB z2d;i$JpGGV>z7?bcq~0$*)apz8F6fcjKMVnj&-?@$-r`PYEz{NWK<;7Gp?+>|HQLV zQLp869XozS=^tlXWFjTfw(K)*R}n88bxDODbFx?#920pvPQ;r;0FwH!MBs*N7W;zW zUY<+YL+kUj6*TN##5wCA@eRt|r>X}KqZzb7QjDuE{ok4=9jmK4fP(f}jT9P_4%tD- z-D_EvZFDrxHE}UVDiz@}reosj5^Fgg)KxeN&^t)ru9m;MnZP8vEwkOp4#5j`SYH3C$iISIJV$j)Vw?#QE)q6*`w*p{iw**7 zwlp_~>msG;8M@?|*wuPp_|7QwVx_Ci)(;4*oGw%SHft~}S}a~AJNj_439-WZNdoQ- 
z15;JDdkiHfXDK_*q7dW{?xp5eJ2vfP1RYdRM=F~GCV*#up9ZmKtx^+P|G<=;p6X(n zmNB-Xyw)e;LkmAX&XQp(>s@gVuqJ!NLb^1dVRN;pK3H^@OTW`n+PYbftKsz+D};`R zS2u8RbDzerbje`rsOGmC2Y@fxle_TC88pQP3EH+TVfY|ZDZRESq=#^HiVnV!koX^8 zHyq|g-}x>``!R`Z?93sB4U22Sjc{Rx5+eT%xBJeT9h<)A#XDJ-#99n~3n5}6O4JP9 z%IpS@X1BU(R1+|34f7>_8JC*V`GZ#5B}of$o)l`&Jo#B*AIr2uES0L1`6TvPNwFQR za{GLY*XVkpbK6ay8TB#-EK?YANn9#E%;}?0JUK8wkUDuMSiCW~=~ZHgPz#p-E3!PT z%bYbRbb8{HNAdy1fMZYxZDCHzY?tX}G1J9D2n|tlai4V{ugu4>!5F6*M2ATz|L%$s zJB;r!wqTW8t$E*?z>PZpszx?uV)8^a``vRCZ0}`_c3I3{y?_|tRb7n3t@tqclJiYG zwoW%$h-3mgJHVcKF4C{67WD+wwl-C^@N(BmhiD{>_?n-?h>8pXS84iYyEo7|I2&j- zfoWCtiQNCWA3bTGT|=#%ADLf!fd3|C6g5tBq$s-O;90pz77jX-%z{21W5@Crr22h^ z*1)WxjQpNo*`9ht(jWKea*oEt6jHmrdORi!$=kz-$8nX%kqjpay0Je4N^)r|I(vZK zQGJXtUkAjFJtrNSL9?JwnZ(bfwm_9IIDw`y8o~^iF!t}8gnuZYlO(u^_WfLMty z(rq+$E7mLX(or~?Of;Zk4=5A)a7=RoC~?h_Axz{)r7JSJ+!_o3_L?FegQyFrH77xz z9rVs*CCcLol5l`$zP0TlKd5s2eW77f`ru*L60(g0N|;z=0vM>pOYIN6{1V|FqsI)6 z0wcHkJ{`F$8Sd0oa`{aQ1|b-wC1`oWuk(j4r%QJxjucZ3pNJH~YaxGXFX1IKtJY7Z zD$0w=It1Gbddn%-Epe9X0$T~QB6!anC_h(`yCE6_m00B5!lBDs!qa$5TSg72)L!v6 z8Xl^VzxO<$j9F!2*CbecxQ4dl41Rw@)r&3%ORmXn%mll<*%i^)?BMH>Ty>Kj2LiEs zOrH&I5%JP#vy&H#e}NYC z$cNu|qhs58n?4>3;xugwmBXhz=P@%XH|CXab86t?3jz)SqH19@d_^|u{vv6AVRjIV^P4eK_T( zhCh{nn4J%+rvWZXsvXcW$WfEa)CQ-A9?s`v90VSTXQzcgt&F_#Ej=>8Myw5&Q=?WLRDv2h(Jcg~}T4!Uw9w`VDG{cNcb4 zat%!Q?R9bG(*BQzud=aIquzE?#Mo36+Gc5y$Pv2?K%1bR`bC1VP!y3l{3Zlt7aP$h z8dcp%Uoeo*D77+)(uY;!#zu6Iy{dyjQQ;&@XG=45$D+2`Y{3qt=P;z~hXUeWb4zQ% zh5Ocgf0=&jZh?TrY+Q|lg@eplf?!n$l$Ed!hzZvfnyL()gp~;x4~jcaBS**~4;QXV zpiFLdm!92Y3BK9ngp~L=yc*7sbN!T9ubWwdoKB9A5+yxNU{5bImP`UNk}orS*cggs zR0(4C-@UARI+d#j1E!NUh-#)2KTi=G1DzG4P}ssX!rirTX|268Arwg>ei8`3V51Yw zOZFCc4DA%Tp=V=BR`L~m*P4M(@T8_K6vw)eC~xdun7O2;Ac}S+C={f_InuSYSm)Hs3&^F#gk#6*eM|os@p!(M z$Jf1h%$44a+tdDKUy7xDYxwufnbnh*NqNS?0^;+AcjQt>+Lv2PfbDahDF7gLf&gj5 zO}W!xe;mj_?voxhdxugNi{R5v8Zs>UW@)PFF=8MT%Jz#>F@Z-0M_7V79I61H6j}Ro z-#bmBQ~$5$32mhzBegm7fow}E-@{U9%L>5%Q6H^2i?x`%4YIa5fY!%jWRKFB_x;!f 
zH!*tj^tL?|yznl=4}lasoIh$_e5_)%Z=laNHkc3$Em2Njue`g1V+7G)jIoB1g{bg_&v>Z z?63(TNf>%JN-`8M#{6f}b?NU_;^$a`3ZtvwL@j6zxSu9`7E$@DZLFyqoB$mp zMo-rMHO`{4Lf{F@!1_YJX8P-%xovIVP!xf=xYCHz_VW5q=-LlTBWPx zo)WG#cJ+hLu6FU|J^Pb_G-~Rjgm>;^heJM?T*R+ITVC_&w=%Kl1{dczblikwh&UNUQ%GCB?Q?Hje z2B7V+y|C1@GtFS@apL(e8*nOoEYx4tULAP@x)b$)G6OQ9jGlbq7b;M}q%rkk>}bvH zRE@o1EU;`>8~0D_JsQU#rG$3va!R+fJ)z9(-(O0w18fQ{Hr_jJR{2{5+|7BB?&5hn z z6YJsdO%1*Z;Lb`fYF0AhOGhGD&UWL11)m+4xT%+5^?TKC^Z7j6)M&I@Xn2E;0Pql(&4n)0q~c zk_vv2baEzQHn{+Lsvq!Z>BM|%QXpiF%vNE77jui3vp%(IDr9Vj!fQ~aQ9{TZmOwEn zXwz_CY6WBtG%0>Rv=h0-qIRqc80&ESk_>#PyMf&7oQnWkp{3&@pc=wBX`n zyP8LOIM{YZx`Ln5+ySQrgstz%d4$5UPsa7Cb6rT;ubuz$U9{oI;t>1B7Gayt6ib9j zFzB=(^Ae{#3sukm7VOMH~5NW zoqH)n1GA}wcliXDTLnUMXGD*gd|JdskVLd~jp+CBU9ErUggQAL%O2P3md+6!1N%mt z$Ohl$>d|}$gNPp96Doxwjp!>PhaUGt?8-u&v2n9qN=iFVl-RtZe})-&0$gwFRa5FL zv-K0SoAO8G16%vs?!zyCBA*Tcf!Hfvfx7w)5TgZytUc8AnbilHYy^>KG-od23?j9& zojmTCa{qmb-w;$NCG_QyA9FUJJL_}r=aZrrh1F=%fbYzmm3Rg5Zifh1$Ew^2sX_L}xm53K<3!t|0DffApzI|~ zl=-gieMOADr}d!b9B#mLr#COs~I+Cc(*2E`VPke-&TX=_>5QXvw)0=3%bk`-Sw z-Xvk`Y|A^6TM_5}EiEu}q$ZL3J>Q^k=d_Bb!wk>4c=rpRQ*Fx!Qr$(`MK?>U7Ng!z zzeHf?FL}(|P1@hAI7?)pfSeib;5EZhr9Zj&j0#E0QLsh4Sqx+}hH^3bAIBrt)p`?e zk=>KY4dJ7T0|DdApM8VPDypAz@ckRPNX5u@2g`gmu?=ErPK#a6{ z{b8gU(}foCu9;`*l!i3N?uB7NqzNl5kKR?m^?Y!8dJ>)1jOi@ayB@}RJ&TOG(r^0< z&b<1dAqBi~e|IN{gpug*mb^~|1+PFM#)w48Vw#>nJ@5Au4O7&~#yb zyEbWZbrKIvxOs}C-3JiMw^(K9p0?`Bv zCi|^i!n{;vFL+ouB7JyaA(qq18$EY>o)W`H?#Idc-TPgGRlt|I1#c);@_d?#LvPSE zCk9xvek4X2)jecr=c=A8rb%+*5U^#eql6ntgT@-zeEwKMDZLCV1*BDYT@lRs%4jx< zsy`F>F-;Z0u{S7EK#O>DpwB&LS!yvJM6DSA{9b#tiYJgg%b0kKQ0)ePbQLxmn8zLXd{s;4L{iS@{t41;L;QCUI_-~}aIn{FAsUx9WQO225{aTz)c zvf{;NK~hETNeiy+MNe&2bBT4^&eAw_iQm$B@E)(M10|XCVz4d@N8x?nxtffr}!TGze6uD8PUZ{l36@a$%}eaMvYdM zMEoiH!Fadsa61uKASt~d*GXQ8QGE`GE5=`QS?(@4+v5W35OFqV-fRg|D2XU@j^>*^ zHYIckYYnmur=C2BX`;E=jF4=*m(hRL2ZtfDHBq;B>s6@*&*L?8@b7)X$(lMjaK>>$ z<2y;*@hka|1H*RhFvLVcl14h_p0fZIr?%W}1(`fW+8_~vE`H?_d9Z!6f_wYe6~6JN zcXa8mX@v%Y6X*An3Vb!mb{p-U&n?$NWIE;X?=?6LG#@+k(-TtunZ9z4NYzl*(DySF 
znsU1&NM$&p(sg|#Dad0J+@eE9OS8d*O6@~e6c1ZUK4s_D-48I=*(wtsyJ##ypE?Qd zXBUa60P>>`;mWi$&e`VeFoWRei(+7_t%M4+j1Xh(n(Mx>`#n#4YGcU-fBtdM)YL5A z3!E82H2b5`A3QGpw1@hLZo?JK84VmJTTFX9>!=VM0dr0%jL1I^hU`Fhy!0sYBN~v` z!hbHl9n}co5MyN>2yHA7h6mjtk^N?SAHKTgW0!gG_+d=uft{s7GURe+wp4v)V(=KP zw$+2~JyjIC#`OGgjlIFld;GX`8oz)C}(z|(5-5-*V$NwtMGd!k5ncX}phP)LPWyiSsRbYnraegHh5rJ1WE&fi)=9^)+xs{7 zt&rPU^?FiOcULxa%W}L#i1qJrtI&?c{Wj2Jic@!7W+E9o{;l&Y-_ysFX->bzfr2st zt6&bD{+aPg9GzL0vJ7{C$1v(c7L}JJ0~L9s~17Yc^fFd)w-}nb?eeCpF@V_P}Kc<>LrALnrDv zM4=f-oIulszO^g4mF8&PK7Phtq^f5Z3}obyljpwk6zQcD|3LF}`4+2H(io7~w=!TH z1f4(LJxTXgd!ECs^CjwkA?H~C7jo_wCp*Xgz;ge{Ic8>#|0B@-U*sGM8za;ICpj1W z3a*f)y#Nk#aBUY5uzUUQ*nwhgZV|P0dO1K}UxUG41G2!NYf_!`#yxs}YCBp>tjyG2 z&bKRHuD9i<6ym5We}$)QWP*@gUFl$IV5WNj8^N5P(@p}JnX!qPnW=%nz_`Rk!r$_{ zg2Bk)dxBL$aF!$3Wm>{#D(0 zvseK_liU3Z&?ax=M?Tp<-rx$Eb3hvI?ro<(2$12cAlU+DVVUMud;T8b*?4@hs0AeXWB_D2&`hNsm{U9^b($_Y)(pEhJFKK}RggAr(l1e#q!-FCC21e%J z7;{4-Yu~)vLsMWT#zyaQui72Z0!kv_2KRbmVz1J<9AnTH&pkIeGCzDe*4$oh+&4djuyNV#!}_Laom`BTs{y#W0ZjWp z$6c97->b9%^gsw^W@dJ5YCyBdK#znaX zKs)?MUD>jm{y-QM>Up&<{CGc+NE#YI$Arun{#3IlCSkV}ADvhhcMPAt-*SYA{`e1G zAN4?KKb~JNnf+d2=^Gns?%(-8K0`8E868aXR?a{4Z@qGgie8BR?4aEK0m;$uef=Zg zyN7#Vuiy3nQ`FgB)lU+ok?8^az6XZ-o6bjqe!HJ*pmV=iNcdepC#v<{7dj+?uhOZf zbk4M??H!G8e)f+YqMskn&sy4_TH9|wbjgnO)z7QaZ;ZX4QF|lUhPzMpgKm3W-EDJ$ z{%;?Y%Wno%*w@;ARiKO@9owH;^^xI2AAraut?hSbJfXQ{9OSGH)}hh!+0FKAs^&p0 z=f77zaPn2Qpx+iP5Cp*N_~*cjfS&o{Wn=KIo9xFGc*p$2Pd%xQxsmAX{kTxJ0 zqyJXrJ`wl-v2&joj{mN)=$nuYC~=bD5x?{mvU`;33&W8aG%e|e5CJH0#1DDq{*9yoECcun!M*uSCt&AC%<=tuqm%24 zWH;OB4RH*?;Qdj5?3?~0v2?KXrSlH)U<=|M(p&v!=4t0><2&HyVEi$5^#wLy{9FFn z?fQe{fZ*EA1;KCghT`~^kMr5@+}sc2!rK0^V1J)^uFdP_wXf(-tT!| zH)?`{Vwd0bM7KJ!^NSy3JJRU`(i`LQ8?vEif%hAD=fUF}{-@M&N+&yO+y18aCgr8( z9|!lL2n^;4G=-!FxSDzusyY|uy6KHT>R`TIwDT48mCP=hLE3U| zMf`_@tG@Zi!9mp~bh+oYunvSaQ=Oj20&fDk6jT+?BAnB*a8sxUuZ+GB3rN7tS<)xSHzOJDxDbkID6=~ zA;)l{{%A_s&lRd@Zj#+JB%IO(6ziB_i`7;CAwtQu=Ply*csev?bV#?&QgbgNAM@eH zP6KI%jN5H|S&G85xWJQBOu0R!-P|H8hDp|vhJZ&D8{EAa$UdD!-`qDq=j}#xRm-z; 
zma{*U^IMj`@QA+9yc{68G^%Y4w+rtL=6pAz>O;~caCmt~(>6^`T=78Ekj>?s^uTp5 zbAeZv`ur^_=uam%E+9HcJ8g4ZfGOQpK4`KMR(o{D2p+U@Q#gk+sGq#3VeH(+<8Jz+ z$(WxxP4{wmbS<}dXqm{POFU>8F(+hXnsTd*(Fp@WMv}nyho;uuu}Y{W=HTKd>&J)B#uM&p3nUP6_3Mu?f=0S}fq&F22yOuc%<(=XYI;1+@BN<#b6O zvJXB^Z$HMbmcp-gK~W(yC6$klhN%{q5gaDwNCDJ z!%5qyg{`xX#+F1LptwzMVDpy-fJ|(`_XC6g3sG^@+;j(W=Q_ALCqOeXQeLu%iy)6u@elPQxkqmH7o}t7ptWPYM}YWSd`pHn(7?_V0&`En=fl1 zIm}~}(;I?9rg9j_cD%u>z7(L=R- z*UT_i9rpZOPrSzOYSJ^>e^`}+MkGe^8SNTXxC-blPwI&iXPNVG9dl-YwD}}MoqpSS zHc{t=GklTR@%2fa>v{X6Hvx{kE?tD(B;egS8+pwtqx(Us^By>m2dLlsHPVrkQ{atx z{-g?OTY#k@?{ngJ#_Df^=(UHv`vnVFowP71nY$9BRIetFho~H7b zso~zLws%T7!2x?qj?T2gQm`bJEz0>-`Ol1YOdSpq^G6G#HW@ z0K{&?E+X2JG*&cF{@DiVvquT&7z2z~3f%(!@$3`mI2=rG;4>zi$gLu-JGXT?ij>Z@D>mwS~159ZK9GJ8eR>$w7&1KEw{_+$7#%7wOiuCMgNk}oFu(#TXh{!OvNd#c#6Hu|n$ zh?W^Qfsx0UP1pr3n8%bIOe(Ne|K(k-0tN0`MYWN7s3%+yYx#HAt#=pTtzDRch1gTd zw~bH8i8xDRMq8>{)I|Uh3*woYi&QlrcxE?=BVP|Gqrtxr6vkf&9=W)6I)IkDNaP{6 zB8hO~clV{n+?p>fJ0FHfE-MQ;gmsZzc}`(q{gnHsKc#pGbt`w63|U%O4qbh9mBdV? 
z!y#Vz2>EbD1o32fZY$l8L%l(GA)YXLovZT!!=J&^S1KQ%ceyaeoAYt6r@{@$$hm| zcEBq@8}?x@3q{*|?sKIGde}EzraaRv3T2js{>+C73p^YDJG19)zwpgK$JE&0FbT_7 zpRU+f%k<1~LK$eqR}WAhL9rD4j%PDPeui)0Zv^x8W-s@#ME2h z{NY^RR!E!K(zh%C6A@f`@RV5iar~Fz8EaNr3KrAOow|URu0Zv!VL0(~IK(H?2IIKJ zCL6}N0Z?p$bR&MI{3Li7D)ZqDTiD*wN~x*cYBgH5GqJeiTYk;7#?4di$diU-;qmo& zFrPHhm{Nnio($cZ_`J$i#~~3ijRDpKjPmg2ui=e)$H5K@)V=!MVTVs0?d|4!gv?V= zw+t4vG)!*s<%0{Gej;d-kE`;*BHxFJ_H7)kD)f_;K2t|bgr|l?x)cv{hxHw=&vaes zO~~jMF}1xd-O3^SgfXzW<(mwmoztbV4SDpOETtftO(Biu7Y{Z=0!2f!kO`(I0>oE% zIA(oD`6XG2}VK}3<7)Sk#Gn$2tHAQL$1f4qk>l}JABxtZA$W{ zL}9zUvD86#N7+$#+-Gbu_$g-5(r^q@)$)GMC(K1lzlKD!(Miv9e=KDxmTm=MVzt4g zZ(Q}hDiy#RhfO?Eo}8H=nk)nvSeLZd5`D3Z$`yVRCZ{M1rExMM2Q(hoL9kE<+Plc0 zEN8$Jsa2pEvUg^NNW{i8?zZz@;BrA9KN@)X6x{XFaD(WyWX#t72U^*1@32ub;`-4( zk2lecSuJRwpRElLZpy^XLZ>WxS!V_ATI-#i4z$j@uSJ5zrsc0vk9#>0Ju+BX^Oy=d=YAr;RD=N$?Z-5ffw(TX}W6Pos3vO!c z5a;_qUawwI2Hre?;m6h->rFQo_K$EzH5 zGt^&VGXh15XxtpnDmt+HujX+1M5$I5{cUW9v^*!$5Cn-nQAW6@*j zkEcU8G>vv6^vtI4(u?HQ9;ArjZpU~*mCYF{aum0}EK%v= zMc5Y#gqcva4R27iW~6ZsWznxohn4Exue?Nw+}{YaO^!-_ku}fi&xM_ZEbw^9A|Rd-=8qD zE(3+oO(iqr^a@LAne^_-Dyu8ck#X{BJCdtB=R2URTGS9vu??I3!C;xEk6>UkEEQSM zU(U*VKUxfFOc-2P7j#69=rR}{F0K*`;iA!EO5~WNOAr$(Pb-~2@`SM*fla* zr>9RiSzT6$9=0~!MRLkZ@6MJdX9A#9HhNh&SjCYL7!Vi(k2K<3w-G!$G)M9I3_I3s zV>9ZON!J~C>pu5#Kb4T0UtyGHwSlj zi|)K+er`Hi5`4e3Rgg@Jxl_ZTb667yqaV4`BFax|AsemDu3NG0&h}eiRyYu_5#wNy zEu~Cil;6zz+DNIkmODi|Nvxg!)dN=8VVOw(E|jTH-hr%LWM+84@adf1hvf*Jzb)dJ(-C)^tLoMC8=x)U&@Gvz_SfP{=~>owy%7fLZ%!lF5a&` zyp}uYgk2@1o?_HnrvgG#&&NghdOp_o_Y9mDE@Z)NX6sK;ZY8yRTpzV|(CbapV*g8G z#6f(keDt7hNzwpnau~BM`M%v!x8+m%5;DafDlt*@mk0ER`16pw^uz_++a2GYzxpiX z*6hnfwRU{r6q%F*m#`UNMjS2;3}kdr2mEPn>P1gv1RYDi4Oj)Tn#`Sm>LG{eRL-;} z!IA`#?5C5h*P%BF^q5b~ZEOocm{wj%E3A*?A>bQ|eJYj`-g}ornt}MXqn`Volk_(I zK4vvK{9c<|jWL@Cj**T^a&C!?5g9$zUC|3h{scZCc{l*melM*E|AoK@qNpge`;&qI z-Uy?75?QF1XP3Z-j*ph3&IxZAkj-kJu0H?k$1`8oNaQ48QmIdhq#=C2Th-OC#~jf0 z$jYDJFe#m`cBJi*v7T_=!$~)uAH6TXdBmYCQ(0g4n-=PYUcE41NrH#?n2x7~L5#Nr 
z6eLbYT>G0kbp^f#u+kX?s3my@d4JYm=!V4I!i1JRcCZ_F&st8g2tJI>Y3_F%81t|f)_jL63Jy5vboFo+2; zc4aqxaPVw6Q-N}%NAv2j95Y1}IXg`YFPAU7k3j0!gCE9ReTcXc?H_t;{ ziPp~<<&8CQ+b5oMM1jI zjo!flh`N@S>mha|%=R525xC}5c&(~!gp06>$0KRhEceI*?FdD~oz!B6o7h58`#g-1 zw1-)n>n~e)+oL06yMe5sVrX1lY1klTS~pQc-~@Z7kcbDCBi8Sn8595fx=?1o=qV4s zIqB7r9I@A(c3?@%^ zd86TT)}z8aUY8cQq|Koqd9V)4%mg}$gZ~n1(FQr)l~n9#eX}iI-!$T(OZNbi#u*RS zjFI*|R|hSvFv*Bt%$ge&F{$q%Ijn{6Az1|2SZ1! z)1>&|J^N>Zg~GAdP7GG%@4_0q{w4Fua$Fk&tJ{2}@766-N~~BOIsnPUEYEc{M}hP$ z0_qxMkTTSlx$IypAb$&e<4#v5&v#l*%Fvhs4i@-vn{h0n=zc5p5cX9HMFHH`;@*#E z!knHm!3EZGz>YKZ_Gt`E@|zit-G3HL#{CKU{H#Y{dhHbTr7R-2UY0NS?SDHt zDp?qJc;{*Qta#*=zW4f^9+}(FOy(oB+J@>N^XS0J7^F#)+Cu*x80X7euzmO zP1}?x@-~iAUmywQZ3^POhjCLSISJD&;TxjOh7Hie*O z73Bk9udkxSn^K7HU&%|qK0#}BjyO<*b_NgOguU+M5uj2Jw5T|mLMa=|4s|GeaM6Sp7tKOZDtjUp`$=5Gy4g#47KOpCgF>#2qOVPe+ zR6N8UFMg^usplTjwT6@I+g*Qk<(}n&jQ*ER%i;?XkhVEb2Bz=l+ z#q+1^@MPvUcc-@f);)jH5Yx^6$~z33OyIcK)cwf!S2i$PH=|HPn+SrgZ={N_tN*_u?&R z3&f5jRe%-@LVO=RUXW8PSS52!9AwjIy=esfcY(XQypYTh{Z4EYT#tOLqP=L!Z@7}9 z<=t{dQ>(p{CACih5y@U8YIto_>?pR4O{!uHle@PQLN-&S_TRPQ)+EJn5PD`K8}h$nA@Dtl(3v!stuRRtJAg^Du;5;n)K-mhLAceL@e4ZNs9McsMHT* zUiUhZEKBeyt-y2v`O>|p`i|%$_CfCL;a2ulP&SS8 zMDf%TnPpOWo^`A1JarJzZ%hGRn93yYUj(R{a@d zfoZ!rTS!|@d-_Z7?(2C+T@W)VryH>V_g!PWA-Xf04xMfBaB`@931;reiE1qWzdBL$5aNJ4D#eOf%mJWB9| zf7V_TN1Zz*?F6Rn;c7L_N0;>qN!DWTIA-mJ*6QT;9c&SX#=>P?#yCpj!@uT;x`Fr_ zgiQ!-SrQ}xCDI2B;U9Lb=T|+ch;!G82Zx&?qEUdX??pX=GMb1@GTdWpcpQgGEi}TE z9Pb!9(d+aeQjUhH&tq`OH>c&0z{^Q67;!H#oVa-E_(JL+Nxpia5;<{I5ikw9UTncs z0UV;8x&s>gTtRr^{*ExrJ2uW-l~ycmoD3O6E9I}#2G#^OYQBXDs&7?Q0eib-na?rTf672B9CD+Qw7a}~<9ct57D%?JWIzg_>f^6v#=9#yklf}bs8 z=#QXj9wyct6wIPaQ^*${w`?wNQ*TDU2sg0c%+SYEdgC1T8eomp#Z4=ilez8Y+3+Mh zS(5&OpY{H$cj39dMI$KQQ3_i;kH*|97XB7NKv0N~5EFe|ABh$NHol#s3H7c+6~KI; z12m(7&x%cm^6=GpS)qhpFRAVZ+N2bLO2i^k@;s+OCM%ALY9Cv6BzbEFSFN7F=&$2M z$p~q3WBMF(k@E6Or%cc9?HEj691+?PbNuGq?PU=GTwZ;Z_jvSaSg1GrZV7!;tKFHQ zuhjLzCaLf?gkPbI%r6Aq?_cF11e{lY#skGmF$v_DF@zGFo9P7YaCS1?ZuR)W@AhWs znI9fhkb_U$7wXJQ0lj7t_liSqhZaL9&l-;qCs?d& 
z{ca1kE1@uKcJkzt;!7?G+8(k=2Fcn!>ki>B*#79NmtGZ^~=W(lkFRZBf5Tvv9|mQC zp|W#E7mv%2zGtdAJkNsKod!5A9pN-HJoA&&MJ2C^x5WFA=>2k`SU%;A+hnuroU_h_ zzmAYCxu6C2P2cXU*Da-X62DSRzp?uUvW)q)|K#ZGubj`Nwu-fr%?B$$x*$;S){e(K zlM4C8m4Y}%e%iiT7803lNx@KlnK`KIssnK4pyH)sC$ql};-7qPSUvuyz z!Em1f!m?!k%OJT3*6IzBE3lZnfqh6K4MwB)H6WSG;FcM+C-&LaiKGf)X&<=SG$+j7 z)mCiU8p5KSr`Ju;cdo)i;vP+X6oB)CWdT94c)7C1!i3~E^(KqI{rr)lbN<0_$LTbK z?(LhqXz^v1S-VyJ!Pjr$azRzPZY7WeBc*4QuoDmYj4km#J#ob9z95a@bziJxMt|hZ zi6NBPT*bO>5Jm>CfMd52&%|P%y@w-c|3D7p44!`wD|I&-GB4z*ah~+t8&SZ+U*c{! zgYq2kkEQX5g|9F$9Kf)obLBuSc0v5bKWOtY73IZHh^fuh+B{K`ZI$)sM5~*>J5I!7 zEv&?^)wT-DqDD=TwTUInsSKbKPVO}+F92Z!5I6x=Z+Z}#!j^pRAo?^GDS+A#BXoIX z_0S`a#f|nnO|~ip>se_|ffbk^EJu;;y?zuwz{!z={M!pG;2WI6%bT^B&SWJkV^!q4 zTt&AlKtHEF8(ow_lS+wZ{4ZqOySx>w|Nc-Bx6_U294*Y5a2$rFDit&&MzIN{X$7{J zg4KuWvZV@7Wm{_`av$gAW#?%1DbC-hkXkkQyzM&ga#%xZnTC9X&iGXlSoyo#Q{HNE zd9nEepbJL%| z5yFriikbp+heuU_6H_eoy<`YVgP66;pQB^qv$ya@>8kx-jZVeiXEFY5Ukf`H^oO-X@cc;+s;{ zo>cVpC!_kQYCGM;Kv^2B-T#-}^}H3E1kAY{>NVy1deD&D8q3cFFK}9+IGXhiLgWj+ z3`D8wx@f*=b@DZAle6v^j@v!)kU9JXiT6+^L`zrL=~-FgI&y584HiKlkKop?uT5B8 zbDDO2(!$9IY+pzThlI_lyRj0Pi1`E}W1infw2BF=_igo?kXaDa+oQM7k|}18)rkYf zY8k__)*@pb}F0cQ9~&m-O( z6mq_pm>bisL9y(@>1q;4<$^7K=^N}bz3EfuLJb`|3;>KmWb^lQe(@{|4TM3CBNfupq_o9SQ|cu< zJ;qP589TcrKem9xWAjfhMb(~ue(BL7Ur52Bu6`|=F2)yp-e@qF3_gO!x@%r> z;AgyeoF@M@4S@Mqer4&kD+n!=%GlSZQo1o~XiLwWNRn{J3x>eS<`Dl7X-BW8{!lhu z-^H=X_uY|(;Hs6YPQfYnqJBKHHnm>THk7ef7;!FxhHZwUPNN>prjma~G%Y-M9`E?K zO40#HJ3c`6%ZI2P+|haxrLoopn-W}OBicfgD|C$Ot38Gu#j|9T`y=ztwhAT{zZ0D> z${k^hxcR;z<~kW8$e6BGn&6KT2eG*C^$t-cMn&?3MiNtv%kNQUb=I@uhA|d^c&kls z17PrK8Hy-PlR+hEwEpe5iX%ON^Ev{zJ9TLxK1q|Jvo^;>K}YWVgu^Ee**F$`=8m>| zu_TcmHOAPJYgl7tI@?YY&1<#29vtL0@A=`S>apz+r6{`5u0%JX$sZg+a@0vdBe1r@%6)FhI6-F@je(#y8l80sXg33=?(Doj zcttYQ!e@sPcn%3z7iCW>TSJ1g)jO!h=i!LPWV9d7KWnUDDKDJ%C1-kld~E9D{XoT; zNGS$m-<;&#}qD|w&H9vSos zk&gM8wn%uQwa>cO>3-gn*D!8y*9RpUZ5sLJX>^Kqt+J??4C-yR`ejxil5wm=LI>ZV ztUp$me;2LQwZyIuD+RE)Kl#ky%&j^x7ihe0LUXjf+2AMhC&Eu5nr-((o}G-=qW+!? 
z!SD_Zv%-$3ntEc)t*{Vvfu(TDAPtY(&X}2U_JMxS%Y0^<&nseM!W#THQ8A@do^KU@ z$BaWm`WAYX1u+Ow$iV*cHF3#gI*+*wO19Mq+vh_{&_apZ zo2{#AsyJO#i-b@23wH>;9(B3ky?I%(H(~nUA)6hel`2R#@!1M3-GoWWzn^3^s;c%k zXB>lySgvGVo1luB+diOO&Q*VBXEa0M@^QF3D)zt$$ZK9^^@sdH&wA@hXUN1<%6I?H zF<8#TplHlc5ClURXz-t zQ*BhClP%2NX*+;-DDf+@cpePsNFBmwj&BxNoRM9p|(T2L=EyI5Rvknl~PDxyeSz!Dbf=)u+fjP|H2 zvRNLA;>u9??QZ0bwkGP87A@0rB3)Xp@9*&VpAe85`I)h_&-dRUOkwhxENFZ!PKuSvfY90~4th)@NO(-R{o)Z{w6fncC=~r%Iul_qPmD z9736SSG$RpL`$*lEZ1^O1B0z5+|_Pvm%^IFI2u=S(j{}NA(VIt)VI+cA-{3s0}A%K zRAeGT#(Gve%(a=(*Xc^$t$Vc_yqaG*oAQg-ygzSMLrE}2OFyaM=nL$zM7=zn$ANGp zC@w_IOEHE7;p+5(2qNoKV>mo7(E5BD=hA%jrx;14H122g69*rBYute?3^tW zgD!&~iy>i6j^&4_hz{bL>M+!XXb64^%j*0~s(uTRt#dn>Yyn(%y*#t)zTZ|jrKnI? z`R6yb1k*M;*A(L+bW|PG8rD6@HbEH29l^|k=w;VezLCo7#3c{re=4`r@2H=SBR1q{ z3Vm40c5_&3d2PGKCMU<(1KDP#1+y-U(PtXwzMj~hV4N6R^512?O_A#AC-it(VkwMp zR7?Dc%U61Fq|w?Q@1C57&ao^;Hv28Y63JVYb%14*{u`167YO?4Vo-MvqrA(z^JwpO zvy|>j_G;N%;SmzVhPpid%q(p9`&hB#ee&=q)_l*Ors9iZ-B7=mJ4s4qmM@DUo(NF9 zX&5I30zh-+pdj$$udI&O8BaB4kTwv5du6|FmIhIZd*+f8m^L_wd^R`p8`Xj=W?a)k ziP4A3eJ(QCXS^o+md)SUdERAd&&1i=N{;VG(?!QH>?(UN`J01{8{f;iLoJIuh>$T- ztSV(|J=npHXb%vEYNOJw=ZwV4KBbDG+dw+TKmp2_zPc_Bn<||M>Fv4zw`A2NpJ|t< zl`RP@iy-!-2`{!aOF=w{W`u72^F^Yn zVBVi|YJ6b4f?rJFS;pDRk_=$l;|l%>t|!0U$z+NAhgp&sXNYtXZQ@#1f%F$ujTE*H z)LIJrTvuDZ-vjdtw$=!nWSS{h8$0(D#eOz_jt~2RJ5b>?+0^Mx-e=dGgA#_hp^zWh z)F>I)N)?QJg>HM@EFv#dv}k*X#xCb_jV#tzEQTLi*9qb`G3D*VQKMi0PNE%Is<{ko zE$Y;tC;;rtE&b_xb!y{oK|KZntKsYyzSZx75^|u|2hq?jGVJaX>EAYu)an*?QM&_* zx^T+@{3jD(bLoeP++mtOzTK{X(hS##smPrJ%G-|LMbrS8mdf943MNH;j~GbIRWMU| z*^hu+HiyHg4=_XX*Yo0?boRherjRSJ$Uqx_;&@=Py^>@b^%J1@dbth3HijmssA?bt z&iM3@dS*e>a;Dgcwmm#c=9V8kZr2$zZuJjDgpt@eHW8;k#DDD`%r4cj zGAcgeBlY@~`?%UUw1R>|(<(_}E-04o=6TB|M*GtYIZ^HX4~sgBn{CKpeDFMx5E7F& z9<8HB1)tfni(+yuRO=?C2HLeM*>TXef)iyeh zz)OSAcTgD1V4OYe&L*{t^)upvM`@lnA+`?7L$Z=sFu>iq0&8p!`8~Cq*>aU%WGR#0 zEAs~XYv0S&IL+<7_iPnY^8;qLNC?0tU~%MLawh6^R=4h1OGTXFFcm!yt;i|zlNw6Q z%JEnV{l@QDOF%k#i!V^p%*tlrOeVZr_`^_?th+b#ZE3T}Wd;R)rGI)yl1tLVUUXzp 
z<8)~Ql5n1k8V*74Xpg?*YA7bi(p1Hb@Y(EAgGQRs0Bf*QHkoS-V?OxUZSaPzcx+d#f}f_+mD3wqxsctm$^@y5$ELdH;$cYfTgeq`-TJ%H+PsGNCb08#~tbtxV{7sZ}=c!b}8?GPW z+*iox7KE5-w;->i*Y7ac=3`-)%gLoD)+i~=cipc_VlWz?>QWgR5W)Tm2ZM+|se)3k z(|?zyvFTm%cHZuUuwOS%@`@Q#NS91`P6q{%_sL#&s?(%Gb5mmC&fY`#co7V@zoq0* zTv^a%zpp*fAP+=RR^+6<=q9fkzfbAkFStitc>&#=#!luhzP}#xNAa4()h?E2CpTGR zQTbxG`lB@+lI+wjFp^-lk2_1k@7Px%LWu#VD#0!-DZ6KFmho)Ptjr-A<(KVfuI-4$ zverSO;nTD9T@}A>d>5(|TaIPd9i1+zwfB>56qAMxQEDtNmQ)sv0zpY=sHJc@J9I@j zr9WUk)bmu>MwTG-Mn_?t+$6e3oINm0K5z~G9mQ~*9Jd})UKdCQJc1}-YMW?Yvj0hSNC`3tU=3NJ*8N?HMlQbmKhM~ySY z{lFb%evdwaImxALu**E9)lKjfd!hnaFtrv2PQXtmj=&!vd4&GYj;U_^1CoxMugSQ2 zaUw_h9PNG#KCG$p^MJw>z~cE^%*1R209})wJp-kw?t;c)^`3CYY?^F>ywp>?YCZm& z=1T^JLAh)vxla(4lwu6dXUH$<7HTftFG$_(38Ge~{F3Ia9Nd@G%Q%HX@u(ePS?3ly zXia_34B`AQrHo!T@6+bVii`y!cAKrsws)_79j)FvYR zNE1qkDR9z%l#zBizOO|x#G!OzQMT;=L5Y2*EJ#1GuPE|%S`)eB*BeJSF0h72cw;3p z#0m2g41!6jiUvIvpz%YJ=ZvYjf66C+!o_~RJv^uNS-OEEl}O6BIqF+| z_Aycg|8$z5^WQ$2$ZdBw$x^s+cgH+Axu_MP0*Q)|Y3BIl0)Y(+-Bt6}rngHhCe0IP zrlVCY%lQtxpSv0DE~lIFLIW0`JMMuM*j?*s$8AeE%AyjtfnmL3jf#$;7Wl4@yZnXm5=uj?7HYA1?Va z2P%JhN@joF&w+|zJ(p?T;et#amami_`%aucEM;Ba`>c|cn~Ldj=s9_{6kNY_3X=#t ze_v0(c=6=}wKo8?v~+T2T{W`;X9F_f!T2)RD8QCL59T=)UH zkBS4uSKqC+ttkq^Gn!mMyuKcQNy%1P*|;nEro8FJ#!Y%CgzTZD*kL?`p#O|QLls#? 
z9w_Z1B|?h@oTf(*N{bf46H=E9ETCo?eeE}#dpDK54Mvrq4onnB+xPIU5ZWK~W!Z2q zDxOS04}km_-<5qQF357zrXDkypdl_8`?>IwoKdG};K^li?21^$g@9`I+Kxh8o$t(r zL!QUpWwDL8s>p@!Ov7ZFihthK>aGH(e_j)8VR={KFVU^j&M{4&sk zm{sE;w9TaZv1H4sU(_$)y3&1Z1$7$Oa#Aoru+X&gKuOg>tKF9Le&lNpR=n}IbiM1q4c3W(*hq(-aOS^oG zy88XE0Ufp-z~#O$4mKN#OI>#PwDgk}>ZnMMBt1|O* zf1vc#TQ0ffS!EaWo#yF|h0TPNV}uB+7F59l#h4)_IK{mq=$OU5YQ1Vz23TvJmHFot zCf7D=Ch6xsM2~!lP&!rkG2EyJF>GB#V6E@5^~F)Hz4xx@f4HVS|CHnVQN|Mf@Pt{|EiOtAaCq0$cv~ z)Xo_q14|qLCK(YPE*@@f9xh>SK`wqCAyzJKMlLQ!n4Yq;<^QJwb~AHvv9g3^Au~sJ zD|AjZ@H<@&Sx-ktb2A9!FI&L2cJ2V!_@_;50BtKbci7edUJht7EGaPV^Q{X_Ow#RI?h(+ zfb13e4=*T^-Au1nwmB9RVO9h3$VZmxidmB$-eQ4|P~xGRPE_@6mz{|mIrCy%u~z`) zx50$)@}!z)lHcHvvS`qxq%wT~i@3A6A+HAD%yAV6MbP9Yi`wKusC`Q?0nkf#R2$A_ zgw8STZ+&LXbM{pR(M)TDeKGb`MrlUx99>*Mp!Fwm7m%UFA^%4- zI%~yVp9Sze*e;V`*%nhs^F01r|me=^*H5;(UL}ka2&mqaL0P9zCT3UWB1;E)vA? z=}Gr5e)8~R82uv~ElP)N0%tJz{7ESoo>Fqb{ z=TUkQJWcV==g0~81Xg1Xv_Crvx$<&?_kzf!`A#1yt|H4;PUIBMYgu&kJGB%*r;ooz zO<$V~Y^I>+7`neyF7}cT1envmd#v!6w*Jq8oIbsYp*D!Fpm#cIA*y~tQlvEW5}gEw&C^;6=SmgckF z9O-Lt(b@awhm2(j|Ch%DH^t>iYYC$KrrUk}JMK~9>*Cj&@_xqMiku}+FeI_;6Km=- z0a}o%TjRW}Rh4tuS>5(A^ZlHyZ{<%)lSE65AU3k;UqJgtOWJb^%Nm~irrEgnbUZ;_ z6E@jA(XKt#;s3Xg0+nYfd<=isf&YupcDJ*c~dbe`$%6s2162k zq#1pcqWUx41JDhQ+IVe5+2{91hamPyb5G`fiH{{X8jLAkNavJTDOm$!-h}9NhU4s>R>W%iY7w&BMnHc0znyd;-Gg^z`pk HWzqi!PL&Sl literal 0 HcmV?d00001 diff --git a/doc/figs/performance_benchmark_memcopy2.pdf b/doc/figs/performance_benchmark_memcopy2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..6ad081352faee2b5f2dfc688eb3e33f98d715bea GIT binary patch literal 40605 zcma%?Ly#~`5T4hzZQHhO+qP|fYumPM+qP|cH-Az|F1e&nJ=O0_^{t=jt|nCw5u;_K zV}&A}UmjV9Vk2N6us5=T;^Bd!moc?7cd;N~Vr5|=_+JJ^FJ@`uV(LUdFJ@!tVk%;4 zY;R%;#m5Kb?BZlLe z5~9qSt#jjp1Hgp=sb0zZ>lzzM?O-PxMpQm#1SoZ|E-T?29#+KFR08*)>F}YUtra0kD`*4YL4i zd30SMhbT<)2J8GkFJqW}sKTaxWHShtIFZ{>`+gMazHW$YJxc-PcbYR7B;uRbSKjG3 zNMb<40Sjgt(JXs;H*Z0Ni(ChqdKGI5g?v`iSlhw0b|IUaqroPqcA8a70hC-U15Gyr 
zyZOYSFef)ZszXfL^gtvk@@`V6C`G}caC+GNzYBT_vnU5LPARCT(MV+-Mlpl1)s1TF zM5e&CGpL;s8~N7Gp>#@Y=bJi3>`JZYlR5=a$*`Sl>gPnJ!T%{frULeQeff!Zas77^Ebd;jGbC5v|ta`9@zI8Us>9 zt&ZjaC>aQ3+=C$@!@tD2>HQ7t9Q?MuIJmHp$OL2M!u@X>d1>;D-s--*|M)(qX0wW7 zhR~qh!%xgC?@g~?TqX!g2*W^xrPE#K`sdFA4CkjIrX^VA8H16?oK&Pq$j?(x(lJng zoLMlf@`Z*`gUMfESU8tg(4tb^M6em#W@xQQW--no)DL_m=Hva?RDvy)NGT*0!@iLBu8}nykp8M~$ z(uu9ya}ZwAObQe*XA#g;8LEXugNd~{CPC8Vm#hFq?ZPxN3oud{qRF&$xqal)4B-NX zF%>B}mFku-Uhz|^+p`4N5w)w(>6!eH3g)-`B2GYg3o;k!lO|xT68IS z5K|cgv983WAT1%we9!DB6PN;vv~MjeP){Iim-20PT&*E=Z3ARMOyr~GB}+wQF2{0X z%quoAI8N!2HO5031nU_!F-&BP2U4+=GE6bk>Mab$$eF%hDKV_7>f|j?WM}?} zttXbuwqMQgy~7NP9lZ};0E7Twi`%$^`Tfa295%A`A-5#lM<_0rexR>#RA@8 zcRPlrRX}5~t3E}@Bam$jR!3n>!8`-ieu!qjSu1kv=*Ku*8jwc>%7s;JS~7%XhKdjf zgpnT1uP(5|EDK`CobYS*uNv7|!HV}2TMxH%6qwHfC#Ap^do@1k5amqTaq>u4-$oV~C7%e3|{#-#A7pD5`lXOxC=(a|~Kj zHCl+vJV*w$T6{_elC@lLK&=>e( z-MVCHCPQag);ni;Lt`vLwf9$b_X=}GBvGY_fvCb(Wyl;7X(mgLgvL4EtE}0QC7PDoSiLlaCx(bX+Jq;cW8b3Ww zY+LSiH)Cb4Jbg6WbpI?y>KuK-bmye$MuaJh(?plqsAXz$PPMpV9Hs!xrL#O0E;Z-C zS%?#((q~TFCMPt#=*Au=nv%Td2O-(ZrMdeK&CNIJb#%y@AEV;KrSk34XxF;O{wHD? 
z*Kk_`D@?~^dLI6N`O%O+lfMt2&rc&4KjH0F9U5QqkvlU}UAKWs-kw|IBuK`FjDPu} zNdL$pzFO2a%U;aN4>gEzdJSSwi~?=MOce~imM$+pU*jZRjz7)fvUXga=~XhGy87-k z8@jkU`g%tguX1H%blm0)h&mdK+%@#hbI6%y0!t+rY86;A`a6wqQ+;TR$El|6s);Dj zh{zSc%oM(qF8Ab&PN}{?>A{ACLZ^K1T0QAKe#hMqv^8wjea$}Cz zLDYFA0F$|r?Br}%#8}yV@-0o0Dh7QCpXvq&lV-?G;~;5iDIlKZb?BO3F;fHyy<_>d zYh&f_^jsquMcX11z`7AIjU7}-pb2YZSW<#&jM!z7}GyaHCud(_*(D>NV z_#+?v+RvayVr^CzE*X!OB?%C$0EaMbT zqxY@KqyGVQ_d)ObYu^R_^mDzie`dP|A2j{b(5=Hx>Z5@@iV^D7deY`8d^nOvB!MUb z_y3j0lIuYpm}R}|^ht&FI}492R?_tF^>uZ2?C6|n&fwyM&uU8JchK&u88k`aD;iWi z+7szE`2Ib++q>@Lj%dS}tF>3Vs>*V?y?lQ?9;)KD0s8as_jbH&k$aWzJF9bdr_QB& z-4it(E1btsZHH>_#XE>?3)isP)!)X~&jp_}#_|#F4)eRM<>Q=T-%6L`cPppSHs!ja zK{r<)%W2BOd%O5q8L3WScU;=J=huBrt64+e;pHd%v8%P0TYXEA;=;UDz>$I58aR?r zBlp#atvcGPb3VW0!`Z8C40C@gt7Bl)fIHiu=kJ9>rJuVy1@+D7TXJ9RjZUUwqSS%^ zxqa=6Pid8dGPN`L-{|+B_dl_bk@^4OM@9lBCI+_uLz7Gdj2vvt|M&Qx{QpD<7&$pv zIsZR7Gr9%X1^053c3UIn26d~|c1u&yc*|;W-3?1SCT6sCXVlH^b>obC=j`i`@fH`; z*mP#>X`pL70!8&aMCK+s@X)Pw)1%@&q#yG zHNPmSB!~8mA4UWxP;3^TNs;&ro|}_fJD@p=H6RkSe|UCgcwlB6!obYr_!D24Tih<7 zJfW(DQXm$Oz}6hvC5RNcq3O+`nWfF)SK^6ZA269r1%Ql>4sO%G131JcAU2gutQCNh z*c>{4MqCV;*gD`}P9*94#!;V;!lM=k_m(p=LwiTZ0`>;S!WL)dlkx)4HcbvKz@z|q zei8LN%00SaKz9P{y}j9Uy$OISv~*eD36-1Ktese_K!14u<|2s2)$_rDku{VHhz|*P zxx@qz%5`9?KlaqGtpN19yFGx+v&bCykjkm65qIaI>ph&`%YgqO^Gag!mRO#&7O5x;8hvcAY;kv$VEyQ=de7 z=Q?xwh89;lK#9no?IU5(4>2=17w|d;CMIXQ27mxK!2Hf|&_UgOCEjblKV`-*l0U=! 
zLt|SLc)D*qz^6tgAph@x7bj*15a4VqodABFKh-bx023pi46ST7;HZGJwDvmx0{^f; zQ{ThbJudAIAXhYg*BFSw_thQ$-ZS6o)bi}==26^7-|197NpS%ovD_2;(cgJU=*>2u zo=bH!0GAn?7{HRg^ytIG$XVYI1(xP-SG51T)JEn8VEb?FYmw}qa>G?0dcfn}m>}r; zoyMaZzBV+7!mp;2t{9tGQXG8g-uWFb2sV0UAK?GX5}_LHtKBpV8!xU^W2Q z#~*>57(nnL{xhQT2mEJ5#UpqF5QfTMejJRI|2#PO3;#KB-YEWag8UPI#Feo5bNHUP z4DlKNB)sww|9Notgg?Sn_{EsB+0)oZogriTFR))T_&fePBXn3F`w9QV?;>}*56!#3 z4If0NX6LV?(Wmr1{Es;I^tBGoU&7H)|2O`95k(X~so(SP;Lz^jQOB?G17XuQ__u+@ z5B$5}!YTZ>2MfmU!PkQn33*zeKv&2MwuOK?U{OVkqop?~}rxN9jqnSZm5KVkkAe|e*CI=63k`XA&wqb0dD zF?p1K$h*W>Swvh}#K6WN;(J_%uY-eI(^vQqxOp3Y{KwGH(SKy}4G9f^Zy|Gk9yW#9en`~MID1>OoEub+T&Bj*}ab?(20V-f1Gcm6>3YBu`Ds8Dg$ znU2Awk%N?zR9;T^%Z_mu6es8vx9BoXY7d|es@LuDW91KOAt!I<_UOYLCYQZ!8Kjjm zfRRJ{L$AQ^5A82hK;e1xad}jM`v86cE|SWoUhNM?=BLMf?yu4x=yvnlgf8l86IRQMR(7@vYNNuCsKc-77Y-oEEA}E^O{6e!=e&4dQrgCbM#pd=j5V zTS=ctY@POMtu>s;mG*3t2rz)&5?tyD*;+DZ1`;;_dAVi@l2kg=PN5V0Dwbv3dwu12 z#bu~M?Lw`|{BRq<=}jHB`Af&GIR_7Yj?W2pe$6MZh9{inb@B3H?SX=U+)wYMmygi6 z5Ur|pFyAW^OlFS>cCiL1Sx7Wf*Ggv7BZv`p>jkC_%5%tYhek(lW+V6#7<-u8bnK=r z@TJ6{KjvsVk4LTAQFsa730^ zplhdKJQtSWg)}~J+eoJg7mHRaA^CnqsS+c02I8s3>3}|)U#3{uHrRkliFlJIxEzVc z01gcb68`A8mx^<{pPUSSo{za*#-XR!9SjmLY*pe6c!0P4%pY5;K}-T&(u{2qTWJD*TL>KkRs^uy-``NwaYAN?V@lRC?pK z_JX^UWQib#cJWiJRr?cR*;5ZphVxW{N`fK&m8CdT?3 zjwH#iPbP8}ZjAuD@g*eOvKnKSzK>)m4XvA$Y_kKtNo6n2mT%LB4slUdA<(%&pI#n- z_EwsW?h?sOyiwq^^8$IzHD%opGGbdeWT~ckBA?dExpq0u+9BenDzrBv%nY0-6CY2Q z&vWxUqnr=BGD@URQAE`oevAmZXtz$FjBnd54M8YIthyjf{kjqbU$<;0)?~N zoNtR{j|zxvSjOhM$!WV;KkY8~aHMoL9zHZ8X7^}%T*vQhXecEUHUQaD*v};XUms73UgG&>&3ifyJ}yHQ@xXg+#;4_rWulWAqnW#~BFZD6Sz^Q@$)Ua~ zHTS*csfR$@^^t`bcVWk!{c+XShSxyV(2aCqg1Ifxda|Cqs?whahnFF8#9)7zsyi;m zKXGctpdK~S=~-uQKtgxH{{+a7<%A=?7^YeGc{7OAtYp?=dbVcL=o2gVvS1|joAkg8 zjk182HgWGq+V`$jA`%%_2zh0sW|*xkQ)!T?z9k=wPu(n~+%P)~$Tz7>;y-Q(F3s{D1klHjl)pJ7c|L$#cs$ktG7 z*G@;FSu}7X=Q(sCNMy~a*c@s3y6GA#p1rr#NSCjFJ-tYdVC)sAo|}D2cz$-g^c&Bs z?rXI60@<phKt z{pjKJkfVt$rpg^o#PlU!yxPh+v;pKEZA{KC9e|6LTWfncoa>iOPIJI*eE8MH1}p|BE-jGbuz;wq 
z6s8OLFXKV;Ufulx=tr66Gv-y`#Yb^Edle^Z!qioL`dncreM3Nhi(}gSwFe-bo|_~* ztRx)<+M{8A%_NL(LVZwZ6fhJZRBl2Sb*t7!7bJsh??1TI0j#i3W&QDCMh-v)?D+Yz z7^xZ!R%1SaCOhA7l%u!!;8Oz)b6@y#&bTwh%+`zEM_&boRnlVN0YD~&MIx$ObRVV!Zv547=#?YT zl7zf0v9b)<8;^43eb+w-TKhKy8hIWjjZQB6A$Yp;83(}Z!(#L13?PjHt^%cyni;o=U?B%MTOMTa zAb?K+$-7kB;o1do#6!l{`{hDw@y{iMSu$yKwO-EXxYrwws+0*F zhZ@?Tz>+z>O*C~BCCdhUWXbBN6m@lu*&abXx}Y`SMV8U4O}cxpRpGtUe9yNxca;Pp zd5Ig=))k+((rOW<5Xs2@%$k-4k-U*+=au%3htG&kE(aXXKBBn%WlGn>A@~=DglK<~ z`Qs|W3^ujsntZA)eEC2j=|?^{O77qdaBF0zHFF1GMQeebF6&N@Y*8Qr(-WCt6#8Y` zH>@x&%7@h^GZ5;Tl&vrHF~{H>i)*263G&PgF8bAtw~V_M;+_nnf(aS6YfA$7AGQ}h zjML_6xG-I_cylMO%TPdAqCIme6q&F`agF&V*pv!Fj!>G?o;3mWzMII{;UZPR`8m#^??6Lt)y%VQ zh-;xZRg3Y;Qu(5zWWoJ8Wn03qfwf8MFE)K**w3d-0yVsXa~3aKG^EBpdFU}2#%5Qi zf9^BEuz%kCeH`InO|GGv57uL)Qkeq&VBCK}iaqRfrt9ih^@^X%bQnsOx(vUxAb$2c zO%7S*_V6{W>VXu?SshjP`dhkq9i#)E(U_;6H3o#sln@DFScF~z$=4dybOtIRXxEMT7os&&4ZkI(O`-Lgj?`eYG|hKk^5ID@t|NKzlFpWaE0%r_{& z=Xf14YzE;~dG?_fb`n^k!mqgrL649L-=Oqxr(7U*Ljl`Fx`YMA7| zPt|m0ej0Cw?AK%GVg$e)Wx-U|P3bDGUMT8tx>m7yfFmF8fMo%>Po_e4vS=($4Da9g zP+r(*GaT!H%h`I_vqTY@e4~!GI;M`s)_D7Qqh}xARh}}UJ4W`_gXQzIZ-aWn0|dp| zS%cJoxK?+ZF;K~1Vr{?~H*P+}@NnHm(E*nG=diIGlz0lSBr!2_7an0es}!1Y!{9gc z^gN~IkrpJ~dVUf+Apyot-f9ilGk>20Mc*|m^oZcnc;7maKna8ZhK=zFMp$)f@etFy zff3BeqZkIls$53OG-!y8e%=3Dn#($P*^Z|1J4}2r&)PBUs^pjT1Xv6 zfk%OA^Kb5x->%~=d!WR+2E@OhBW(>lQjtmO`#8fwRA1EzAchWC>bp?t;fM%3l01uj z6>!bXa*nirLrX1$Qm_fXpDi{C=nA22{f-&jGuuSglXFV~N_HL!M{(1TBC160gI`K5-=7DH5mOx@1%FTKXh zHB#y60_&Xrx{ESUL>X8HIfl%dH0S&CS%L+e2$Z+1MV~1vm{!=$M2SRg$4Tfudw{%~ zhpZ(0st#zv1)iWI*H*Mu0AooJDy4;Dsc;MaS=G_!iNOeXoYG1%c~<@b!68um;36RbcF?;$QO_APx`Zif7x`_P!3NI?ZG10<>4T)4NHTZ&3vL0=9}Lc= zQkr=T=!vubWi*Vs8q&EGhgFZ*wsC}ZUvA;IL#vpDFO z`+3}hrC`8VnPg5;V`n%;e>>JC${J5dH((bFlIDU*1@n+9G$K?0KpV@x9%_}^Z|lNB zcCN%gh*S+trc+Tgm>*XwGd@pJ{M^#)4D;3U(;3<7w=hEWvV7O|@hqrY84wx+0FJQ1;D(O4ko|C(xk^?PUzYWv* zi*yvz&t3GEx;oS!A+{IFKgX0Ey@l2WQ|mZ%A(~i_={O*1}OYI)gTizEvjG%`j}9sK>8$;YFPElniIh_(T2_n1FN0 
z+$mlxx{JB*`1p<^L+WrE7wpd+uwL?$a|b$=K_fDh_KHxO3#Eg-uHH5G6-Hk_rr^Sp z2$yJjFYhs2$RyYX+C$Zn}tXi5>v>2xtf>o=wfyCgAud*~p$ zMGpA|T$PrIU5z%j8j8%=H7MKL$fu#-iNQdb(%LF9>poKh_FIIAf~jBvidc4!5lF(% zL9KU1;m`bEDDGQK;nuO|rh)}gJgEl;pAfkh5s<_KnX_~aH^Dk=Ir!m7M8MlKo;lzt zYRw`885I1gJl1gHpgB;)FFs(u+X(P8ELj8v%db{LxrK`8)@2Aih?P~K&63=;1WA5H zq7Q92PI<_i3zM>maOm8DSyPw~z%W=+lb*1N5A!QsVmVWwn%W}U;LlEa{feUnbiEWm zyR>oatkC)DR(CrP^DV?75n6xsV9U4cUba6|KD%ag+^a=`T<)CFt?8bwfZz^^Mm#6b){p-K)P-&?rvfb)@7H2G*GvCLT3^3OY z4dv@&Z>B|tpE}#Z+X_A-s|!_BxR6^^+8`^7ZsG{%Dpe1k#|+Z~;)^+!6Yv9F{>^+v zHHCM(q?4yt$qn5r6$h1;_`bwyBh`u0?j|`Oer}ZUcNR%Ucc;C#bMjcrN;<+W0cq!m zgAhH!2etN+xp}3_`FU_YCvc+&TY!ec2gR)XL1vBSWzBo$DP9CpUg9$eJIVOHfB)2n z1|8&Ud9(j~8U2rU+_BqTE9t0s?{XG5DM(M{ssP-=MH{plJypSiV4uScInN) zkGuI*<~(KRit+kiRf{vQb;*(B40TgF;*n^1J|aP>Of5p}XnB_4RZvfKJY-Q0+5xKG zDi4r8I8O_IxN}gT&g8m|YVAT*vPIFxfQ|>isWssr7S5nCRy|95wNekIcPa3P9pS+&Fyw{L8rcA>aLfPo-MVp*VwWRVaPb@2gAJO zmJ=|v*xX4#&Ag8m)EhtwinM@|xKVQG4mR%(wQ&EU-y*sOo6jTuei+eNQHtS5Z;0s{ z)secKJEOIr&J9cl#mLnFO!#Qtj zm#UG7@>j0S57Fl1gzf)h+RC}!zwmjG%I;YnYy0 zZSc{@L*ykK%jK@cC<|`}fy>n^ny>CEuij%AC)xG7IEmmD+^@qLB12YTKRjwSSlVks z0m^<}ZF^t&V&QZY2wqzal|H|nV7oGyJ=sUxC+F zcXxrEW(6Z4T&uLIm;}_@A*jZKqgWvBBfSp0Af80-RDHelNOMRFMLOmpbAZT{f;7Nt z0s(&?SkeR(Q15*itEEpE+aqQ&S%0Vy2R=S;NMCD>QgKZ!7OM`KBB=oKV*FgkYq%;u z6CIhuOfM@*<@9WzhMaA6V;Mca5*e2tF)>1WSppC3zcWE7`cdpYAv0R_d}J{81~4t; zpJnjoaa}xGXAogxIzH9BzH7^c@t1UxbV4u0Z9z8{;>b#`J*SB8hT3DzCmOt?934;> zQyJh!s_I=rR$oH8 z%W~Q@$xIXDU%By90<>Wo8zNmE@$Nm3KtxZ=ThUMwZ`TLQ%0V;!;}p+jMb^EZ0X{xw zG_7KhQy^Tdl*AM!9yHt<+yG`wDGnuBiSNUFM!>hrF}=NVErsC!h_SzNhWGkZN<+~D~MRwMcEWxU7T(=H6L?}}zI`RAf5d2TlC==eR; z(OigtVSRl1cMVNkOyiC&{8S0c=NZusRinhWrHSlRe$uuqu$3S5-B zj&l59Q8Tw*1A=f@4`kKYpQJ<8!%?LWZ_jRSI~R=$TTQia75X= zp}tqpJjkGZ)8#esOafj!;^^N>UyfPq{yIObckVNdsJTO~6N`_0aoA{+TW`icBIVZ| zQIui;&7ibNcAx+RsedG7Pvc~Tx{;J0u24tcP-I?+wC@sV#hv=w<{ObdV>^&N$fDWm z;#*P1YvLbEwvzQ;V7-qTgBf9r=hW}y{uH$moukZnu6eF^BrUpwIt5$hzO%qf?UNox zrCj$6)R_)g*XmB{H5DFs5+R38g@_B$Mxw);WH8h}sQA8pIIr}%c>78G^SK+|X&KF) 
zy%^8+2Y6L&k7U<@OSzAkm3EIT_0lf5iB+75`Q#b#Gmfi*|n5|S`&J=4wvWZrk!1ZINI)*u!aT= z;88dNAcBCZ{L#@%P^({|I^O7|bNO4rdN{cpO1(l@!T4mN^ZcpetS8y+QM%RB2T{6R z=G{IT&*P&eJw*m9d=vLSR(hq7O!=Th8L{BQuz@*dE#qbDgnLjkwM97AsXq*&E|V%J zXP;tZb_(1eNT>$}CmnQXy zqkRRUiZ88qN@wj^d(1<*z0RRXie3I~YEMS+xG>MWj<4yka(&ToAri4+QpeqJR z*{(5jZl)i}i_vR#Q1T zI`VO6CA(mx>APnUx|;Eh2viZV$fl1ZN&I!F4LiML5TYK!=-E+adGguGhAWjeoGKn1 z3D@RB+YBalms0h}miDb1$%~%G^dN@aYNR*;WHWZPUjg0z71Iu6Y5o2wY0_%U1ALme zeB2qmQ8m3aRkO|sz<0y@6LM+{?6>Wg0XYJn2{Eom$WGljG&1Bk@09U#I$+cXL8Kg; zULi5tXc>JgH7afq<}`W3@)4a&4!;?jLjUy>#_ZE)7tE(&M5$Lvv}+;l<;0u(n(~pd zb5`^1^cABnO2(M!P}>{(rE6janCka)gA9X;C$iVTWXiGFOf`B`ACJ$=_ zD@geJt&Jv?OaaJcMR{?CVIUNXG7eqp*<8Y{b>0@Uo0JCerj8kBM*>p7T8p( zs`)j@xOJYd9l(7EB;M8F*$Nb+#Fm;mOw9VJh6EvI%ojRDmLw3uOCb0}*6{1*IK3Fu z5}ig^N{@64y~q}vhE?ftxpSrSj53I~n;m&()~POos&gq0XKqLjHoTGDAhhT!@+h4t zLXi^E+YsMN7e+@vto<;g>o^I&3RJ932mA%4rmG;%g=sL{)Se@DM}TB_GM%Y$9Ezt# zqY?qzU{V=R_H4UR z?KbI9jSYxON$E*>$AA8tEbN;fSDbL395jMx3U9!kDQQDQClBvefqbnzznsKkmazD|{_)n%kk z+{?lH?nw<7#Y_XA`=5RjRD!re2=OD7otgjZi2s~c^+c9$^>UWO#^eY3$bmtDj@I^h z`m(J%kS6;iKwL%HcjagZrx690=kRsX>y*`h@Q|4?CWE$a_yIxsNQ;PG2$&$v2113ao8hxUWd5x-alInS3;TFX+{gHN7uHBR>q#XwxjKzai0qOxC;r~ zQ;Lv^Ta#6+l^HMwuY*Kwwbr;>w&cuoz)NSGejZ)m=;KVs)7Hg5w2M^Fzrfi?=JH`x zaPS#WHu<9^!#39Q_ET$n&BCN+!GVexiB`N$PK#JwnA44}^>`O3u1PW40)iGX;4o{= zHo;a=R&0Rw@kY#F_!6BKiIPOg%ESxNf3|(*+#*3qJdLOt3&trQbmS!CJ-P)IQHZB_UZ3xk_noZ?k(kAB6;z2`crHp3E$pf;)Q2aNDPzo)N%8~xn zVc<*IM&xtzeDmTH&J>@LxNT_Fwdd8Eta9R0I*x^X`K;t<#AfL zA4?BXF_YVFQy-Q_rtfN2hdIm;d9L*R5R%F64&|f1)snw56%U0$EZgXLz6eh_@>vZu zriNMO2FG@yQ+s#1L>5&JsJ~?6=d9DGD4cW{rG$+heDh;!1*?N8bQ7U5)MM}N^Q7dq zY(bMq#2Zm!m%DMAHszrWEOdfR;8T9lAg^u|c$HgbPXrR%6AXz5F3naB+%fQ_Y4i`# z?>=1ilzEu>7xUa3W1`JH|ElN*Epi;T=4_TuWeETgm*(#cn|caoF^mOMT)N=fX7QPAT3%1I!oweN-dDI&c*&m{53gMlp~H z2FYVZUX7`{+4aL+n!3J+KA7Cm4@|RsYSN>x*e_BdVh*V}hnx_!q4a$;-Zx=EV+s7I z?);8x@Roa&TWtEvO40cn^fIkDU&`{|o=Drd+Sc-y$xIzwXQ5)atILou&|vQlrTkbA zDoe#U6H4UlKZuNqqVcF+NdF3q*k+_IxtocRY+<5IVikx=FJS~=F=3rE5r^|l4&PSi zsV&30tgQC>{YOP-8yI6J 
zJ|9EOW7Q&P{!ns+wE`!)~= zqOd)oej&1_6Q9ymT63!XWHkrBC70A>cK9q%?`{qTq(}DU)VE$d?Y{3u>J8vEX+X|& zn{AxW0(mq?AmUB44cphohk%)AwK5x0kapmK!RRZPxM7!VGR8dI~Ob&k82GZ9bWk0L-`IJ1!V9bK8*vXQE*64u+FklSXlT#!MiI(W+q6QCR z0w5b%24LV-L0X1GbBg#t%J0Y9yY$O6e+YyuuaL-2F z_@GSuftj}04sE978@9-oFw(JNS8&rjXs=7aQI&9?vtMvnvzpnTtQb8EJG+ubQu5EM z?q}+lO)5o7)hEA<>2N~cc8ZaNZ%rgXtTj)AkLkzy=scrG04 zJn2t_;|GBqckd^$DAB{K6nK(G@#ve~w$Ut~Z8KZGou!tEW>0;@`D=MoOTUG>8a8^9 zOzL9W1V#xA@DO%zX@g>S!xg_Q56oudc#*$2(lz}(cpr}C<+pPi>80YcLB)3^%Aetm z!GkAxG(XPyuPeK+4#H_P*`ihYb_Ut8%olhCv?$Wm}1zcU;y>!mXUme zxd)3WScZRCp8naPzW?wjx~!BcTGl^m#64hNsC)#cwJB^YyDcwn?1I8|uk}?f;Ef7b zhW8IIL`FrmBFjMhcsg;>hiRVK&8&=xzFPpG_z&|6r}LPa0pR9IuQ@Hg+SZIk zIQNG^fgl>0EtAB!w^lk@eur9t)$-~E0rr9j2c%JlzZyRW$8bQ}^gKo7;)Z#5QN~al zu#yol*%nn}D7o8RlV4oZd7f)EJ=gk@(jDB`$^nTpe4zc;slw}1fvb-8+0^n2g`Np>f?P=Mo(ZwW_PAz>?oP}$>M z4Hq{&_;x5^bb5I)TPp;zVh1FzoASY0^r%%ww@Z~9rB<28c0_}6ee({(SIsRAONcYw zt|*YiX@2CaYPoAuN8^xJ>zTqN1-UWRpvSt~(=#%3%d z)Q}c!jT0*t_vU~FOdSCeY=i)^_%nf?1~S8S{syE^yV7>HH_D+hirUC;B8Y~ zxO7_VFz6Zl4$EwkU;(Ueq(1{Ikvbi&DK*3)%dZ?PXPMcx0e8hhf{3|&q0v5}?s-<% zA+@Xgs6ZILD%x2@g={xNhG7~&RkhoqDp7UNg(NgIX*PD*w`jhS&YFfH-uzQX5&0h} zcI_dewbNQi@L4rc*e#TUbJcPw9k+X4TBGEoDfRsLkYHYBbF1E`?VSci{;Q6_Yr z-re+MXz%bp%>E(qLbM(5OzY?mV{P#yqxOu%^i>!`d0yt}-NRrUD`WK&(Xmij{t@II zF7liUo;lSPR5i{045-VV7Glq)SGnk6+M`C|jwy-4HV+C@zFp7ve?f>&xoA9*<3fEV zt!H!3mb;G`NU7ldoEdMB{DFgN$m`}_CcN93Y%b6=*IBj5+q4eILD$!67htwQzCWRk z%0SMduyVZ4-62WB9O(DdrG;uF_*u;H3_dZ6Q6A4V3wbi$JVY-_{rR(73`%O`73I# z&Y;JS%^#qQ3={lz!0STJmj2sp+C-`@+AUKbaH>_0kz=>ft-;I(>yrQFpl}K zM4~$`G_HalobxK^Kcm~t#c5zmp4=((^yUU?`r_Nw$%m(u6~<) zr;N+n-Ivqq{oL$axN@-_dy;|5EVJ%$CC33CD-M;c#=^!Y-P5pdE=K!Q)X}SdWZc2x zht96+F~_0fGv~DzC6!Zf+oZ5}YCQk+4;J{NyCymnTQZdpyXHaS%gT&&#N3bK&I9fLytzACs+>5X3=1re7!28keV>v$ANw=5osJ&h0q|t^4PJ4`hHA5nQhOGhMb^3!wcC9#p;z}42GWW4bX(*oWy?c z*pSv@k}*;@0=4Jawv>O@DM5zXH;E;@Lvr>yc38luC8FItL4xL+z%RU6G01ghzs&`$#V$ zpQMr$7ST9LKjD5TsxeddnaQqDM<4(8p7gD@_vqt;5_!}!ckoXAed)MsI^3AK%E#T? 
z9zVN^g)qi8>Yi(CCb(Q+`j{}jv=4_gJFbfEL=@}^As{JgmW}9D5YTKTy!Yinx!LJ=BO7i^cc%*R(d;D2>58B^<9oI z8&aDT+=JoV{=@q|+7Bxsxhz{(%JT8Uwej240_Qd)1G66CE>AW&)f#@#$VmD-p zh|p1T$)LR#5GYz=egZrNg8Jx(P&&UaH8_LM^N+9aM1%#Qt8O+EKsdXWE7?mC=>9ry zvaO{mALO#}OMuY?a=h4l!N1ugVLyMc)zA z9QfsyVAC1IEZ0!x(cZvje@w^=ksjl zo?djmeFba9efDxXd8zRZ^-r-p)M!BP_>|aT8fBR`IenW`J1o_5@pYyjcr(LsPizW1 z%d^s@5RyDiCcgBvSp4poVt>Plk?SnJdwMnXrl%Ed7m@ajir70#hQjU?1-Z;pbm8{U z_sY+&tTZR`D!32c!nPW)GFB&JxLkuG@6;M`G*O9G1e1Kp^ic%N@;j1UM!s*Tr9_)Y zZVT7fAch%T`ZXsjJLAGOD#?s$_}S~lq|*RsorRbBZL!1gke5SJi&v@R)TRFqM$Q>Z z7%fb;ZQHhO+qV6;ZQHhO+qP}nw%v1+3Eo;0ypw(D39D*vIV?M|fob#il!Px@yE^mV zhHGWaSM(u+@$j+P%1dWSUj6`sg)TYsRSr)m`;5iC!u-6c%=Hq|AZN(}Sd`wwm-?-= zB+-~LjSiB1z3b*LhkF;-(j*SbQgjnWiRr9@6$mpA#f+1K`HT*61KXj6$n4)+4_*79 zQ?)Hrfjd*ViuvA0U90F0oA1MulndYT?NQMt#SGS2A0t77wq;idK`VklZ8Lg zkS~Z@;ze&he}&N{%mV)NHYj*dPg`4CuzHp1<79`W;G1~gQWy+M$)dfyG{C zDVRezi#aMARynuR1lt}VfibsDiOC;q!vLBLYu`Zbg&RR&ZA97UK zpx>S*E0W2FX0rBMn%x(X1>2V}d#M7{k9PH2jkLjDoDg1~WGh1%(;N^O&J4bRsl+MJ zAx5Zk-6?81gwB3}ad{E)!uJL`IRfiQ4i9DX5`$i`JF2*YUGGiz*-}Fc zF{8ZNN0AGU^^=M8LW7l5UHXz4;);nO3eeG5kz{nKAB3IttC$B7;19J`UZaa=g9}@f zdCcf-KD0W50(JdE1N7 z0kuAvIW^o;pK=%UOgHYpoQfesjE&rT*C8a^VokP_=Aneq2GJB^bqYn$oWP@CkY_6@ zY$Su6yo|KIkt_JmH^^S70*H{0J07&iW8D|L<|-4SD~M;R?l}@0?d#^<>+;po-$NZ2 zyia-m^Gt>yqKUhBkifr zvrH9BAhN(7hG{~R#qq3CCnLn$l1- z21_=DooiBica+>Ab|`N+MB`o@ld5y6n@IJxXU;QHioSHOdyiTOXv1zU9ot`8c8-+l zi~-^f?xs-Xb(1|C|7kksHLv!gd-kBob}-qhVBNW>XB*jh?G{ZX$=oZR?+JXeOt}=Q z-Zj?XROj#YO*o?fJmzyk(}0D8Jg)m16K0vK1eqFoRQZDpT6j_T!cw(wMn_(R*(>gq zs0(g9RV|WQQyXT30?UKW-M>KU=VSh$%ok>4tiEMa{Nc2`kc8OWO8I$1*p)cFf= zTO}j_OVfsRGCww_6-MfvZK}*X0MohL%dGQ6Z3Qvs+D9cNAIR?fb4rC6*F%ozQsfEZ zrD2i!iOC&*vM=}!>u7{|zksEByRFhv7L%9u4sl)QU$g%Bw$R3jhWfn!+ZWLQE zA4j&_g`IDD+SQHZd6K||qD{4GpugoEir2b zG*NPbcG_DM|3bBeYrMk%+u%6_5t}9}bj($Gw%np8avGM6@-J{^MuZ8IzW9yGMy^X8 za6r@{L3~or`wFpvM!wOeI59xxKf~UIDXPB^vOyQwW2!>Y^XO25uy(C((kuwg6Hu|q#|Qfl{5kH^QXPJxXD+Jw9FVIM&+d-v zwQWbd>>7Y|4#9vtDA1&tw1qm|!vi^EO&3=bI)!^UL#r+MGj~(lgvf>W#dMicMJSNh 
zp^Bh54_?tcr3t+AqAH@2ZFVbAI<{K%F$7{w5MxXHN%7X@k^(WN{uiS|^wHawu#;`> z2QZsNJct@JfvVgrhR=>${>xba)KIH?&vlA~$dw*6ce2ULd&?<|;cmEJb*Wen1EOwk z+N>lz2Yjr+f%mAWXw(Xd*ixD^n4K0nWomEtQU7Ugo5)jL!9Y5pxO{80=*Q)}JTGkK zygoy;$cuPm@b4TW@B_qqhIAz46R*mM^k8koJ&AF&i~}^z4Rq(tsF~@e+1C3=!no?k z!n8AC6&aQwCEoyPxC%3AH1PbxUwkn4K7%#qu@*2qPwz_l=7ZxNj~oSR6`K?h^pe4|$Dw`!fKD)d_WgAs0ppvU-g=>6o9 zRe6VfmqBikG>3!G*D2^}eU`o_&c-uF9oIBgt&&=Q*|r|;LkozwCa&P3w?2J?#>+1? z0zn`bElT|0pFDv_k%izRKi!!?h!^%g-yM&>#Jz}^}B zckBYo>V6qbS3B1Wunv<+P?Xm%mUmJW3!42(gqih8+z3GwbC;#VVBr)KIm~gk;X8sU z7oG5oqgdR22vqRQJjY~X>PG!irYihE`g%wEo%aN@ooX49_uLT{3;9aK1!VdZ{=L45CAR`ar zJeO~L(`8TLLpQ|oob5Bmg48+Oju`}=vaRkdskW#$QbyohYK%%m%(ss;7n6#evc^SD zL&^jtd6qjQv06s2jfhTX3^HtVu8n30^+Im>C@p>U&)wJ7$WO_v20z^Sf(zTnn!>@P zq~y4o*rh)LR6atRPU;r{`oE4;53Ow`+=B*zr%PJ8l?91}eCm5cS_Tt0aY?8*6ncJ} znH01J&$_9i4gr%T{HUeYOI{}Jcop!%4XW)cjGZ$zZFTU$7~&4OR2K`_mV27Pw##yE#x1_h#_8tjcqi7G-7fXRWdjhiZ=FQWWu>4yTDI#IhDGsHKVbnPnbuSbfE)lSYO=QEBiuR+wZ z)|`sP1PmRQ#!gc&aciv4)VFe@Mww_03FZzswzDw}+shNy6 zbU1M3Zch25EGMCEU>yEo@RBj~se;W}6y=aH@iUO7)$$hhCvjYt<%|WRBkhQjnJ}#$ zT-%y10;dhzU9x+6W*ET_sOJfLU}thf`!1nf<~Geh8uEC1qQ2p2ieuSx+itV`;h0%! 
z;=E|C`%?0NcNuC1t_>pG5b-HofMdL}y(pF<{)xCu)B-Q}4=2TYpRDBgw zL))K3i7K*;sjYf&!2}pEAsxRpESg5P=8=N~0ymOVEQ$bNjl86qt-?mM;*B-( zUPB1)hc90cv)f1vvQQ#SE|6B-&9{jiD$VAcNcJJJb}M)8V|M20a?y^7{?Ki(qouA^ ztUaDh%PREoQRy|}k>8gh1me_?R^@>0FL=HG#rl=AU; zd1?+Nw+e%f8`FRsjun5v%Mo{RZelS+u-eaa^c&%yK55v5&V_$=L@!&Li0D*vQT(e% zs|$_);6m3s=|ydJDZ=`|;hC><%|l5~4XCS4De($cKS(^n@t^XhOH!seii|XW2H*v5 zrX+ksN$<>D?LCjN0cKDzpcZk}{ZX~Z7xVhACNq9#9QDE+?cV8o{#T+v^;kyP1bVd{ z%nSsm@D=gL!DG2Ka&wNG5)l0}savd@uGv`T?*9;X|=8XU{z=};09WBXjt$SDGyPi$<_Mw zvawxC^S)^6624N*YE~K#j!mjKow^D#dGbl!04e)w?h= z%z^&;wZ&m32Z1%k4JiDA>L5u2XeVB064j^h3cm4`R#nvu zKHH3_q9o9rF%jpZ6mvZOn$^~U$H?#2#9wOTd2g7OM{f6}yy9tY?o9sU^V_>U^*mfA zeu(UW+|9t~wFx-@pmDBZu#T4ehq@i5@2QyD>gG2{FhuXy?F7w=w)QtXlJOILv$pwc ziCQxN@{dlt6XE0+r{#^SgWAWZ0hO_q9xFZALZRc(UKHcgEM0`xVP)fBgfM{N#p`!z zh2LZLyHB~6vu5TP7Z0D(c$T|yxu56jsMzA7n>j3c=g*QI(KHC7s%%Hp0;^4Re{qzQ zx8_HT-hQQ^+km3WFF3zWl+0#DcC7@XS(?5TmKUdbxjU*2UPsN5IL)3} z?dg94yp<3;`>lDcB0*CGQn#|lQ>zRLGKuAqh^yAl%F9#RF3^EXI92NX`h;03_2t8~ zkC}&jZnFo956Km7NlYh$ESgcV4hMAvg<_i;4-h`{zm8Z?hBTc`-*^n%qr)%9AbYEEFTqJBZMCBsUHo@GmDY|?j%>EC8YMXDZm3Hmu$_6!7 zKqR(CWs~%49t_A1xIvIl^b+RO6GBNf_ufF(%xIIJk`$;L!T)-$mDyGY;~AYg@!+Wj z?Ot0C7ZmK}W1X%bjNokV8-1=bsNFIOwNFAQGAl5UQ=b)92OuMWYkFb|oYu66V#?TE zxBRHVBgNnFIYjA2IH(QAlZ9|EF_@^@uv+XsSj!?Z>NjLs9bljGd6jB2c8Xe}*lR2_ zB|H%w|Eg&bKq8uneV}1Di+oVz#cnjGLGgR+h9Ij)eTxb=ZVj+Cn@RIT_xmi5Jj{$D={tCO z<>V!9k3$MS31jyMFFTDTuH1ynhUKDKsEz5vC+E_)}Elp>BO zrR?7ZEKHf8QRq?H+SuRI-Mi2z??2$HQB?-E5uAiZrCz2ArCkDBStPv)Zcq6bS?9e# ztV}kG4YY0{Vut~0Utloy_Ka?wto#UH2i!k1z4y4cmOpw;H&kM__s)R91DBWwsPsIN zub20cPWZ2NGZr)M`29Ok=HkF)$v2(eX}y(ZO@1jt+gbQtjw(K3O&FM-lz zt|}=K)BFp#R)MrsPyZOQXqIB^8SJs2+~|q=Pa_RxM&Jjcf^uAu{pLr$RDUc6nB8vv z3Enj)m;mW4o|^B9<;i;ONX#_8Rmx<-12cb1(!zmTyo{ARzuoT1SfVuyo0CyO<$iO|n$1)xi+d^ls&#O^51BG@Lc zsW6--0JbJmX^_CBmVP|`i!aG-A0@xc*>n?m#))+%&TrY0sm?s>)Siv-uG~J{l7H{H zxiDXeXTZQ6Gwc-;G3^aLpK!=b(eq>T)_42Di1d*TCobQZ$Prt7VwbB_t{5{9iqMx8 z`Qwf1p3}K4I~2JKf_p#^ zAao_-(HMTGatpIQWy{JJe;RWu8L~=5$^uwAhz<+!08fP|ciBtftuv 
z>893u4f2c|X;{&0A<1KOY$xakA8w)Lw5k8(6cju7@CKl%O<~BPJTfAsuNscRNR3Te?2^8w+);1tu|MtIU3yitFL)hNs zWe<6K3j%wKmJI?~o93i9;nDY3*V$HTX{z>ixm)#iyDLAV5KmRb7@oeB1w?vtql2M= zk>LSo2z7Z$GYw$+pIFS)R1E|Q$|XJu_MYDr1WGnPiFtfC_6;9~kvuG3DWpu z;>0KC&l^-BYXLyR-M#(%4-Pzh9Vkb@l*|^VCxNaM#dVz^3{|eKkDqSIREtT z`0%iQYWVo}cGleRYTEqHm{LL*-X6Te8>k#8CqT|tz^PxK6c|O;I^d5k1~C9A{}=N3 zn?5YFCnTyX0vX}nwjMA8@FsgtSm$OsK+gYd1<;b|5r7mtf2VK!@sDU8__v1>08K3Q zuk_39SwBL+)ek%?J!4~wD}CJ~@R|k?jR2cKKvF4pVPq%-&%n^^7ky!PbmNzIcX$TG z*vRlx?p?bRQb0)r)ZkI?pzq3w%?XIJor9Sx$ojW3;i8V^Zj0uh8J($}6?C(cgV?)t z9>+MO`D^c8uFM~wjup3;8~5EG0d#y$$B4d3dKVXi#d-jaZUEEa-)VOi!tXjw0386F zsi~G8m$osDynhElJ zpXxWU(&+3UUjGw)!(G=i{($Y@Ex@H;960R0pA+SO+#M}~z)#uCO9p59%M=KA+d*)RIRfAE1JbmPM}`%#abuI{dxz`(B$()BOBD)f8Z zfGR*{kdDn?o%-m=u@9QaHI2=0R|0{VWIXu14(74p>&4yfdz$7^9cL5Z24Kofj-cNT z4FDLL>FM8*7d{>H)!Ww4M-S{dzpiH4(#&1YFfQ+FZ0=vKDH~b+W zgXA~5!++mc{7b+FkTi||j92y!-ZMt|gYLi#oSytg00)pX>W4V@bxGzo?}fI+f6g1V z;kTgAJ!1R||6U9BE`FguzVOnM@r$4VD1-JFf^+9cD`4wK#PRF6)y4Hgyq{zE0Y46B z@cFDi{!90jR5n!h*7X2?v;+1D?ydee_p*1g^&4<^H1Qm_{stW|@hku8cKbzqgn#Si z0_V4VM}B(G$NB1aY37H1WpH%)9T)Ewb;Q5l_3ei;_E`Ipcl;Jj=l6Q38$Cr%zR&M^ zrdt!$^}`Re8|Cx`?u~x^3*Okf#QO`l_vG;l`&Z^Lqmz@p`=6%qF7>VU7ykE45fH=^ zU#i>np_BPq9&%do+NxxU!M%>drGu}qzjS`t6x@b$C$c#DrKsjk zQeh9GF0tD^ua$K$w1vFFndgHCo3woP){TH>ZXZS-Jui|4#>b!BPywCq-oxQ}0j4?B z8u%`uWZcd9L`;!xxZ4ds`OzGsQX>mOcUzsa{3Kb}yY_t+p$M@CUxQ0t(IB(QUmWac z@E$N0?Pw6#<@eMTj@a%qMuaT@Zs3h3^d_q9md)yVC~!?Ig{#Vmo{zJaZWnwUJ32yB z%5I@ZMRS|1`z%*Zc}sPBHcVoMwB6v;-E%)>+Q(Sk7-% z{>mffO7nV<_}Z|pE!;M|FPQVwkg^{^7vKKv6Ghu3C27qAUPCsIbJ_#Pz1#&>UFzFW zRM4MRZbCqGh-TLMv=BqOy<*6CEv)Y3f&nyW?XGA6dq_WJS;NS=o5$VcPm`e_YnJxy z_~ce@`Pd?fN0(^mU*v+2p-Jk!GI|#j5GipYUj%iXyF;~5ZS31`QT=tMN6fu-*fHjN zvC#T~Zl0p(KoaMtxOC^|m~ucW@xYQqxyMd61M=T$Dhq9)#0Yk^gasUK=?bs!&ga2l z(P3bH0f{}1P{ahi=UyqrlaVpP*G3%Lt!+Y)Uw?7op^syIt2w0la@A~UKcWvFc3)-6 zHF~4UOgyz6LwI`hnR=EooaK#=a7@)Fk-c(56}uoye6!*GOOgI(@U2eXZR1({n7NI! 
zkH(HfK3Yk;-rzQ)1{$f@lJ6G~J|?{4n3>5Q_}*=BO>Tf@QkE#SSW#L<`uB?6B8e=v zD>!2)`Pu;R;{vS4fwv8O|(058;V}{x)dCeb+)?ixP{CW zp^ol2hEY^n@>9|PW%J}s{tIMhn*)X7tgeD~Z4S3TC^ zLT`eGV-3j#%^}8xy+#yz$_32^Ww;9PK2O@26KA>eNIi2_fV9~(eZ7ABWe#E2l{0Ly z>FMoRz3XMiv^PGsye@5|?KIk_a}MH$WoFNlRM#V5AP>NR_j{BB38%mZ^Ws@G#I68K zWB&J)W9GUee$2+x!Q+xSv`%`Ml*~h^VVYO7$5Zp~qIW=LwRP;oUe2#`@XSbGb^9lU zoZz6H1xHtU(K*Zee$&C;az?)?&JZNasu@Y#)$QSyMxriBX;5#QH3}5*92(eu<32pf ziZoUwR0-2@@v+WJ8pmO9KbC0{pQ!MBqekZ*ZsRLjJ#)(OSxe&!g!=8J-^rsE^8;6g z2_aW$g!#Yi+4gr_4AKL{V&N4&EeE$jL^_L8dIPhSna-^ifF|=$7nHl%K$ItMeurT+ zgG_QGgjC#(vkk2|Z0-l(4a0_f?;jE>i@)Q>`Rc zr4;tc_&B48+gAy~wcw%Gt0?qZRryvk&PDaW2` zjfD+g8g@Q(kvvuwGBB%Rxr*GPz=j$3Z+{B$5UMurFd5SHuw2@Pnrex;D*I#HiczwW z%1EN=io7=3VfzMy@FHAc)OuIvBl?J;^LI)gfKRzFhP%rNvhm`=X&*z;2C<=kjmzzw zpPuZ>0cCKfVTh^KCvM7ai#sgE5prh({SGg869y}XJ@KW2M%`Mm#bCer2S9JJBu8yU zhmI8M=9NFbM>q1J3_lW=mkvxRu8W8QS@<&{=ekJ|>J4tdAEOx9ZBbqU7l3Wj*Z`GG zhGF-#D3*ZbQn;zDL0gmI-Z8lwl}x3~CF15TAW%k=%tUtxmOLGVdSVUC^Tjyw|Kh@) z5r9j#YKNbAy222_9o~I)ex<&gj86%ETx=^oYGJ`sF4&h~#XnUpw8i|D7T1Vx3vk^< zMZ36n1S_b`HN@8>4^tC5MMI3kZ?E{K%8opo$Rr@dG`{kDn}c*KFt~4a%8z&jXu`hi zWFct!E`4qkfsgxV%a!MPL?KMGQD6BmpaJI-9J6{q4vRhvbWDu=|0QGk>eH6^YMER( zOeq7b`RbuHM3S!rf8tut5ZW`zWRxh|gZM$@VkPJu+>5?Sg7|NuvnfNt2Cc>yI$KW7bhnRQ^Ts~du+X!hh zTlkg-V8DY)51kVUKTk0JyI{>uPsL=~dr%kf(iN!L{})d58V>f2u*EQ8zRiYyX#fzH zDBXmYB|i-shRl3?#};;QvQ}ndyIzA*<4h#(@KI1Zt8w>|H~OL>S#)|k5zHq|IIh%a zrzbQ1*JT)%{a2v;4su_j(pItKVtu_qrKbm2$yvZ?3T%bl8(VG zzIt?JJwRxNdHFgrXXx5{9Z1`0Cs(B3!<}va#p1l@PD1sFc2;8Ps)M}lp$6I(bMt`P zMV=8D>GmC~$m6Pfw9NNqtbHF(qYC+Asn66I8|kSbks-yy+-dc|>oZ$lb{8`CLqug~ zL%VhiJ7olDX7M2dZ|iieEV&_xagPfx`foY7)jao`?NB&wYwX2zfSE3axyk5nBEWQA zilGt>qMD&gO>{XUnlkl#dI+gz=3r8Ny)lJ-#-;etSyUZ^E4NKm#g*=vLY`9Pa^rWc zhW5Bo;P`h}qS3PMK^MiV#J9*ekBWImorr$nFv#ElZ3h4`G3GYR=3f#~zW60yL50G1 z0l@}}E7bcIMz!4C*^~Z8R6F9>e>=>=7&e7;YDOL``b6@^79nE{PdKoj@T&PEIOV*; zw6Di9Pvj{&VOD~6;8OI&ZYVhRt`p%9EFe6>MEg9Ch?By1OItkX=3NT1<|JX;{PDCQ zcL&)qcbr!&QrH=0(Xw!K6V-|V&KHbj3%|xB)3IsKOMgsdN|qi4VIsAmm0uk7{%RGp 
z4-V@DgnT(uK@?dqQjl(GuNAry8I>EnWDHJG77C*jcn(NhkfUIsPLxlPAz99V84}At zQ$+8qOp(Z~S)5(xgTU3oets0tiW!*Om61l#S;^R)!!MNbk-ibb7WmDRLmqF!JJUMg zKtCI60G!mRyQMB!)bg%MoQ<{zTOCN9Phayy^KUh=o-(#yo5T#piZZrTlO(W(Z%A^g z_oF)p%yh}#SO61Xo#@(z(kV6^)%)DFg?kFEnek??u&|rGg8~U z7uJf4?U@)SpI%oB5|FL~Tny&b`x-5Hq#H_%YHu_p!d;td+~?M19~PXnxMAQe4j~kq zCO67d!URiR0p7Z4o_&&aGyVSeRr1Vc@dsuS3F^?XIeWq&GbzGsTXwzLk%z(3fqO;a zdxAKzrUO``GFBL=vRz>!W0Z_0WQ=Hz@AefoVnOG20b{flWLBqhD~4_#+(-Sq7>mfS zDmzo~hgA0hVh7)s;(Tm^-6}(jX^F|@a&0diV@=J>Qz`O{Ax5PqqUW1jbyH+Uu{nWa zMHFt1R}~%T!*_K^EIcod`8rQ*GPz7l4wD8==K^!IDq~EEZnht?}NUP;Ak0&W2nEP>FU}ZD<%3Q_W9}8sKgwaSCL02BG zVDxY#xHVcpO~aR}u}hK*du#h!rkor!oG!KL42!!nu26yH*)Q>SowT;Q-pM($3A&*5 zkdj6`SjuhbBBx3bEAHs_&)uWZT5LPd^@Cs%*^)JsF&k^21Bbc1j?WkL?CU@wR1?We zIlZFNIwrkG(yE%OOGNDay3Ukp&&5s%%T_h`bF6AEG$rrT6D6QnJt>RJQt9I9SCI;OOBP0#7vJU3cL;J2l7f`2KZn+Qennuaa!q z^VWYKi!&*5_LV z9wP=xyIQ$EJn|V^)V|d!(w2G0QN{VsPht1nEH__FWoNWBQfBYj6Ra}>#$ltGNJThf-1de>Czye{yTx=}Gru;U ztO$Ny+bBq;$3CcG)7ozcgwc&YXb~2qw~~(4<3$Ne~2V89j5f)`rpME1f!Uw){3pGvA6DlR^tKC+QFG$5! 
zW`xl)c#3dRntM-Z1kd29?uuGC8WH$};9(C)^SiPpJPC#eKwep78<7eJ+61M17FDE| zZ=1-6iieV`&I$W3Acxg1Lw%9)&$B?+P~my>p)Am&hh`-DSTrmCUdkp|+8PQ9o=NrH#yl$NKJUW~UD7${ywT>FBYHY29yYcxC!4h8_JD)l9RH; zEOEpg%_b+`2dq8&lF?V1SK&TXcY>>j?a59_T}uk{6`qaleZ`Xke+UC?{Kj_n=;+nv zis#ZB?JrftbuUvUmehD*P&B(Wv*RRcH?{odChSM?op8&EZG|r9(<~o;Ek-|gj5p5M z?T~2N9f<&R3iaP14=-jj_!}flfcOF^?w|0XS#Ca@di+Mf(=q%8fEx(2Zr&*$jk#lk zVvnYw!ZFf45yk`e2JX7@kLw!)K44ayL}1!+EZkQO+I1sV!n1qlwVcH*W6T~lny70< zg&uro;(Y%J0={c*rPsRJR=5bOcmjfE?P{+)z@AVH%vl{qxUmfcmCw^SaYvYynf|J| zw;d`XmK)FpGP=gijfOQsmQ^zq79e)M#$65;^Hc`;icf zX~flhR-ZESUc2itf@Fv2SeKzq{}{$v0>QOic$q^cn$FM;!m*anaxhtj%Lny;`!Pz4 z(@klCYnohg;wP)HtSo?&c-SAYR&Aj3eM!a6whx<<&22*-+6)g6Y3zw`%~)yQOLgF~ zO5@CgTs#p-5z47|peT`SgNTZEBta_VX^J#-Z&W_XqR+%=U) zng1$bHjq?qQa#McjEyS{W(utM&HDmgg34*?2;aLzeJP6ZZ&&3jd_2&$ zzbl`4rLP!)At3+D00_sQyG@Le>_y3pz|)nMhbnfA)YK4+H;Sg%-7U#_1P^%e`5F>w z8Gw<=XL~?zu#XXH)I7uG8pRpQBryv!Z9Kg(55?^^!Ovt)|LR zI(8IM2Pq9MNZfY{5sAhogb1|HsAi{r+!AG^cPATQ}%QirBqP{4d4Ms@ zOAcS?ee=+1HrI_pr$zf7$Q9|(WUUtpfAXo>*s@V8lD!JiFaZ1}x9xH24UodN@K{>b zx8R!U(3Hj)QyjvNTfz1uDFd{i z;1c@jaD$xUK&qH)b^nw6%T{ z(Cy-L=?Rj)j}`3^Em!j!_7z4)YetnnJWn=wJmumrn8rbScC1!^!1u1n8Jg&MDq6-i zB`mkiFm^d1DWEyIR2#2X*Jo|eRgUGHHR&?w|AFhU5VB~uCM!N_AyYk#dp+t%vaGQB$bKf^(sG)ddx)Rh1d|o2@2t0i88r>iNQGfeIB|7$Rx?c*g zgMFUjrHb-eh|vO8RYcw2d(1zg9%%@-Qm}{AHB``;11G>DJEXy7$fp2ZV&Iu==BRh4 zpqaw3IbN@!{_3_`BhFs#o4}~s(psP1eSj`T*I2r)&zwMNda@WURyPpe0J9FEDNhE+ zr$G3EB8XtudVSZEioA54da}POCLBYP^}T9edlC48S9 zLdexH@p%p|{pGY67I-@g1|jMrf)N)_n^;O4A}&xbQX(U&E(WAV)r%{fDTINyRkufh zT__AsI@}Y6`ozLssMd<3iI*V-YolO1Z)8n$qvBhdqWo1=H7eXB-&fm2GjFq=_>DV( zi}vK7#YJ8ySkQvxKNnF;3$HBO$AQ6vuwb#Y^7U~o$~JojQ}d{i4jd`Irw{Vusn@OW zRxwVi+JA4Yv1A*!VI_yxd#y&g6(7LRwH}2-<+mN!Rd!q=;!!o-$N$?gg8T}a;%qi>QB>yqhp+>OQH#TKC%HN$Jk+g%kAz~R+b`Ak5ag@*XR>ygkmvD}{<{z=>HN~7{f zj+BBi~gr{PlG;oF@?) z@9OyMchZ~)SL#BBINj^{+1XtCFVXwx28Bz)LFHQ1TU6QD&Z`=Kg#eey8E{*&Sqp_? 
zvy~^C7GH5m)b@}~Hb~L-an3-k%}621=IC)a{!rd{m8SkvX^(a!k+0Rsae#ggiUa>z zo=}rv8aA4;V-8GJ-~mtll3Y+cHkUqn2M46i!?Bnu-ws6?)8JnzJ<(c^naf1%AlVw~ zIvfgqVEm6`G`{?-tIo6Xv$VrmH4(o$NNMfcE_OlsNHvXo?qkYMSq?UzLyvO(CZO}m zM`*S{v>NgM+>f7zG#|2`X2|wSyfzNb3#WkXG7HxlJHBqcc8VU*=7hOwA#r7V;!6tr z0br-Wm$RA!SZa4sK+ACsW22O2jH0-4-qo+uY>6U@N5b|{@tYklvh*AmGH1$B4R1wY zaHD)vFzEMY@o%%C=dPs=Ul;nT>Obq$A&@^d?R%q2_0yBQu!vunSyoUJz#QZZ9!u3t>^t z*XzOWzf|ENc8{St3BdltGzTMJzFym4VM6em`H;oief>(+x%{Gk;B=Zp_4dtMHvcir zs@tjg;u|n`xuPuFv=m5&lF~Cw+)Ds|#gh1(ojPH4Uy_FNdMr^gr91KFL>J0xsb<|Y z2qT46z_#5=U}CY$IlvaQdm;mH1}!*>lX@5nSrqcrxJ-WSi!9{fFLnPnhx8ht-)`>_ z2U}_IZxG#r)|CUf#08#_f5`f4CfbX>2t%8zt!1h-$1*$O?7szNPrQi7Mp&s|n@u&Q zd99ivYcorjQ#qPWIGNY9ya1Rrn!p*Ldh?Ue43^|)C*ik|NFl_Q7=g<>tA`$098Qeq zd5UEjNbg!pDzw1jPz91~-|e&b5q7Q|_+%fTfNyXrFK_m829u?%jAgO!YBlY?0NsN2 zd`xjFbs7bV(SIT1!S%gh!|#`hxUFtv*H}^Rl*0%lWtreVA|&fj>NY_08EAcoZX3$* zG`5W{)>-gp$V|r4z$(WcPkGDb)#a8iG>vEy z3z()NTDk{C>O`L+YOikaCdTo*Hl{@|uYk9~0S+LelWuTSCUz?nNG@*IUHs8;(+k;KC5?i<<*< zN5)hDlTyv~y=3sqf|#`{USnbta&~aXXlwl64bR0t=h44}kg&d`I4Yk&#w^1cl-xf@AKne^WxE-;*=wocP_h`UTz-TTEyi7bh_^l3t_c@Y zq$&z%p5?g?#`4k>ifY%2b2rA;uj62Hfdet{C(o#ZGW+P}JEui|i=C#(p5~>pwZ!^x zh~xanSP^Rf)AOA6O2xj{pT;mfu$j2)jt_6$iH1Qq`RYDSw9Dim`KDHPBp3fgWY)Y? 
z?`9YqC`*HM_%qtxE?TllKwZiq-%@OD1`WGyu>4K&0;U&CvI0ijF-wpE+!Hh@i?B)C?>K#wl{2pXM<7gj@`dXrkaA+Bn=v|@xh;xkw>$9 z*`8p`s&HkW32Pgi6?u?=YEXzwiJ>{f&tAjig#aCf`QZYk87DE}Jz*a;>3*rQ%@dbA zR+ToCfKzKSe^%(R;zuPnGhF)~qoe`?$4HLT-D`O*^!9})K?0J<}B;c8Tts0MZ;?;D|$E1)^ zK(C>#p$5p?ElzFPTB@fQTBVMih~!=ap}!bvttdrtt_otPxckduua2b$sefQtpyoC< z1PK9tlXW;`tm7|PiHr-iWiK>13)*J3L_0WB^u7v-{H3HrSF2Tn&F2_Xp6wI_^(FXu zSOm9U@8oTI^zNNK?Lar!QcTgTE?!mhp;;Shz$$*bb&=o#lbwdti)Ua!4gHMW>vYO= zP@;_5tsJ-qXU>X-#4=ifc1$UTm*ZOXQc`0VA^X!bl{rmHzs_Xp=jE!YJS}P2x92Y2S zBCx=rx zq?Wnwvo!;N-*52ZukxWgoQ(|_LS)*rDmAnd%@?K2`a(oNxtmv6`r;r18Hlgv;a(}+ zlreIoX^1L_a_aiu0wFFDkLR8#S@Y@xn~?vQ2Hnh5#1jiFA`Ci}5|I=wcrUeG*{ zyq^V0>(A=$U8}2WecikEZqcs8va1lB5>JZ=nI$~E~(uAjroh0JF*EvO-nG`CMm`G35FTTb2p}(3HKZrdiNx0nbD)7F% zNS-QM$E;sX4y$)FzI@+M=%kj|<8cInh=7b);c<&|lCU%Hhs3=*F2#7ZH>_z zuGh?QhZk&2VGQ=|W;z$Ix4ZDsTYP2*7pg`!ht#4OhC7lxL|`A#wPYkmNzD@S)>sTl zmDt8|Oe^7L1Eb(UVR5Yp!ciiJkEW@CAZh8{w?o-NRUj}@L_R&!;HW<)<0+b8@+u$5 zCcHq@P2Mwf=nyI%ZKJ%q56RJb2^#tUIa2TOfZ+!ppVkL~UTC-1CSqOo9eHx{PxFgq zypH%WkjS@3%s#JpSkW9BlC9N7GdhbxIwG%kbMjex30HmYs0Wtm{r;aHJfUH;U48|-hB?3qV=MW_` zqrwqmVGu;s-Pz3DxFbX{86TFNT@Y^j=7_76R2im_YF{VADXwvGrn655SKo_J(yZ5D zP%OyaYT13_RP0}9b>AP@mO4Phn$RdTKdJmnYsDR$&aRXihDEiNv57|7IIoRDnQE+` z=)pjzSaI$uLLcCZc7%b9prJMhrLDGfF->59P?Me+;+0kr&4Y&0ao~lq3U~6KwZ5Hd zWpAN%>j@d8V|Pv}%TQ`-9;nmk`HJHav&JeqBK0n&Fj=zo!tQo$D3rEJwRxc8&(!d< z@iO6SP;X$=`ujzLpK9b9yOpE6=Ozxif@(E+XXoxmVBMzqHxctPn&mX3Sps9hPVi}8 zp^#cy91MB)-TJlV)#VpcI&s^UMAnrOm+d=>Xu_v9=4sy!H`Kv=qO5ta98~ISn4>;8 z%{Ofh%9xnwonqIU$D6wCgKza)Gm9%*v+%lASHNh-^VICd*Qh%@xQMrDbDrhZ^-oF7 zqIL9t@)PSDny8@x$9qQA1frhr#OY}PNTgo2xJ=$R!I`pvsI|rpG z6yV(rc*$Um`xllAVaPE>UT`8h(>rRL}&ZXw^8cgOg%u@ky_X~@`@>%OC&Sz z9Fb&Ax)9gOcA>=5*WYZ$TjkMmCaOb*uYIm0S3I*EN==wZdll0b`kNpjFn_y4Lq0Ne zq-(j&QjZm|%22GZ>fLJeYhZ0gN%MO%!TeHI8#(nRssF~W+EcQL$*v+^wQSQ@?e z&(PwLE&;vCP(H%-#!?w|^lwvhmSr$BiWqNslA1c=eM!{vqm^}fp^IDLY|MG|sXLN% zo~*d{Y{bNLObu1$xlC!+#{9u;$1{1iFLL#R4Hk%{ zcPWRDRX^7kWS}rqOaAbPJ2S&~3{0V1X?K2L;w$;s=5ufh1KLS2J}et9ABR&Eg5RRA 
zT{V40Z7}fE9*E=5#|WLu$zv>$Zj3$(&=hQTD$hF7rs~SClQPJ?IFci7`?mP_YtXI# zeGw$Q!MSV?6U|9#UF$JxLp+oyg>8BEA@xlYsl96}gVL<7{xnGp}Dp~Q3<)4bp^lRF?gUB@{x&mLe zQbZ1W4ZnTI$oTjOXAsBKq;S@$DPZc;%-2K5L+nFSYr*TxS1Ga`y(G|w1-61n=hsO; z2?R<`_qCf_6FgyQ0Io#|iXY!1t)Kd6viGqKGk$w2%>#meJnc8w#xCpdY2V+zS}$Sv zlD%B|N_2=E?Ne=@U}hHH6J(sk!473aH2VjypXO5YBb{*nhie&Xb+#|_VqU0l!b!C8 zbX1_u(r$jx`(IgY&ol0-Eg&r*CeMmqgDh>*6wk~B7chNDFy)Wj%x^SvkM&UvbHyg_ zDt34%9zWwXI5w^S&d&2GO}i({-c)mbJDe^)g6&YzeJ0o#Vp{)J(GzZ62qi_wPO+&_ ztbuZZow05pOjRZ&9rqbY6+LR@16M%|%t1obu|2gNTy~ZEk#d{0fgUi;1>Z@xXxR5y z0Wg%%W$h#2Fs{pfuwZG8#(Pz_OMDg~)0k%R-sLx^Qyz_sG7aTontAhn+~s zS!0B=ET#QX>>x>#&MwU4C>F0AYeMKL|v zqn(=5v7Rb&p>K`I(mv=ufM@GggL#$v4_fs{Oq&CO(DVT8&JASgHwj-m`?E{*ZA^;x z1zvjp%Dr7~A6UY`r)!p>vJ{s1;Nf+}AwmDs0zK*V+i9y>tIJKuUP8z$smN0nA3}O( z?Q#K&MK@Jg4P5s!r5fmcn10G%?e&rQiWaiL%Jg8uvSzoq9Ofbu&+checHS~Pi^@+& zB(Pf$$7Gs4>B%9xi2F10luvDzKQXQr7b;UhCLHK#TaG)jjsBKa$zrk6KdOYq@R{W& z#|yu+<#?UV?YA7|6SI95S5HwT8zhUOwqco=7g?PK$4%w&#)CABeDq>R=y$I%V$aU7 z(}{{4Gu2$j1+zBEv*#l{?CXThC@!wwJ5DCr`^X0u%?*ok{`@%GNXi~%bc9V4Ta6_5 z62q7Knzm4BjYGH0l~^a4-=WgiAe2w+Yn6b({@@1raRH$R!z5S6U=f)Q9-qad^G>~N z>13*ZC-!E}@mhU>zs#&?6wPEIyn3=1hLLssg0Us-2YRVdzJJM|?jcxldiaCBB4)e+ zePA;Foyn&?kOKYwcLHtIL`AyF_#pv11TAQ&K}qglLqGVF1wrW8g5ArVFpLZ@kP)_& z>q;wVC+OR1&A_WK$Far!Z7ZBRia1GGRFVn0G#5b+>j_ceBeGY?pVOSPFTESeo`b&* zWGvpP^g4mvo^m#J4*&_puCx4N&YKQg+=~YqSY)xjPEb%2Sb;=O-|N5&z4nqzRU|O1 z90juw8?pq?Su8M!IaRwWbs_~~*YP*w>hx{u?8P>)VqSl;@7Qb#B_O!q6yyJRf_ea) znSMs0Mf~d)6Luh6RWr|x8n%Ebba7O8NAiS}g9Cq8L=>hCs2cnx{&MTCMA`?X7xlzX z#N-Numgz93tY+Bj)ZgN3Wt_{+V<=HCBPwvwt4VG&oRI2P5f&K9`3xVMR4}=m+ORzU zL08|P0Ncz%jEVYp^1+rYSVBAG%W~SNNPP~sJ5yc8RT>+TlD4*QB1Q`z^Y%B?oQf*) zI~;fP#y%-aK2=xcrakSXtQDRWYqt*~i) zaaf*iPKUr;T7`xZEp`ZgkPABZl#5Ygqi9NV%E>5h+gfDYTe7NiiAVcqJ6q~G4q1 zoDK6j617t#4!hJ>S*0|K>5}3M%2E!RA&3kLxuSlxL~(0t;9l9QL2vxJ5bjPX7wy^6 zN-mR_`h{mrkWCgL>`2<8=Vy2XeBT^3>eJEroi(f98eymw=dgnErNcS{BDWpF@i_Qd zW&y5Z`dI?Id%m0GNGD}pgXKyK*Hn%+*yNjIhLK@@8O_=kM}~mVU8ukw$VJ~*lIsN2 
zzJZIK>LD^vSJF1=MP|VNeX?;rjNezKp>L-k+U*H&Q)nQwp%|war^P{Cni`BBU6j7- zi7~s)7|ELKR@&cTnbPbb{ERb6g(8Gj7aKqDrwdomG)Nh>_p@`VhhU$KGxu`}o^Jf8 zp&n<4>HeD)EkQm|xC&V6!v-rE2QkpVtZUmyZK5;3eo(6`!a18Rn>a7^h_K2~@UroY zNu^&Y+ePUkL?fjLoBJN}ORkBQ$KcD;&elY6nN+d{>9^)|; z`c#Ru&~Ploue7~?ZRG))H7oD*F1Y3P6aDm=uKc*b>Q$v-X_2{6nw`wH#Mac05`8%m zX$ci>#`p4aE(ge(r^fiyE^O-7z0=fq*XqKIV>_y1uO@ZS+kU-p_TT|)LnG=d&>=23 zA0JaNUut5(55ySfuBNy_W>jaUY@q*;j~Mne6_tW8(2clW)c zke>ivZ%gqP*%wt8Goe1AA8RhWS1Iqe3Bmy>QTIbPQA$RV;fYit98ZRUo^OnhHwtD2 zYO=$;v%k$X487*b_xFg5i4fky&n~MAV&4yo#C@E&lltbPG>P$D$z3Ku4MqZ*s97s% zJHMMx`7KkAR+3|Zkn67_o5#6s%JjbcF06DZ=BT3To^X~!Q3ngnEekGn8JUn zzyu2Oi~ggo5(orvD>^tsZ9IV7ip~~L8+jWmS8E%9loY@d>S1Hy0`OgQ(N%X{=OJlF z==@t>s)jzW=v^$e=5U165!om?VZs3E?lC#lurI&x8JjT?`xt7 z{^>Ht5U{yFmfPxSmZfs(>4|fAcv>Sy0}_{@(8=-7m6ZH6*I7AhYksxBX4W`%Y(8As zwD`e^|8pm^!`Wm*URdD#eH&7;9IsWryN@-~6k4}a<~-Nn6-?OI1S5Mkyj${;i~h=lYTv^A4TpDsPz+2S64shhr3Qz(2t-@;O~Ipsn5n3CDng&)a(bmg|&&*etQN)vc175$Kia> z3s|-^>b5Q`=V^|tYY-_28<2U49Z&olK!)iMHq0PL&ov>JAN|JI=Toa9`fvake7|u1 z{6NCk{DqPgs~-+WIUc$MLsh6<1omo-_i8t<#T?>uy%9ECTX`-G)XytLZWq+vEpoMi zv$0O|FvRNWw2Fn0i4ehlf`7?CY_4LwP5d2gj^7+1W1F-gF+}+W-6Xu1Ha=n(;H%+8 z_0?~qWn+ShgugiZY( zyv3~Zws^y(SKL4FqQY}!33F1i=`g>Kc&=gVhMK00Uauwj?a-v;y6;nzJ#$?m1O-3h&&tG7LOppD%YqBH&xGEJ?y)f6A9t6e1F zc$)i-u9a}Ag^qLH2L1`uph-%a{QDoJ$-loXI=3$T4R3uY)u9y7=NDt;))J z!ZW`2y*^%-S5Jo9#w{8U)#6Q){$pFN4{qeQn$y<=V~>crOuRL}GAl+F{w&_pAhYJ7 zb`h#-QZkzy|I;pQB=pC7kOFaEUk_Sue%4yi)Fjtj>)ho-BlO0z(_-)Qvi$7KA2=iJ zhFfk~R_Q5YyJfm_K_dzE2np)46-`KA5l(0^eo^-j0H>(?b@%H^Bit3QiVr8{W*2rV zX6Yxsq_+Y|a0bmMBLvYm5_kq^rcaGgG-NUHL~zOXjK!^L!m?#8)f0WuiCVD8k$#zs zXq1Dmhp_k^gI%3M#T0PdpNO?2gmPPxQwYR}96t}yDis)#f%9QX(is=^W*f$^caOm0 zDW}IhH(C18x1YZhWcX-Eo{%wwe}n50*PRj)9<AfD;kCd*|o3h zNbq%B^lj*ibYRyzhAuEDs2m+xOmwPNyqWQBh`ac4g-21kW%3?-o@$Y97P1X1jSVYt z4k>XCD^aB-`^!k}Mg&Z=BFp$swYZ0sI_PK`9~To%BQ1@WhSY9Yz%(P$UIc0X6lu?Z zw6{DR;D7|J(L@E4+^y9mLq-Z)&pTGwzcjd+bQjp?b{sU;900LtjAMRaMAc^S-C)W{3xgHYW@F-fITeS+-$5LqmYHOrwxGn 
zHCVxbOVP{O+0p_6`AZkDy@Mz4as5*#4xpZmhv(zbKz=S>0bW5-J|RJVPChQ7e=GYB z6@efJZyVsFR(#ynHr}j2CpQbIy{C-@(EDEx3vuyt3H*ciSH!b{Si4#}0!06hEx>+m zHb8DoOGh233xHeWu~ore@zTQ+3jE)W1wau|-v2t79_X34E{Wj;Ue&H{P(?+(zr!U# z%Z56{|ALsL3iX9ad`ZeCce5PVYlU_N6>SP?v|4<5+limb7$+ei!Do-T8iucegIaVf zGnE^HseJ<+B2gtE3Y$moiHa#9kSDk%2#x9Mj+JTUsx|@O46$3F?BwQ&iTKr!th%&< z@$kMS>QVfBrHoX>O9Jaa_CSK~mdPR0begxn@oc$X6!07(K@89QWp6yA%^l+q68jH_ zRCXv~oZtM7i?isyL%Vkx^e&H zL^(f$7BOKHqI#^qY1?#&Q1O^dy3PZj-&s^j2#EUi+`oNS*}QvkX}e0i*{%xHhwX~| zK=*0%uh~Aer|IjA7~`hOh-otOId7pX1iYr-WYt*E5U$@^<)VKLtJ&H7u>kd#3=seF z0xM%*p|yzGhU%t78h*`{t6Z3nQ^zz7|DK)DrjrRn0IRulJB=rWE-5!T0=((2k5x+* z^58D-BT>U>oEkezc`&7$c{`b(S*Fh}hd%U3;T=nwt}>dg3I1BeY#$&@au4(IP1dy=61( zi?V|XA%gUZGmlW+PcC(X7!()s`Rbj0CCWLZj?{7;FrFvo;|*|u)q%G(Vw)*~T5J+2t_^OXfkv)_d9GA#z=my$0wg7Gw zh_#I`(1-`f$7cfI)^YHE^dWozZhfE;FYvKj$s4b6;y9C8;khTC8DYIJbuH|Bahzv4dn zBxOR{{tNO_ve0>n1pZWVazqqA3JON?Gw#3J%M)th0rmBGw2%M~pRg!^kx@ZY5%52~ C53;@h literal 0 HcmV?d00001 diff --git a/doc/heatEquation2_example.dox b/doc/heatEquation2_example.dox new file mode 100644 index 0000000000..55a2c90c70 --- /dev/null +++ b/doc/heatEquation2_example.dox @@ -0,0 +1,14 @@ +namespace gismo { + +/** + +\page heatEquation2_example heatEquation2_example.cpp + +Here is the full file \c examples/heatEquation2_example.cpp. Clicking on a function +or class name will lead you to its reference documentation. + +\include heatEquation2_example.cpp + +*/ + +} \ No newline at end of file diff --git a/doc/performance_benchmark.dox b/doc/performance_benchmark.dox new file mode 100644 index 0000000000..0f4a632781 --- /dev/null +++ b/doc/performance_benchmark.dox @@ -0,0 +1,196 @@ +namespace gismo { + +/** + +\page performance_benchmark performance_benchmark.cpp + +The aim of the performance benchmark is to provide a ready-to-run +application to measure the computational performance of G+Smo and its +underlying libraries on your computer with the specific compiler +configuration. 
It implements a suite of benchmarks that measure the +performance of certain low-level operations such as the computation of +the dot-product between two vectors or the addition of two vectors +(AXPY) as well as high-level operations such as the assembly of system +matrices. The performance benchmark is particularly useful when you +run G+Smo on a new computer architecture (e.g., Apple Silicon M1, IBM +Power10, or Fujitsu's A64FX) or have updated some of the underlying +libraries (e.g., Eigen) and want to see if the changes have improved +the performance. + +Though the performance benchmark can be run in sequential mode it is +recommended to configure it with `GISMO_WITH_OPENMP=ON` enabled to +take full advantage of G+Smo's OpenMP parallelization. + +A list of all available benchmarks can be printed by running +`./bin/performance_benchmark --list` which yields + +~~~~~text + G+Smo + Geometry plus Simulation modules + version 21.12.0 +Compiled by GNU 8.5.0 (C++ 201103, glibc++ 20210514, eigen 3.3.4) +Running on Intel(R) Xeon(R) CPU E5-2687W 0 @ 3.10GHz (memory 125 GB) +web: http://github.com/gismo + +The following benchmarks are available: +#1: Memory copy (native C array) +#2: Memory copy (gsVector) +#3: Dot-product (native C array) +[...] +~~~~~ + +\section RunningTheBenchmark Running the performance benchmark + +To run the full performance benchmark with the default configuration simply type + +~~~~bash +$> ./bin/performance_benchmark -o benchmark.tex +[...] +=== Memory copy (native C array) +... 100(50) +... 1000(33) +... 10000(22) +... 100000(14) +... 1000000(9) +... 10000000(6) +... 100000000(4) +... 1000000000(2) +... 10000000000(1)[failed!] +... 100000000000(1)[failed!] +=== Memory copy (gsVector) +... 100(50) +... 1000(33) +[...] +~~~~ + +By using the `-o` flag the output is written to the file +`benchmark.tex`. If this flag is omitted, the output is written to +\ref gsInfo. 
+ +In default mode, the performance benchmark runs each benchmark for a +sequence of increasing problem sizes starting at 100 and increasing +the problem size by a factor of 10 until the total system memory is +exceeded. The latter is indicated in the output above by the trailing +`[failed!]`. We will explain below how this case is handled by a +`memory_safeguard` mechanism that detects insufficient memory without +trying to allocate the memory in the first place. The value in `()` +indicates the number of times the particular test is executed. For very +small problem sizes it is advisable to run the same test multiple +times and average the result over the number of runs to reduce the +influence of inaccurate time measurements. + +The output file `benchmark.tex` is transformed into a PDF file using +the command \c pdflatex (see https://www.latex-project.org): + +\image html figs/performance_benchmark_memcopy1.pdf + +\image html figs/performance_benchmark_memcopy2.pdf + +Each group represents a different problem size. By default, each +problem size is run with 1, 2, 4, ..., `omp_get_max_threads()` OpenMP +threads, which is represented by the different bars. + +A list of benchmark results for different computer architectures, +compilers, and operating systems is maintained at the G+Smo Wiki. + +\section CustomizingTheBenchmark Customizing the performance benchmark + +The performance benchmark can be customized using various command-line +arguments. One or a subset of all available benchmarks can be selected +using the `-b` flag, e.g., + +~~~~bash +$> ./bin/performance_benchmark -b1 -b 4 -o benchmark.tex +~~~~ + +will run *benchmark #1* (memory copy (native C array)) and *benchmark +#4* (dot-product (\ref gsVector)). + +The problem sizes can be defined by either providing a list of values, e.g., + +~~~~bash +$> ./bin/performance_benchmark -b1 -v 100 -v 500 -v 1000 -o benchmark.tex +[...] +=== Memory copy (native C array) +... 100(50) +... 500(33) +... 
1000(22) +~~~~ + +or by providing the smallest (`--vsizesmin`) and largest +(`--vsizesmax`) problem size and, optionally, the factor (`-V`) by +which the problem size should be increased, e.g., + +~~~~bash +$> ./bin/performance_benchmark -b 1 --vsizesmin 100 --vsizesmax 1000 -V 1.2 -o result.tex +[...] +... 100(50) +... 120(33) +... 144(22) +... 172(14) +... 206(9) +... 247(6) +... 296(4) +... 355(2) +... 426(1) +... 511(1) +... 613(1) +... 735(1) +... 882(1) +~~~~ + +Here, the `vsizes`-family of flags refers to all vector-type +benchmarks. Similarly, the `msizes`-family of flags (`--msizesmin`, +`--msizesmax`, `-M`) refers to all matrix-type benchmarks. + +The sequence of runs can be specified in the same way, e.g., + +~~~~bash +$> ./bin/performance_benchmark -b 1 --vsizesmin 100 --vsizesmax 1000 -V 1.2 --runsmin 4 --runsmax 80 -R 1.3 -o result.tex +[...] +=== Memory copy (native C array) +... 100(80) +... 120(61) +... 144(46) +... 172(35) +... 206(26) +... 247(20) +... 296(15) +... 355(11) +... 426(8) +... 511(6) +... 613(4) +... 735(4) +... 882(4) +~~~~ + +Here, the smallest problem size is executed 80 times (`--runsmax`) and +for each larger problem instance, the number of runs is successively +reduced by a factor of 1.3 (`-R`) but not below 4 (`--runsmin`). + +Finally, the number of OpenMP threads that should be used can be +specified globally by providing an explicit list, e.g., + +~~~~bash +$> ./bin/performance_benchmark -t 1 -t 4 -t 8 +~~~~ + +runs all benchmarks with 1, 4, and 8 OpenMP threads. + +\section ImplementingAdditionalBenchmarks Implementing additional benchmarks + +To implement additional benchmarks, copy one of the existing ones and +adjust the constructors and member functions accordingly: + +\snippet performance_benchmark.cpp Implement benchmark eigen dense matrix-vector multiplication + +Here is the full file \c examples/performance_benchmark.cpp. Clicking +on a function or class name will lead you to its reference +documentation. 
+ +\include performance_benchmark.cpp + +*/ + +} diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index c3b19abf5f..191d851162 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -56,7 +56,7 @@ class memory_safeguard }; //! [Implement memory safeguard] -//! [Implement benchmarks] +//! [Implement benchmark native C array memcopy] /** * Benchmark: native C array memcopy */ @@ -111,7 +111,9 @@ class benchmark_c_array_memcopy return "Memory copy (native C array)"; } }; +//! [Implement benchmark native C array memcopy] +//! [Implement benchmark native C array dot-product] /** * Benchmark: native C array dot-product */ @@ -170,7 +172,9 @@ class benchmark_c_array_dotproduct return "Dot-product (native C array)"; } }; +//! [Implement benchmark native C array dot-product] +//! [Implement benchmark native C array AXPY] /** * Benchmark: native C array AXPY */ @@ -230,7 +234,9 @@ class benchmark_c_array_axpy return "AXPY (native C array)"; } }; +//! [Implement benchmark native C array AXPY] +//! [Implement benchmark native C array dense matrix-vector multiplication] /** * Benchmark: native C array dense matrix-vector multiplication */ @@ -295,7 +301,9 @@ class benchmark_c_array_dense_matmul return "Dense matrix-vector multiplication (native C array)"; } }; +//! [Implement benchmark native C array dense matrix-vector multiplication] +//! [Implement benchmark eigen vector memcopy] /** * Benchmark: Eigen vector memcopy */ @@ -340,7 +348,9 @@ class benchmark_eigen_memcopy return "Memory copy (gsVector)"; } }; +//! [Implement benchmark eigen vector memcopy] +//! [Implement benchmark eigen vector dot-product] /** * Benchmark: Eigen vector dot-product */ @@ -383,7 +393,9 @@ class benchmark_eigen_dotproduct return "Dot-product (gsVector)"; } }; +//! [Implement benchmark eigen vector dot-product] +//! 
[Implement benchmark eigen vector AXPY] /** * Benchmark: Eigen vector AXPY */ @@ -429,7 +441,9 @@ class benchmark_eigen_axpy return "AXPY (gsVector)"; } }; +//! [Implement benchmark eigen vector AXPY] +//! [Implement benchmark eigen dense matrix-vector multiplication] /** * Benchmark: Eigen dense matrix-vector multiplication */ @@ -476,7 +490,9 @@ class benchmark_eigen_dense_matmul return "Dense matrix-vector multiplication (gsMatrix/gsVector)"; } }; +//! [Implement benchmark eigen dense matrix-vector multiplication] +//! [Implement benchmark Poisson 2d visitor] /** * Benchmark: Poisson 2D */ @@ -529,10 +545,7 @@ class benchmark_poisson2d_visitor return "Visitor-based Poisson2d"; } }; -//! [Implement benchmarks] - - - +//! [Implement benchmark Poisson 2d visitor] int main(int argc, char *argv[]) { From f2f3e1a94dde16d0516115f54121fff0c0032d06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 14 Dec 2021 14:46:00 +0100 Subject: [PATCH 087/174] info about extra libs --- src/gsCore/gsSysInfo.cpp | 84 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 2 deletions(-) diff --git a/src/gsCore/gsSysInfo.cpp b/src/gsCore/gsSysInfo.cpp index 9bfece3f07..a0cdd8de14 100644 --- a/src/gsCore/gsSysInfo.cpp +++ b/src/gsCore/gsSysInfo.cpp @@ -500,10 +500,90 @@ namespace gismo { std::string s(""); -#ifdef __INTEL_MKL__ - s += "MKL "+INTEL_MKL_VERSION; + // CoDiPack extension +#if defined(CODI_VERSION) + if (!s.empty()) s+= ", "; + s += "CoDiPack "+util::to_string(CODI_VERSION); +#elif defined(CODI_MAJOR_VERSION) && \ + defined(CODI_MINOR_VERSION) && \ + defined(CODI_BUILD_VERSION) + if (!s.empty()) s+= ", "; + s += "CoDiPack "+util::to_string(CODI_MAJOR_VERSION) + + "."+util::to_string(CODI_MINOR_VERSION) + + "."+util::to_string(CODI_BUILD_VERSION); #endif + // GMP library +#if defined(__GNU_MP_VERSION) && \ + defined(__GNU_MP_VERSION_MINOR) && \ + defined(__GNU_MP_VERSION_PATCHLEVEL) + if (!s.empty()) s+= ", "; + s += "gmp 
"+util::to_string(__GNU_MP_VERSION) + + "."+util::to_string(__GNU_MP_VERSION_MINOR) + + "."+util::to_string(__GNU_MP_VERSION_PATCHLEVEL); +#endif + + // IpOpt library +#if defined(IPOPT_VERSION) + if (!s.empty()) s+= ", "; + s += "IpOpt "+util::to_string(IPOPT_VERSION); +#elif defined(IPOPT_VERSION_MAJOR) && \ + defined(IPOPT_VERSION_MINOR) && \ + defined(IPOPT_VERSION_RELEASE) + if (!s.empty()) s+= ", "; + s += "IpOpt "+util::to_string(IPOPT_VERSION_MAJOR) + + "."+util::to_string(IPOPT_VERSION_MINOR) + + "."+util::to_string(IPOPT_VERSION_RELEASE); +#endif + + // Intel MKL library +#if defined(INTEL_MKL_VERSION) + if (!s.empty()) s+= ", "; + s += "MKL "+util::to_string(INTEL_MKL_VERSION); +#endif + + // MPFR library +#if defined(MPFR_VERSION_STRING) + if (!s.empty()) s+= ", "; + s += "mpfr "+util::to_string(MPFR_VERSION_STRING); +#elif defined(MPFR_VERSION_MAJOR) && \ + defined(MPFR_VERSION_MINOR) && \ + defined(MPFR_VERSION_PATCHLEVEL) + if (!s.empty()) s+= ", "; + s += "mpfr "+util::to_string(MPFR_VERSION_MAJOR) + + "."+util::to_string(MPFR_VERSION_MINOR) + + "."+util::to_string(MPFR_VERSION_PATCHLEVEL); +#endif + + // OpenCascade +#if defined(OCC_VERSION_COMPLETE) + if (!s.empty()) s+= ", "; + s += "occ "+util::to_string(OCC_VERSION_COMPLETE); +#elif defined(OCC_VERSION_MAJOR) && \ + defined(OCC_VERSION_MINOR) && \ + defined(OCC_VERSION_MAINTENANCE) + if (!s.empty()) s+= ", "; + s += "occ "+util::to_string(OCC_VERSION_MAJOR) + + "."+util::to_string(OCC_VERSION_MINOR) + + "."+util::to_string(OCC_VERSION_MAINTENANCE); +#endif + + // OpenNurbs +#if defined(OPENNURBS_VERSION) + if (!s.empty()) s+= ", "; + s += "onurbs "+util::to_string(OPENNURBS_VERSION); +#endif + + // Spectra library +#if defined(SPECTRA_MAJOR_VERSION) && \ + defined(SPECTRA_MINOR_VERSION) && \ + defined(SPECTRA_PATCH_VERSION) + if (!s.empty()) s+= ", "; + s += "spectra "+util::to_string(SPECTRA_MAJOR_VERSION) + + "."+util::to_string(SPECTRA_MINOR_VERSION) + + 
"."+util::to_string(SPECTRA_PATCH_VERSION); +#endif + return s; } From feca32b4c88cb6e51405efc2ff43d4289aa5db4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 14 Dec 2021 14:46:28 +0100 Subject: [PATCH 088/174] print info about extra libs --- src/gsIO/gsBenchmark.cpp | 2 +- src/gsIO/gsCmdLine.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index 0cf2a420e7..a6e017706b 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -131,7 +131,7 @@ namespace gismo << ", " << gsSysInfo::getStdLibVersion() << (gsSysInfo::getExtraLibsVersion().empty() ? "), \n" - : gsSysInfo::getExtraLibsVersion()+"), \n") + : ", "+gsSysInfo::getExtraLibsVersion()+"), \n") << "CPU " << gsSysInfo::getCpuInfo() << ", " << "Memory " << gsSysInfo::getMemoryInfo() << "\\\\\n"; diff --git a/src/gsIO/gsCmdLine.cpp b/src/gsIO/gsCmdLine.cpp index 89e25e56a9..cbbd378432 100644 --- a/src/gsIO/gsCmdLine.cpp +++ b/src/gsIO/gsCmdLine.cpp @@ -433,7 +433,7 @@ void gsCmdLine::printVersion() << ", " << gsSysInfo::getStdLibVersion() << ", eigen " << gsSysInfo::getEigenVersion() << (gsSysInfo::getExtraLibsVersion().empty() ? 
")\n" - : gsSysInfo::getExtraLibsVersion()+")\n"); + : ", "+gsSysInfo::getExtraLibsVersion()+")\n"); gsInfo << "Running on " << gsSysInfo::getCpuInfo() << " (memory " << gsSysInfo::getMemoryInfo() << ")\n"; gsInfo << "web: http://github.com/gismo\n"; From 1a13c625fd5b0fc0d74e2923803a6c66b0ff1fa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 14 Dec 2021 14:46:48 +0100 Subject: [PATCH 089/174] better implementation of averaged benchmark results --- src/gsIO/gsBenchmark.h | 67 +++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index ae63f4f90e..132c15ec18 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -150,7 +150,7 @@ typedef std::array Result; run(const std::vector& nthreads, index_t nruns, T& benchmark, metric metric) { gsStopwatch stopwatch; - uint64_t benchmark_result; + uint64_t benchmark_result(0); double benchmark_metric, benchmark_runtime; std::vector results; @@ -160,46 +160,47 @@ typedef std::array Result; omp_set_num_threads(*it); benchmark_runtime = 0.0; - benchmark_metric = 0.0; + + stopwatch.restart(); for (index_t run=0; run(*it); // number of OpenMP threads - res[1]= benchmark_runtime/(double)nruns; // averaged elapsed time in seconds - res[2]= benchmark_metric/(double)nruns; // averaged benchmark metric - res[3]= (double)metric; // benchmark metric + res[0]= static_cast(*it); // number of OpenMP threads + res[1]= benchmark_runtime; // averaged elapsed time in seconds + res[2]= benchmark_metric; // averaged benchmark metric + res[3]= (double)metric; // benchmark metric results.push_back( give(res) ); } } catch(...) 
{} From 4a7c7278eb84f62a7374c27f048efd60fa8503f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 14 Dec 2021 14:47:17 +0100 Subject: [PATCH 090/174] C++ zip implementation to iterate over multiple iterators simultaneously --- src/gsUtils/gsUtils.h | 120 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/src/gsUtils/gsUtils.h b/src/gsUtils/gsUtils.h index 0997a55369..d488a8b6f4 100644 --- a/src/gsUtils/gsUtils.h +++ b/src/gsUtils/gsUtils.h @@ -15,6 +15,9 @@ #include #include +#include +#include +#include #include #include @@ -294,6 +297,123 @@ size_t size(const T& t) } #endif +#if __cplusplus >= 201300L +template +using integer_sequence = std::integer_sequence; + +template +using index_sequence = std::index_sequence; + +template +using make_integer_sequence = std::make_integer_sequence; + +template +using make_index_sequence = std::make_index_sequence; + +template +using index_sequence_for = std::index_sequence_for; + +#else + +/// \brief Backport of std::integer_sequence from C++14 +template +struct integer_sequence +{ + typedef T value_type; + static constexpr std::size_t size() { return sizeof...(Ints); } +}; + +/// \brief Backport of std::index_sequence from C++14 +template +using index_sequence = integer_sequence; + +/// \brief Backport of std::make_integer_sequence from C++14 +//@{ +template +struct make_integer_sequence : make_integer_sequence {}; + +template +struct make_integer_sequence : integer_sequence {}; + +template +using make_index_sequence = make_integer_sequence; +//@} + +/// \brief Backport of std::index_sequence_for from C++14 +template +using index_sequence_for = make_index_sequence; +#endif + +namespace +{ + +template +class zip_helper { +public: + class iterator + : std::iterator().begin())...>> { + private: + std::tuple().begin())...> iters_; + + template + auto deref(index_sequence) + -> decltype(typename iterator::value_type{*std::get(iters_)...}) + const { + return 
typename iterator::value_type{*std::get(iters_)...}; + } + + template + void increment(index_sequence) { + auto l = {(++std::get(iters_), 0)...}; + GISMO_UNUSED(l); + } + + public: + explicit iterator(decltype(iters_) iters) : iters_{std::move(iters)} {} + + iterator& operator++() { + increment(index_sequence_for{}); + return *this; + } + + iterator operator++(int) { + auto saved{*this}; + increment(index_sequence_for{}); + return saved; + } + + bool operator!=(const iterator& other) const { + return iters_ != other.iters_; + } + + auto operator*() + -> decltype(deref(index_sequence_for{})) + const { return deref(index_sequence_for{}); } + }; + + zip_helper(T&... seqs) + : begin_{std::make_tuple(seqs.begin()...)}, + end_{std::make_tuple(seqs.end()...)} {} + + iterator begin() const { return begin_; } + iterator end() const { return end_; } + +private: + iterator begin_; + iterator end_; +}; + +} // namespace + +/// \brief Creates a zip iterator +template +auto zip(T&&... seqs) + -> zip_helper +{ + return zip_helper{seqs...}; +} + } // end namespace util // This macro assumes the operators == and < to be present and From 025a2e214a85b4e6fcbe20b80b2bdad473fe3f5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Thu, 16 Dec 2021 20:48:23 +0100 Subject: [PATCH 091/174] small fixes --- src/gsIO/gsCmdLine.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/gsIO/gsCmdLine.cpp b/src/gsIO/gsCmdLine.cpp index cbbd378432..7f39970323 100644 --- a/src/gsIO/gsCmdLine.cpp +++ b/src/gsIO/gsCmdLine.cpp @@ -435,7 +435,10 @@ void gsCmdLine::printVersion() << (gsSysInfo::getExtraLibsVersion().empty() ? 
")\n" : ", "+gsSysInfo::getExtraLibsVersion()+")\n"); gsInfo << "Running on " << gsSysInfo::getCpuInfo() - << " (memory " << gsSysInfo::getMemoryInfo() << ")\n"; + << " (memory " << gsSysInfo::getMemoryInfo() << ")" + << " with real_t:" << util::type::name() + << ", index_t:" << util::type::name() + << ", short_t:" << util::type::name() << "\n"; gsInfo << "web: http://github.com/gismo\n"; } From 126b30666df20d6c418ed94798baef2e75fde796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Thu, 16 Dec 2021 22:15:52 +0100 Subject: [PATCH 092/174] small fixes --- src/gsIO/gsBenchmark.cpp | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index a6e017706b..51ed3daad8 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -60,31 +60,31 @@ namespace gismo switch( (int)it->at(3) ) { case metric::bandwidth_kb_sec: - os << "ylabel={Bandwidth in KB/s},\n"; + os << "ylabel={Bandwidth in KB/s (\\textit{larger is better $\\longrightarrow$})},\n"; break; case metric::bandwidth_mb_sec: - os << "ylabel={Bandwidth in MB/s},\n"; + os << "ylabel={Bandwidth in MB/s (\\textit{larger is better $\\longrightarrow$})},\n"; break; case metric::bandwidth_gb_sec: - os << "ylabel={Bandwidth in GB/s},\n"; + os << "ylabel={Bandwidth in GB/s (\\textit{larger is better $\\longrightarrow$})},\n"; break; case metric::bandwidth_tb_sec: - os << "ylabel={Bandwidth in TB/s},\n"; + os << "ylabel={Bandwidth in TB/s (\\textit{larger is better $\\longrightarrow$})},\n"; break; case metric::perf_kflop_sec: - os << "ylabel={Berformance in kFLOP/s},\n"; + os << "ylabel={Berformance in kFLOP/s (\\textit{larger is better $\\longrightarrow$})},\n"; break; case metric::perf_mflop_sec: - os << "ylabel={Berformance in mFLOP/s},\n"; + os << "ylabel={Berformance in mFLOP/s (\\textit{larger is better $\\longrightarrow$})},\n"; break; case metric::perf_gflop_sec: - os << 
"ylabel={Berformance in gFLOP/s},\n"; + os << "ylabel={Berformance in gFLOP/s (\\textit{larger is better $\\longrightarrow$})},\n"; break; case metric::perf_tflop_sec: - os << "ylabel={Berformance in tFLOP/s},\n"; + os << "ylabel={Berformance in tFLOP/s (\\textit{larger is better $\\longrightarrow$})},\n"; break; case metric::runtime_sec: - os << "ylabel={Runtime in seconds},\n"; + os << "ylabel={($\\longleftarrow$ \\textit{smaller is better}) Runtime in seconds},\n"; break; default: GISMO_ERROR("Unsupported metric"); @@ -121,9 +121,12 @@ namespace gismo << "\\end{axis}\n" + << "\\gettikzxy{(MyAxis.south west)}{\\ax}{\\ay}\n" + << "\\gettikzxy{(MyAxis.outer south east)}{\\bx}{\\by}\n" + << "\\path let \\p1=(MyAxis.west), \\p2=(MyAxis.east) in " - << "node[below right, align=left, text=black, text width=\\x2-\\x1]\n" - << "at ($(MyAxis.south west)+(0,-100pt)$) {%\n" + << "node[draw,below right, align=left, text=black, text width=\\x2-\\x1-10pt, minimum width=\\x2-\\x1]\n" + << "at ($(\\ax, \\by-10pt)$) {%\n" << "G+Smo " << gsSysInfo::getGismoVersion() << ", Eigen " << gsSysInfo::getEigenVersion() << " (" << gsSysInfo::getCompilerVersion() @@ -161,6 +164,13 @@ namespace gismo << "\\usepackage{pgfplotstable}\n" << "\\usepackage{verbatim}\n" << "\\pgfplotsset{compat=1.18}\n" + << "\\makeatletter\n" + << "\\newcommand{\\gettikzxy}[3]{%\n" + << "\\tikz@scan@one@point\\pgfutil@firstofone#1\\relax\n" + << "\\edef#2{\\the\\pgf@x}%\n" + << "\\edef#3{\\the\\pgf@y}%\n" + << "}\n" + << "\\makeatother\n" << "\\begin{document}\n" << "\\usetikzlibrary{calc}\n"; From ff3febd8fb94adf5f57ce64466b3c820286c23f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Thu, 16 Dec 2021 22:16:20 +0100 Subject: [PATCH 093/174] util::to_string for std::tuples --- src/gsUtils/gsUtils.h | 50 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/src/gsUtils/gsUtils.h b/src/gsUtils/gsUtils.h index d488a8b6f4..de0552b3e0 100644 
--- a/src/gsUtils/gsUtils.h +++ b/src/gsUtils/gsUtils.h @@ -13,9 +13,10 @@ #pragma once -#include -#include +#include #include +#include +#include #include #include @@ -41,7 +42,7 @@ namespace gismo */ namespace util { - + #if __cplusplus >= 201103L || _MSC_VER >= 1600 template // we catch up char arrays std::string to_string(C (& value)[N]) @@ -72,7 +73,7 @@ std::string to_string(const C & value, int digits) convert << std::scientific << std::setprecision(digits) << value; return convert.str(); } - + /// \brief Checks if a string \a haystack begins with the string \a needle /// \ingroup Utils inline bool starts_with( const std::string & haystack, const std::string & needle ) @@ -344,7 +345,7 @@ template using index_sequence_for = make_index_sequence; #endif -namespace +namespace // anonymous namespace { template @@ -404,7 +405,7 @@ class zip_helper { iterator end_; }; -} // namespace +} // end anonymous namespace /// \brief Creates a zip iterator template @@ -414,6 +415,43 @@ auto zip(T&&... seqs) return zip_helper{seqs...}; } +namespace // anonymous +{ +template +std::ostringstream& tuple_to_stream(std::ostringstream &oss, T &&arg) { + oss << arg; + return oss; +} + +template +std::ostringstream& tuple_to_stream(std::ostringstream &oss, First &&firstArg, Rest &&... restArgs) { + oss << firstArg << ", "; + return tuple_to_stream(oss, std::forward(restArgs)...); +} + +template +std::string tuple_to_string(Types &&... 
args) { + std::ostringstream oss; + oss << '['; + tuple_to_stream(oss, std::forward(args)...); + oss << ']'; + return oss.str(); +} + +template +std::string tuple_to_string_cxx11_compatibility(const Tuple &tuple, util::index_sequence) { + return tuple_to_string(std::get(tuple)...); +}; + +} // end anonymous namespace + +/// \brief Converts tuple to string, assuming "operator<<" defined on all items +/// \ingroup Utils +template +std::string to_string(const std::tuple &tuple) { + return tuple_to_string_cxx11_compatibility(tuple, util::make_index_sequence{}); +}; + } // end namespace util // This macro assumes the operators == and < to be present and From 1158ef5efa2e597aa7247b21a63c1d4c94b4e174 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Fri, 17 Dec 2021 16:55:26 +0100 Subject: [PATCH 094/174] deleted trailing whitespaces --- src/gsCore/gsSysInfo.cpp | 62 ++++++++++++++++++++-------------------- src/gsCore/gsSysInfo.h | 6 ++-- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/src/gsCore/gsSysInfo.cpp b/src/gsCore/gsSysInfo.cpp index a0cdd8de14..42e0f26d9c 100644 --- a/src/gsCore/gsSysInfo.cpp +++ b/src/gsCore/gsSysInfo.cpp @@ -55,7 +55,7 @@ namespace gismo // return the compiler version in the specific CMake format #define DEC(n) n #define HEX(n) n - + /* Version number components: V=Version, R=Revision, P=Patch Version date components: YYYY=Year, MM=Month, DD=Day */ @@ -424,7 +424,7 @@ namespace gismo #else /* unknown compiler */ # define COMPILER_ID "Unknown-Compiler" #endif - + return util::to_string(COMPILER_ID) #ifdef COMPILER_VERSION +" "+util::to_string(COMPILER_VERSION); @@ -583,21 +583,21 @@ namespace gismo + "."+util::to_string(SPECTRA_MINOR_VERSION) + "."+util::to_string(SPECTRA_PATCH_VERSION); #endif - + return s; } std::string gsSysInfo::getCpuInfo() { #if defined(_WIN32) || defined(_WIN64) - + int CPUInfo[4] = {-1}; unsigned nExIds, i = 0; char CPUBrandString[0x40]; - + __cpuid(CPUInfo, 0x80000000); nExIds = 
CPUInfo[0]; - + for (i=0x80000000; i<=nExIds; ++i) { __cpuid(CPUInfo, i); if (i == 0x80000002) @@ -607,38 +607,38 @@ namespace gismo else if (i == 0x80000004) memcpy(CPUBrandString + 32, CPUInfo, sizeof(CPUInfo)); } - + return CPUBrandString; - + #elif __APPLE__ std::string CPUBrandString; std::size_t size = 32; - + // Supply an oversized buffer, and avoid an extra call to sysctlbyname. CPUBrandString.resize(size); if (sysctlbyname("machdep.cpu.brand_string", &CPUBrandString[0], &size, NULL, 0) == 0 && size > 0) { if (CPUBrandString[size-1] == '\0') size--; CPUBrandString.resize(size); - return CPUBrandString; + return CPUBrandString; } - + #elif __linux__ # if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__SUNCC_PRO)) char CPUBrandString[0x40]; unsigned int CPUInfo[4] = {0,0,0,0}; - + __cpuid(0x80000000, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); unsigned int nExIds = CPUInfo[0]; - + memset(CPUBrandString, 0, sizeof(CPUBrandString)); - + for (unsigned int i = 0x80000000; i <= nExIds; ++i) { __cpuid(i, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]); - + if (i == 0x80000002) memcpy(CPUBrandString, CPUInfo, sizeof(CPUInfo)); else if (i == 0x80000003) @@ -646,7 +646,7 @@ namespace gismo else if (i == 0x80000004) memcpy(CPUBrandString + 32, CPUInfo, sizeof(CPUInfo)); } - + return CPUBrandString; # else @@ -655,15 +655,15 @@ namespace gismo gethostname(hostname, HOST_NAME_MAX + 1); return "Unknown-CPU ["+hostname+"]"; - + # endif - + #elif __unix__ - + // No generic implementation yet - + #endif - + return "Unknown-CPU"; } @@ -683,7 +683,7 @@ namespace gismo else return "Unknown-Memory"; } - + uint64_t gsSysInfo::getMemoryInBytes() { #if defined(_WIN32) || defined(_WIN64) @@ -692,29 +692,29 @@ namespace gismo status.dwLength = sizeof(status); GlobalMemoryStatusEx(&status); return (uint64_t)status.ullTotalPhys; - + #elif __APPLE__ - + int64_t memsize; std::size_t size = sizeof(memsize); - + if 
(sysctlbyname("hw.memsize", &memsize, &size, NULL, 0) == 0) { return (uint64_t)memsize; } - + #elif __linux__ - + long pages = sysconf(_SC_PHYS_PAGES); long page_size = sysconf(_SC_PAGE_SIZE); return (uint64_t)(pages * page_size); - + #elif __unix__ // No generic implementation yet - + #endif - + return 0; } - + } // namespace gismo diff --git a/src/gsCore/gsSysInfo.h b/src/gsCore/gsSysInfo.h index b1db96819b..6d2fa9e9ab 100644 --- a/src/gsCore/gsSysInfo.h +++ b/src/gsCore/gsSysInfo.h @@ -21,7 +21,7 @@ namespace gismo class GISMO_EXPORT gsSysInfo { public: - + /// Returns the version of G+Smo static std::string getGismoVersion(); @@ -47,7 +47,7 @@ namespace gismo static std::string getMemoryInfo(); /// Returns total system memory in bytes - static uint64_t getMemoryInBytes(); + static uint64_t getMemoryInBytes(); }; // class gsSysInfo - + } // namespace gismo From 70e762f905753dbb645de6545e43dae294c3e186 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Fri, 17 Dec 2021 16:55:43 +0100 Subject: [PATCH 095/174] small fixes --- src/gsIO/gsBenchmark.cpp | 105 +++++++++++++++++++++++---------------- src/gsIO/gsBenchmark.h | 68 +++++++++++++------------ 2 files changed, 100 insertions(+), 73 deletions(-) diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index 51ed3daad8..95d067525a 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -52,44 +52,65 @@ namespace gismo for (auto rit=results.cbegin(); rit!=results.cend(); ++rit) os << (*rit)->get_title() << (rit!=results.cend()-1 ? 
"," : ""); - + os << "},\n" << "xtick=data,\n"; auto it = results.front()->get().cbegin(); - switch( (int)it->at(3) ) - { - case metric::bandwidth_kb_sec: - os << "ylabel={Bandwidth in KB/s (\\textit{larger is better $\\longrightarrow$})},\n"; - break; - case metric::bandwidth_mb_sec: - os << "ylabel={Bandwidth in MB/s (\\textit{larger is better $\\longrightarrow$})},\n"; - break; - case metric::bandwidth_gb_sec: - os << "ylabel={Bandwidth in GB/s (\\textit{larger is better $\\longrightarrow$})},\n"; - break; - case metric::bandwidth_tb_sec: - os << "ylabel={Bandwidth in TB/s (\\textit{larger is better $\\longrightarrow$})},\n"; - break; - case metric::perf_kflop_sec: - os << "ylabel={Berformance in kFLOP/s (\\textit{larger is better $\\longrightarrow$})},\n"; - break; - case metric::perf_mflop_sec: - os << "ylabel={Berformance in mFLOP/s (\\textit{larger is better $\\longrightarrow$})},\n"; - break; - case metric::perf_gflop_sec: - os << "ylabel={Berformance in gFLOP/s (\\textit{larger is better $\\longrightarrow$})},\n"; - break; - case metric::perf_tflop_sec: - os << "ylabel={Berformance in tFLOP/s (\\textit{larger is better $\\longrightarrow$})},\n"; - break; - case metric::runtime_sec: - os << "ylabel={($\\longleftarrow$ \\textit{smaller is better}) Runtime in seconds},\n"; - break; - default: - GISMO_ERROR("Unsupported metric"); + if ((metric)it->at(3) & metric::speedup) { + switch( (int)it->at(3) & ~metric::speedup ) { + case metric::bandwidth_kb_sec: + case metric::bandwidth_mb_sec: + case metric::bandwidth_gb_sec: + case metric::bandwidth_tb_sec: + os << "ylabel={Bandwidth [speedup]},\n"; + break; + case metric::perf_kflop_sec: + case metric::perf_mflop_sec: + case metric::perf_gflop_sec: + case metric::perf_tflop_sec: + os << "ylabel={Performance [speedup]},\n"; + break; + case metric::runtime_sec: + os << "ylabel={Runtime [speedup]},\n"; + break; + default: + GISMO_ERROR("Unsupported metric"); + } + } else { + switch( (int)it->at(3) & ~metric::speedup ) 
{ + case metric::bandwidth_kb_sec: + os << "ylabel={Bandwidth in KB/s},\n"; + break; + case metric::bandwidth_mb_sec: + os << "ylabel={Bandwidth in MB/s},\n"; + break; + case metric::bandwidth_gb_sec: + os << "ylabel={Bandwidth in GB/s},\n"; + break; + case metric::bandwidth_tb_sec: + os << "ylabel={Bandwidth in TB/s},\n"; + break; + case metric::perf_kflop_sec: + os << "ylabel={Performance in kFLOP/s},\n"; + break; + case metric::perf_mflop_sec: + os << "ylabel={Performance in mFLOP/s},\n"; + break; + case metric::perf_gflop_sec: + os << "ylabel={Performance in gFLOP/s},\n"; + break; + case metric::perf_tflop_sec: + os << "ylabel={Performance in tFLOP/s},\n"; + break; + case metric::runtime_sec: + os << "ylabel={Runtime in seconds},\n"; + break; + default: + GISMO_ERROR("Unsupported metric"); + } } - + os << "title={" << title << "},\n" << "]\n"; @@ -116,14 +137,14 @@ namespace gismo it = results.front()->get().cbegin(); auto ite = results.front()->get().cend(); for (;it!=ite; ++it) - os << "Threads=" << it->at(0) << (it!=ite-1 ? "," : ""); + os << "Threads=" << it->at(0) << (it!=ite-1 ? "," : ""); os << "}\n" - + << "\\end{axis}\n" << "\\gettikzxy{(MyAxis.south west)}{\\ax}{\\ay}\n" << "\\gettikzxy{(MyAxis.outer south east)}{\\bx}{\\by}\n" - + << "\\path let \\p1=(MyAxis.west), \\p2=(MyAxis.east) in " << "node[draw,below right, align=left, text=black, text width=\\x2-\\x1-10pt, minimum width=\\x2-\\x1]\n" << "at ($(\\ax, \\by-10pt)$) {%\n" @@ -142,7 +163,7 @@ namespace gismo gsJITCompilerConfig jit; jit.load(GISMO_CONFIG_DIR "jit.xml"); std::string flags = jit.getFlags(); os << "Compiler flags "; - + for (auto token=strtok(&flags[0], " "); token!=NULL; token=strtok(NULL, " ")) { if (token[0]=='-') { if (token[1]=='I' || token[1]=='L' || token[1]=='l' || token[1]=='W') @@ -150,7 +171,7 @@ namespace gismo os << "\\verb!" << token << "! 
"; } } - + os << "};\n" << "\\end{tikzpicture}\n"; @@ -170,15 +191,15 @@ namespace gismo << "\\edef#2{\\the\\pgf@x}%\n" << "\\edef#3{\\the\\pgf@y}%\n" << "}\n" - << "\\makeatother\n" + << "\\makeatother\n" << "\\begin{document}\n" << "\\usetikzlibrary{calc}\n"; - + for (auto it=benchmarks.cbegin(); it!=benchmarks.cend(); ++it) (*it)->print(os); - + os << "\\end{document}\n"; return os; } - + } // namespace gismo diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index 132c15ec18..cf46e8528f 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -18,22 +18,23 @@ namespace gismo { - + /** * Benchmark metrics */ -enum metric { - bandwidth_kb_sec, - bandwidth_mb_sec, - bandwidth_gb_sec, - bandwidth_tb_sec, - perf_kflop_sec, - perf_mflop_sec, - perf_gflop_sec, - perf_tflop_sec, - runtime_sec, -}; - + enum metric { + speedup = 0x1, + bandwidth_kb_sec = 10, + bandwidth_mb_sec = 11, + bandwidth_gb_sec = 12, + bandwidth_tb_sec = 13, + perf_kflop_sec = 14, + perf_mflop_sec = 15, + perf_gflop_sec = 16, + perf_tflop_sec = 17, + runtime_sec = 18 + }; + /** * Benchmark: driver function */ @@ -68,7 +69,7 @@ typedef std::array Result; const std::string& get_label() const { return label; } - + const std::string& get_title() const { return title; } @@ -111,7 +112,7 @@ typedef std::array Result; const std::string& get_label() const { return label; } - + const std::string& get_title() const { return title; } @@ -152,25 +153,25 @@ typedef std::array Result; gsStopwatch stopwatch; uint64_t benchmark_result(0); double benchmark_metric, benchmark_runtime; - + std::vector results; - + try { for (auto it=nthreads.cbegin(); it!=nthreads.cend(); ++it) { - + omp_set_num_threads(*it); benchmark_runtime = 0.0; - + stopwatch.restart(); - + for (index_t run=0; run Result; GISMO_ERROR("Unsupported metric"); } - // if (std::isinf(benchmark_runtime)) - // benchmark_runtime = 0.0; - - // if (std::isinf(benchmark_metric)) - // benchmark_metric = 0.0; - Result res; res[0]= 
static_cast(*it); // number of OpenMP threads res[1]= benchmark_runtime; // averaged elapsed time in seconds @@ -204,10 +199,21 @@ typedef std::array Result; results.push_back( give(res) ); } } catch(...) {} - + + // Convert to relative values (speedup relative to first entry) + if (metric & metric::speedup) { + benchmark_runtime = results.front().at(1); + benchmark_metric = results.front().at(2); + + for (auto &it : results) { + it.at(1) = benchmark_runtime / it.at(1); + it.at(2) = benchmark_metric / it.at(2); + } + } + return results; } - + private: std::vector benchmarks; }; From ae84e98916cf29083a71e9c2031297495e2aa127 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Fri, 17 Dec 2021 16:55:56 +0100 Subject: [PATCH 096/174] small fixes --- examples/performance_benchmark.cpp | 382 +++++++++++++++++++++++------ 1 file changed, 302 insertions(+), 80 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 191d851162..b7c4c55f12 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -17,40 +17,79 @@ using namespace gismo; //! [Include namespace] -//! [Implement benchmark macro] -#define CREATE_BENCHMARK(_benchmark, _label, _sizes, _metric) \ - gsInfo << "=== " << _benchmark::name() << "\n"; \ - auto bmark = benchmark.add(_label, _benchmark::name()); \ - auto riter = nruns.cbegin(); \ - for (auto it=_sizes.cbegin(); it!=_sizes.cend(); ++it) { \ - gsInfo << "... 
" << (*it) << "(" << *riter << ")"<< std::flush; \ - try { \ - _benchmark benchmark(*it); \ - auto results = gsBenchmark::run(nthreads, *riter++, benchmark, _metric); \ - std::string meminfo; \ - uint64_t memsize = benchmark.size(); \ - if (memsize<1024) \ - meminfo = util::to_string(memsize)+" B"; \ - else if (memsize<1024*1024) \ - meminfo = util::to_string(memsize/1024)+" KB"; \ - else if (memsize<1024*1024*1024) \ - meminfo = util::to_string(memsize/(1024*1024))+" MB"; \ - else \ - meminfo = util::to_string(memsize/(1024*1024*1024))+" GB"; \ - bmark->add(_label, meminfo, results); \ - } catch(...) { gsInfo << "[failed!]"; } \ - gsInfo << "\n"; \ - } -//! [Implement benchmark macro] +//! [Implement test creator] +template +void create_test(const std::string& label, + const Iterator& sizes, + const std::vector& nruns, + const std::vector& nthreads, + gsBenchmark& benchmark) +{ + gsInfo << "=== " << Test::name() << "\n"; + auto bmark = benchmark.add(label, Test::name()); + auto riter = nruns.begin(); + for (auto it : sizes) { + gsInfo << "... " << util::to_string(it) << "(" << *riter << ")"<< std::flush; + try { + Test test(it); + auto results = gsBenchmark::run(nthreads, *riter++, test, Test::metric()); + std::string meminfo; + uint64_t memsize = test.size(); + if (memsize<1024) + meminfo = util::to_string(memsize)+" B"; + else if (memsize<1024*1024) + meminfo = util::to_string(memsize/1024)+" KB"; + else if (memsize<1024*1024*1024) + meminfo = util::to_string(memsize/(1024*1024))+" MB"; + else + meminfo = util::to_string(memsize/(1024*1024*1024))+" GB"; + bmark->add(label, meminfo, results); + } catch(...) 
{ gsInfo << "[failed!]"; } + gsInfo << "\n"; + } +} + +template +void create_test(const std::string& label, + const util::zip_helper& sizes, + const std::vector& nruns, + const std::vector& nthreads, + gsBenchmark& benchmark) +{ + gsInfo << "=== " << Test::name() << "\n"; + auto bmark = benchmark.add(label, Test::name()); + auto riter = nruns.begin(); + for (auto it : sizes) { + gsInfo << "... " << util::to_string(it) << "(" << *riter << ")"<< std::flush; + try { + Test test(it); + auto results = gsBenchmark::run(nthreads, *riter++, test, Test::metric()); + std::string meminfo; + uint64_t memsize = test.size(); + if (memsize<1024) + meminfo = util::to_string(memsize)+" B"; + else if (memsize<1024*1024) + meminfo = util::to_string(memsize/1024)+" KB"; + else if (memsize<1024*1024*1024) + meminfo = util::to_string(memsize/(1024*1024))+" MB"; + else + meminfo = util::to_string(memsize/(1024*1024*1024))+" GB"; + bmark->add(label, meminfo, results); + } catch(...) { gsInfo << "[failed!]"; } + gsInfo << "\n"; + } +} +//! [Implement test creator] //! [Implement memory safeguard] template class memory_safeguard { public: - memory_safeguard(index_t n) + template + memory_safeguard(Args... args) { - if (T::size(n) > gsSysInfo::getMemoryInBytes()) + if (T::size(args...) > gsSysInfo::getMemoryInBytes()) GISMO_ERROR("Insufficient memory"); } }; @@ -110,6 +149,11 @@ class benchmark_c_array_memcopy { return "Memory copy (native C array)"; } + + static constexpr gismo::metric metric() + { + return metric::bandwidth_gb_sec; + } }; //! [Implement benchmark native C array memcopy] @@ -171,6 +215,11 @@ class benchmark_c_array_dotproduct { return "Dot-product (native C array)"; } + + static constexpr gismo::metric metric() + { + return metric::bandwidth_gb_sec; + } }; //! 
[Implement benchmark native C array dot-product] @@ -233,6 +282,11 @@ class benchmark_c_array_axpy { return "AXPY (native C array)"; } + + static constexpr gismo::metric metric() + { + return metric::bandwidth_gb_sec; + } }; //! [Implement benchmark native C array AXPY] @@ -270,9 +324,10 @@ class benchmark_c_array_dense_matmul index_t operator()() { +#pragma omp parallel for for (index_t i=0; i class benchmark_poisson2d_visitor { private: memory_safeguard _msg; + int numPatches, numRefine, degree; gsMultiPatch geo; gsMultiBasis bases; gsConstantFunction f; @@ -508,44 +589,158 @@ class benchmark_poisson2d_visitor gsPoissonAssembler assembler; public: - benchmark_poisson2d_visitor(int npatches, int refine=0, int degree=1) - : _msg(0), geo(gsNurbsCreator<>::BSplineSquareGrid(npatches, npatches, 1.0)), + template + benchmark_poisson2d_visitor(std::tuple args) + : benchmark_poisson2d_visitor(std::get<0>(args), std::get<1>(args), std::get<2>(args)) + {} + + benchmark_poisson2d_visitor(int numPatches, int numRefine=0, int degree=1) + : _msg(numPatches, numRefine, degree), + numPatches(numPatches), numRefine(numRefine), degree(degree), + geo(gsNurbsCreator<>::BSplineSquareGrid(numPatches, numPatches, 1.0)), bases(geo), f(0.0, 0.0, 2) { // h-refine each basis - for (int i = 0; i < refine; ++i) + for (int i = 0; i < numRefine; ++i) bases.uniformRefine(); // k-refinement (set degree) for (std::size_t i = 0; i < bases.nBases(); ++ i) bases[i].setDegreePreservingMultiplicity(degree); + // create assembler assembler = gsPoissonAssembler(geo, bases, bcInfo, f, dirichlet::nitsche, iFace::glue); } index_t operator()() { assembler.assemble(); + gsInfo << numPatches << ":" << numRefine << ":" << degree << " = " << assembler.rhs().rows() << std::endl; + return sizeof(T) * (assembler.matrix().nonZeros() + assembler.rhs().rows()); + } + + constexpr uint64_t size() const + { + return size(numPatches, numRefine, degree); + } + + static constexpr uint64_t size(index_t numPatches, index_t 
numRefine, index_t degree) + { + // Estimated memory + // system matrix : 1.33 * ndofs * (2*p+1)^2 + // r.h.s. vector : ndofs + // + // The factor 1.33 is used because Eigen shows better performance + // if 33% more memory is allocated during the step-by-step assembly + return sizeof(T) * ( 1.33 * math::pow(2*degree+1,2) +1 ) * + (/* numPatches^2 * DOFs per patch */ + math::pow(numPatches,2) * math::pow((1< +class benchmark_poisson3d_visitor +{ +private: + memory_safeguard _msg; + int numPatches, numRefine, degree; + gsMultiPatch geo; + gsMultiBasis bases; + gsConstantFunction f; + gsBoundaryConditions bcInfo; + gsPoissonAssembler assembler; +public: + template + benchmark_poisson3d_visitor(std::tuple args) + : benchmark_poisson3d_visitor(std::get<0>(args), std::get<1>(args), std::get<2>(args)) + {} + + benchmark_poisson3d_visitor(int numPatches, int numRefine=0, int degree=1) + : _msg(numPatches, numRefine, degree), + numPatches(numPatches), numRefine(numRefine), degree(degree), + geo(gsNurbsCreator<>::BSplineCubeGrid(numPatches, numPatches, numPatches, 1.0)), + bases(geo), f(0.0, 0.0, 0.0, 3) + { + // h-refine each basis + for (int i = 0; i < numRefine; ++i) + bases.uniformRefine(); + + // k-refinement (set degree) + for (std::size_t i = 0; i < bases.nBases(); ++ i) + bases[i].setDegreePreservingMultiplicity(degree); + + // create assembler + assembler = gsPoissonAssembler(geo, bases, bcInfo, f, dirichlet::nitsche, iFace::glue); + } + + index_t operator()() + { + assembler.assemble(); return sizeof(T) * assembler.numDofs(); } constexpr uint64_t size() const { - return size(0); + return size(numPatches, numRefine, degree); } - static constexpr uint64_t size(index_t n) + static constexpr uint64_t size(index_t numPatches, index_t numRefine, index_t degree) { - return sizeof(T); + // Estimated memory + // system matrix : 1.33 * ndofs * (2*p+1)^3 + // r.h.s. 
vector : ndofs + // + // The factor 1.33 is used because Eigen shows better performance + // if 33% more memory is allocated during the step-by-step assembly + return sizeof(T) * (numPatches * ((1< +std::vector make_vector(T value, std::size_t size) +{ + std::vector v; + for (std::size_t i=0; i benchmarks, nruns, nthreads, msizes, vsizes; - real_t nrunsfactor = 1.5; - real_t msizesfactor = 10; - real_t vsizesfactor = 10; - index_t nrunsmax = 50; - index_t nrunsmin = 1; + std::vector benchmarks, nruns, nthreads, asizes, msizes, vsizes; + index_t asizesmin = 1; + index_t asizesmax = 8; index_t msizesmin = 10; + index_t nrunsmin = 1; + index_t nrunsmax = 100; index_t vsizesmin = 100; + real_t msizesfactor = 2; + real_t nrunsfactor = 1.5; + real_t vsizesfactor = 4; index_t msizesmax = (index_t) std::min((real_t)std::numeric_limits::max(), std::sqrt((real_t)(0.8) * sizeof(real_t)*gsSysInfo::getMemoryInBytes())); index_t vsizesmax = (index_t) std::min((real_t)std::numeric_limits::max(), @@ -572,12 +769,15 @@ int main(int argc, char *argv[]) cmd.addReal("M", "msizesfactor", "Growth factor for the sequence of msizes (only used if '-m' is not given)", msizesfactor); cmd.addReal("V", "vsizesfactor", "Growth factor for the sequence of vsizes (only used if '-v' is not given)", vsizesfactor); cmd.addReal("R", "runsfactor", "Growth factor for the sequence of runs (only used if '-r' is not given)", nrunsfactor); + cmd.addInt("", "asizesmax", "Maximum number of refinements (patch,refine,degree) in assembly benchmarks (only used if '-a' is not given)", asizesmax); + cmd.addInt("", "asizesmin", "Mminimum number of refinements (patch,refine,degree) in assembly benchmarks (only used if '-a' is not given)", asizesmin); cmd.addInt("", "msizesmax", "Maximum number of unknowns in matrix/vector benchmarks (only used if '-m' is not given)", msizesmax); cmd.addInt("", "msizesmin", "Minimum number of unknowns in matrix/vector benchmarks (only used if '-m'is not given)", msizesmin); 
cmd.addInt("", "vsizesmax", "Maximum number of unknowns in vector benchmarks (only used if '-v' is not given)", vsizesmax); cmd.addInt("", "vsizesmin", "Mminimum number of unknowns in vector benchmarks (only used if '-v' is not given)", vsizesmin); cmd.addInt("", "runsmax", "Maximum number of runs (only used if '-r' is not given)", nrunsmax); cmd.addInt("", "runsmin", "Mminimum number of runs (only used if '-r' is not given)", nrunsmin); + cmd.addMultiInt("a", "asizes", "Number of refinements (patch,refine,degree) in assembly benchmarks (auto-generated if not given)", asizes); cmd.addMultiInt("b", "benchmarks", "List of benchmarks to be run", benchmarks); cmd.addMultiInt("m", "msizes", "Number of unknowns in matrix/vector benchmarks (auto-generated if not given)", msizes); cmd.addMultiInt("r", "runs", "Number of runs over which the results are averaged (auto-generated if not given)", nruns); @@ -585,39 +785,46 @@ int main(int argc, char *argv[]) cmd.addMultiInt("v", "vsizes", "Number of unknowns in vector benchmarks (auto-generated if not given)", vsizes); cmd.addString("o", "output", "Name of the output file", fn); cmd.addSwitch("list", "List all benchmarks and exit", list); - + try { cmd.getValues(argc,argv); } catch (int rv) { return rv; } //! [Parse command line] //! 
[List benchmarks and exit] if (list) { gsInfo << "\nThe following benchmarks are available:\n" - << "#1: " << benchmark_c_array_memcopy::name() << "\n" - << "#2: " << benchmark_eigen_memcopy::name() << "\n" - << "#3: " << benchmark_c_array_dotproduct::name() << "\n" - << "#4: " << benchmark_eigen_dotproduct::name() << "\n" - << "#5: " << benchmark_c_array_axpy::name() << "\n" - << "#6: " << benchmark_eigen_axpy::name() << "\n" - << "#7: " << benchmark_c_array_dense_matmul::name() << "\n" - << "#8: " << benchmark_eigen_dense_matmul::name() << "\n" - << "#9: " << benchmark_poisson2d_visitor::name() << "\n"; + << "#01: " << benchmark_c_array_memcopy::name() << "\n" + << "#02: " << benchmark_eigen_memcopy::name() << "\n" + << "#03: " << benchmark_c_array_dotproduct::name() << "\n" + << "#04: " << benchmark_eigen_dotproduct::name() << "\n" + << "#05: " << benchmark_c_array_axpy::name() << "\n" + << "#06: " << benchmark_eigen_axpy::name() << "\n" + << "#07: " << benchmark_c_array_dense_matmul::name() << "\n" + << "#08: " << benchmark_eigen_dense_matmul::name() << "\n" + << "#09: " << benchmark_poisson2d_visitor::name() << "\n" + << "#10: " << benchmark_poisson3d_visitor::name() << "\n"; return EXIT_SUCCESS; } //! [List benchmarks and exit] - + //! [Default configuration] - // If empty fill with all benchmarks 1, ..., 5 + // If empty fill with all benchmarks 1, 2, ... 
if (benchmarks.empty()) { - for(index_t i=1; i<=9; ++i) + for(index_t i=1; i<=8; ++i) benchmarks.push_back(i); } - + // If empty fill with 1, 2, 4, ..., maximum number of OpenMP threads if (nthreads.empty()) { for(index_t i=1; i<=omp_get_max_threads(); i*=2) nthreads.push_back(i); } + // If empty fill with asizesmin, ..., asizesmax + if (asizes.empty()) { + for(index_t i=asizesmin; i > + ("memcopyCarray", vsizes, nruns, nthreads, benchmark); break; } - + case (2): { // Benchmark: memcopy gsVector - CREATE_BENCHMARK(benchmark_eigen_memcopy, "memcopyEigen", - vsizes, metric::bandwidth_gb_sec); + create_test > + ("memcopyEigen", vsizes, nruns, nthreads, benchmark); break; } case (3): { // Benchmark: dot-product native C array - CREATE_BENCHMARK(benchmark_c_array_dotproduct, "dotproductCarray", - vsizes, metric::bandwidth_gb_sec); + create_test > + ("dotproductCarray", vsizes, nruns, nthreads, benchmark); break; } case (4): { // Benchmark: dot-product gsVector - CREATE_BENCHMARK(benchmark_eigen_dotproduct, "dotproductEigen", - vsizes, metric::bandwidth_gb_sec); + create_test > + ("dotproductEigen", vsizes, nruns, nthreads, benchmark); break; } case (5): { // Benchmark: axpy native C array - CREATE_BENCHMARK(benchmark_c_array_axpy, "axpyCarray", - vsizes, metric::bandwidth_gb_sec); + create_test > + ("axpyCarray", vsizes, nruns, nthreads, benchmark); break; } case (6): { // Benchmark: axpy gsVector - CREATE_BENCHMARK(benchmark_eigen_axpy, "axpyEigen", - vsizes, metric::bandwidth_gb_sec); + create_test > + ("axpyEigen", vsizes, nruns, nthreads, benchmark); break; } case (7): { // Benchmark: dense matrix-vector multiplication native C array - CREATE_BENCHMARK(benchmark_c_array_dense_matmul, "densematmulCarray", - msizes, metric::bandwidth_gb_sec); + create_test > + ("densematmulCarray", msizes, nruns, nthreads, benchmark); break; } case (8): { // Benchmark: dense matrix-vector multiplication gsMatrix/gsVector - CREATE_BENCHMARK(benchmark_eigen_dense_matmul, 
"densematmulEigen", - msizes, metric::bandwidth_gb_sec); + create_test > + ("densematmulEigen", msizes, nruns, nthreads, benchmark); break; } case (9): { - // Benchmark: visitor-based Poisson 2D assembly - CREATE_BENCHMARK(benchmark_poisson2d_visitor, "assemblerVisitor", - vsizes, metric::bandwidth_gb_sec); + + std::vector a = {0,3};//,4,7,8,5}; + std::vector b = {32,16};//,8,4,2,1}; + + // Benchmark: visitor-based Poisson 2d assembler + create_test > + ("assemblerVisitor", util::zip(b, + //make_vector(index_t(4), asizes.size()), + a, + make_vector(index_t(5), a.size())), + nruns, nthreads, benchmark); + break; + } + + case (10): { + // Benchmark: visitor-based Poisson 3d assembler + create_test > + ("assemblerVisitor", vsizes, nruns, nthreads, benchmark); break; } - default: GISMO_ERROR("Invalid benchmark"); } From 92ace9e049989addc430c580d45d9f50f6cd4a78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Fri, 17 Dec 2021 20:33:39 +0100 Subject: [PATCH 097/174] small fixes --- src/gsCore/gsDebug.h | 1 + src/gsIO/gsBenchmark.cpp | 5 ++++- src/gsIO/gsBenchmark.h | 10 +++++----- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/gsCore/gsDebug.h b/src/gsCore/gsDebug.h index 01115132d8..93048a72ec 100644 --- a/src/gsCore/gsDebug.h +++ b/src/gsCore/gsDebug.h @@ -238,6 +238,7 @@ static const int gismo_set_abort_behavior = _set_abort_behavior( // typedef locally defined but not used [-Wunused-local-typedefs] #if ( __GNUC__>4 || (__GNUC__==4 && __GNUC_MINOR__>7) ) #pragma GCC diagnostic ignored "-Wunused-local-typedefs" +#pragma GCC diagnostic ignored "-Wclass-memaccess" #endif #if (__cplusplus < 201703L && __GNUC__>6) diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index 95d067525a..061a48dd4a 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -111,7 +111,10 @@ namespace gismo } } - os << "title={" << title << "},\n" + os << "title={" << title + << " [real\\_t:" << util::type::name() + << ", 
index\\_t:" << util::type::name() + << ", short\\_t:" << util::type::name()<< "},\n" << "]\n"; for (auto rit=results.cbegin()+1; rit!=results.cend(); ++rit) diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index cf46e8528f..d0ba3a655a 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -50,7 +50,7 @@ class GISMO_EXPORT gsBenchmark /** * Benchmark result */ -typedef std::array Result; +typedef std::array Result; /** * Benchmark result set class @@ -152,7 +152,7 @@ typedef std::array Result; { gsStopwatch stopwatch; uint64_t benchmark_result(0); - double benchmark_metric, benchmark_runtime; + real_t benchmark_metric, benchmark_runtime; std::vector results; @@ -169,7 +169,7 @@ typedef std::array Result; } stopwatch.stop(); - benchmark_runtime = stopwatch.elapsed()/(double)nruns; + benchmark_runtime = stopwatch.elapsed()/(real_t)nruns; switch(metric & ~metric::speedup) { case metric::bandwidth_kb_sec: case metric::perf_kflop_sec: @@ -192,10 +192,10 @@ typedef std::array Result; } Result res; - res[0]= static_cast(*it); // number of OpenMP threads + res[0]= static_cast(*it); // number of OpenMP threads res[1]= benchmark_runtime; // averaged elapsed time in seconds res[2]= benchmark_metric; // averaged benchmark metric - res[3]= (double)metric; // benchmark metric + res[3]= (real_t)metric; // benchmark metric results.push_back( give(res) ); } } catch(...) 
{} From c8ecddc283fbfb67d311ed28deb5421db2c9f1cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Fri, 17 Dec 2021 20:33:53 +0100 Subject: [PATCH 098/174] small fixes --- examples/performance_benchmark.cpp | 48 +++++++++++++++--------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index b7c4c55f12..2001d5c497 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -122,7 +122,7 @@ class benchmark_c_array_memcopy delete[] m_y; } - index_t operator()() + uint64_t operator()() { #pragma omp parallel for simd for (index_t i=0; i(geo, bases, bcInfo, f, dirichlet::nitsche, iFace::glue); } - index_t operator()() + uint64_t operator()() { assembler.assemble(); gsInfo << numPatches << ":" << numRefine << ":" << degree << " = " << assembler.rhs().rows() << std::endl; @@ -632,7 +632,7 @@ class benchmark_poisson2d_visitor // // The factor 1.33 is used because Eigen shows better performance // if 33% more memory is allocated during the step-by-step assembly - return sizeof(T) * ( 1.33 * math::pow(2*degree+1,2) +1 ) * + return sizeof(T) * ( 1.33 * math::pow(2*degree+1,2) + 1 ) * (/* numPatches^2 * DOFs per patch */ math::pow(numPatches,2) * math::pow((1<(geo, bases, bcInfo, f, dirichlet::nitsche, iFace::glue); } - index_t operator()() + uint64_t operator()() { assembler.assemble(); return sizeof(T) * assembler.numDofs(); From 952c1cb7e4aa3bb71b451b1915443687228d382f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Fri, 17 Dec 2021 20:39:45 +0100 Subject: [PATCH 099/174] small fixes --- src/gsIO/gsBenchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index 061a48dd4a..78cd715297 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -114,7 +114,7 @@ namespace gismo os << "title={" << title << " [real\\_t:" << 
util::type::name() << ", index\\_t:" << util::type::name() - << ", short\\_t:" << util::type::name()<< "},\n" + << ", short\\_t:" << util::type::name()<< "]},\n" << "]\n"; for (auto rit=results.cbegin()+1; rit!=results.cend(); ++rit) From ff4f3fa4482ebd87c683d4beea7d4475e25e81d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Sun, 19 Dec 2021 09:41:01 +0100 Subject: [PATCH 100/174] math::max and math::min with an arbitrary number of arguments --- src/gsCore/gsMath.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/gsCore/gsMath.h b/src/gsCore/gsMath.h index 87bfbeba07..1b96048c93 100644 --- a/src/gsCore/gsMath.h +++ b/src/gsCore/gsMath.h @@ -430,6 +430,16 @@ bool almostEqual(const T a, const T b) // static const double _2_pi = 0.636619772367581343076; // static const double _180_pi = 57.295779513082320876798; +// Maximum over three or more arguments +template +typename std::common_type::type max(const T a, const T b, const T c, const Ts... args) +{ return math::max(a, math::max(b,c,args...)); } + +// Minimum over three or more arguments +template +typename std::common_type::type min(const T a, const T b, const T c, const Ts... 
args) +{ return math::min(a, math::min(b,c,args...)); } + } //end namespace math /** From ef8e6598dddd5566a2139fa421844d67872ebdd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Sun, 19 Dec 2021 09:42:10 +0100 Subject: [PATCH 101/174] GISMO_HAS_OPENMPxy with xy being the OpenMP version number 2.5-5.2 --- src/gsParallel/gsOpenMP.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/gsParallel/gsOpenMP.h b/src/gsParallel/gsOpenMP.h index b584de760c..8f73d7a671 100644 --- a/src/gsParallel/gsOpenMP.h +++ b/src/gsParallel/gsOpenMP.h @@ -15,6 +15,24 @@ #ifdef _OPENMP +#ifcc _OPENMP >= 202111 +#define GISMO_HAS_OPENMP52 +#elif _OPENMP >= 202011 +#define GISMO_HAS_OPENMP51 +#elif _OPENMP >= 201811 +#define GISMO_HAS_OPENMP50 +#elif _OPENMP >= 201511 +#define GISMO_HAS_OPENMP45 +#elif _OPENMP >= 201307 +#define GISMO_HAS_OPENMP40 +#elif _OPENMP >= 201107 +#define GISMO_HAS_OPENMP31 +#elif _OPENMP >= 200805 +#define GISMO_HAS_OPENMP30 +#elif _OPENMP >= 200505 +#define GISMO_HAS_OPENMP25 +#endif + #include #else From dc7c1eb97916e977294909833f79bad422cdf656 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Sun, 19 Dec 2021 09:42:49 +0100 Subject: [PATCH 102/174] small fixes --- examples/performance_benchmark.cpp | 304 +++++++++++++++++++++++------ 1 file changed, 240 insertions(+), 64 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 2001d5c497..590713a5b4 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -23,10 +23,11 @@ void create_test(const std::string& label, const Iterator& sizes, const std::vector& nruns, const std::vector& nthreads, - gsBenchmark& benchmark) + gsBenchmark& benchmark, + const std::string& extra_name="") { - gsInfo << "=== " << Test::name() << "\n"; - auto bmark = benchmark.add(label, Test::name()); + gsInfo << "=== " << Test::name() << extra_name << "\n"; + auto bmark = benchmark.add(label, 
Test::name()+extra_name); auto riter = nruns.begin(); for (auto it : sizes) { gsInfo << "... " << util::to_string(it) << "(" << *riter << ")"<< std::flush; @@ -54,10 +55,11 @@ void create_test(const std::string& label, const util::zip_helper& sizes, const std::vector& nruns, const std::vector& nthreads, - gsBenchmark& benchmark) + gsBenchmark& benchmark, + const std::string& extra_name="") { - gsInfo << "=== " << Test::name() << "\n"; - auto bmark = benchmark.add(label, Test::name()); + gsInfo << "=== " << Test::name() << extra_name << "\n"; + auto bmark = benchmark.add(label, Test::name()+extra_name); auto riter = nruns.begin(); for (auto it : sizes) { gsInfo << "... " << util::to_string(it) << "(" << *riter << ")"<< std::flush; @@ -585,7 +587,7 @@ class benchmark_poisson2d_visitor gsMultiPatch geo; gsMultiBasis bases; gsConstantFunction f; - gsBoundaryConditions bcInfo; + gsBoundaryConditions bc; gsPoissonAssembler assembler; public: @@ -609,13 +611,12 @@ class benchmark_poisson2d_visitor bases[i].setDegreePreservingMultiplicity(degree); // create assembler - assembler = gsPoissonAssembler(geo, bases, bcInfo, f, dirichlet::nitsche, iFace::glue); + assembler = gsPoissonAssembler(geo, bases, bc, f, dirichlet::nitsche, iFace::glue); } uint64_t operator()() { assembler.assemble(); - gsInfo << numPatches << ":" << numRefine << ":" << degree << " = " << assembler.rhs().rows() << std::endl; return sizeof(T) * (assembler.matrix().nonZeros() + assembler.rhs().rows()); } @@ -648,7 +649,7 @@ class benchmark_poisson2d_visitor static constexpr gismo::metric metric() { - return (gismo::metric)(metric::runtime_sec + metric::speedup); + return (gismo::metric)(metric::runtime_sec + 0*metric::speedup); } }; //! 
[Implement benchmark Poisson 2d visitor] @@ -666,7 +667,7 @@ class benchmark_poisson3d_visitor gsMultiPatch geo; gsMultiBasis bases; gsConstantFunction f; - gsBoundaryConditions bcInfo; + gsBoundaryConditions bc; gsPoissonAssembler assembler; public: @@ -690,13 +691,13 @@ class benchmark_poisson3d_visitor bases[i].setDegreePreservingMultiplicity(degree); // create assembler - assembler = gsPoissonAssembler(geo, bases, bcInfo, f, dirichlet::nitsche, iFace::glue); + assembler = gsPoissonAssembler(geo, bases, bc, f, dirichlet::nitsche, iFace::glue); } uint64_t operator()() { assembler.assemble(); - return sizeof(T) * assembler.numDofs(); + return sizeof(T) * (assembler.matrix().nonZeros() + assembler.rhs().rows()); } constexpr uint64_t size() const @@ -733,6 +734,116 @@ class benchmark_poisson3d_visitor }; //! [Implement benchmark Poisson 3d visitor] +//! [Implement benchmark Poisson 2d expression assembler] +/** + * Benchmark: Expression assembler-based Poisson 2d + */ +template +class benchmark_poisson2d_expression_assembler +{ +private: + memory_safeguard _msg; + int numPatches, numRefine, degree; + gsMultiPatch geo; + gsMultiBasis bases; + gsBoundaryConditions bc; + + gsExprAssembler A; + typename gsExprAssembler<>::geometryMap G; + typename gsExprAssembler<>::space u; + + gsFunctionExpr f; + expr::gsComposition ff; + +public: + template + benchmark_poisson2d_expression_assembler(std::tuple args) + : benchmark_poisson2d_expression_assembler(std::get<0>(args), std::get<1>(args), std::get<2>(args)) + {} + + benchmark_poisson2d_expression_assembler(int numPatches, int numRefine=0, int degree=1) + : _msg(numPatches, numRefine, degree), + numPatches(numPatches), numRefine(numRefine), degree(degree), + geo(gsNurbsCreator<>::BSplineSquareGrid(numPatches, numPatches, 1.0)), + bases(geo, true), A(1,1), G(A.getMap(geo)), u(A.getSpace(bases)), + f("0.0", 2), ff(A.getCoeff(f, G)) + { + // h-refine each basis + for (int i = 0; i < numRefine; ++i) + bases.uniformRefine(); + + 
// k-refinement (set degree) + for (std::size_t i = 0; i < bases.nBases(); ++ i) + bases[i].setDegreePreservingMultiplicity(degree); + + // set the geometry map to boundary conditions + bc.setGeoMap(geo); + + // setup boundary conditions + u.setup(bc, dirichlet::l2Projection, 0); + + // set elements used for numerical integration + A.setIntegrationElements(bases); + + // initialize the system + A.initSystem(); + + // set the geometry map + //G = A.getMap(geo); + + // set the discretization space + //u = A.getSpace(bases); + + // set the source term + //auto ff = A.getCoeff(f, G); + } + + uint64_t operator()() + { + // Compute the system matrix and right-hand side + A.assemble( + igrad(u, G) * igrad(u, G).tr() * meas(G) //matrix + , + u * ff * meas(G) //rhs vector + ); + + return sizeof(T) * (A.matrix().nonZeros() + A.rhs().rows()); + } + + constexpr uint64_t size() const + { + return size(numPatches, numRefine, degree); + } + + static constexpr uint64_t size(index_t numPatches, index_t numRefine, index_t degree) + { + // Estimated memory + // system matrix : 1.33 * ndofs * (2*p+1)^2 + // r.h.s. 
vector : ndofs + // + // The factor 1.33 is used because Eigen shows better performance + // if 33% more memory is allocated during the step-by-step assembly + return sizeof(T) * ( 1.33 * math::pow(2*degree+1,2) + 1 ) * + (/* numPatches^2 * DOFs per patch */ + math::pow(numPatches,2) * math::pow((1< std::vector make_vector(T value, std::size_t size) { @@ -748,39 +859,46 @@ int main(int argc, char *argv[]) gsBenchmark benchmark; std::string fn; bool list=false; - std::vector benchmarks, nruns, nthreads, asizes, msizes, vsizes; - index_t asizesmin = 1; - index_t asizesmax = 8; - index_t msizesmin = 10; - index_t nrunsmin = 1; - index_t nrunsmax = 100; - index_t vsizesmin = 100; - real_t msizesfactor = 2; - real_t nrunsfactor = 1.5; - real_t vsizesfactor = 4; - index_t msizesmax = (index_t) std::min((real_t)std::numeric_limits::max(), + std::vector benchmarks, msizes, nruns, nthreads, patches, subdivides, vsizes; + index_t msizemin = 10; + index_t nrunsmax = 100; + index_t nrunsmin = 1; + index_t patchesmax = 128; + index_t patchesmin = 1; + index_t subdividemax = 10; + index_t subdividemin = 0; + index_t vsizemin = 100; + real_t patchesfactor = 2; + real_t msizesfactor = 2; + real_t nrunsfactor = 1.5; + real_t vsizesfactor = 4; + index_t msizemax = (index_t) math::min((real_t)std::numeric_limits::max(), std::sqrt((real_t)(0.8) * sizeof(real_t)*gsSysInfo::getMemoryInBytes())); - index_t vsizesmax = (index_t) std::min((real_t)std::numeric_limits::max(), + index_t vsizemax = (index_t) math::min((real_t)std::numeric_limits::max(), (real_t)(0.8) * sizeof(real_t)*gsSysInfo::getMemoryInBytes()); gsCmdLine cmd("G+Smo performance benchmark."); cmd.printVersion(); cmd.addReal("M", "msizesfactor", "Growth factor for the sequence of msizes (only used if '-m' is not given)", msizesfactor); - cmd.addReal("V", "vsizesfactor", "Growth factor for the sequence of vsizes (only used if '-v' is not given)", vsizesfactor); + cmd.addReal("P", "patchesfactor", "Growth factor for the 
sequence of patches (only used if '-p' is not given)", patchesfactor); cmd.addReal("R", "runsfactor", "Growth factor for the sequence of runs (only used if '-r' is not given)", nrunsfactor); - cmd.addInt("", "asizesmax", "Maximum number of refinements (patch,refine,degree) in assembly benchmarks (only used if '-a' is not given)", asizesmax); - cmd.addInt("", "asizesmin", "Mminimum number of refinements (patch,refine,degree) in assembly benchmarks (only used if '-a' is not given)", asizesmin); - cmd.addInt("", "msizesmax", "Maximum number of unknowns in matrix/vector benchmarks (only used if '-m' is not given)", msizesmax); - cmd.addInt("", "msizesmin", "Minimum number of unknowns in matrix/vector benchmarks (only used if '-m'is not given)", msizesmin); - cmd.addInt("", "vsizesmax", "Maximum number of unknowns in vector benchmarks (only used if '-v' is not given)", vsizesmax); - cmd.addInt("", "vsizesmin", "Mminimum number of unknowns in vector benchmarks (only used if '-v' is not given)", vsizesmin); + cmd.addReal("V", "vsizesfactor", "Growth factor for the sequence of vsizes (only used if '-v' is not given)", vsizesfactor); + cmd.addInt("", "msizemax", "Maximum number of unknowns in matrix/vector benchmarks (only used if '-m' is not given)", msizemax); + cmd.addInt("", "msizemin", "Minimum number of unknowns in matrix/vector benchmarks (only used if '-m'is not given)", msizemin); + cmd.addInt("", "patchesmax", "Maximum number of patches in assembly benchmarks (only used if '-p' is not given)", patchesmax); + cmd.addInt("", "patchesmin", "Minimum number of patches in assembly benchmarks (only used if '-p' is not given)", patchesmin); cmd.addInt("", "runsmax", "Maximum number of runs (only used if '-r' is not given)", nrunsmax); cmd.addInt("", "runsmin", "Mminimum number of runs (only used if '-r' is not given)", nrunsmin); - cmd.addMultiInt("a", "asizes", "Number of refinements (patch,refine,degree) in assembly benchmarks (auto-generated if not given)", asizes); + 
cmd.addInt("", "subdividemax", "Maximum number of subdivisions (h-refinement) in assembly benchmarks (only used if '-r' is not given)", subdividemax); + cmd.addInt("", "subdividemin", "Minimum number of subdivisions (h-refinement) in assembly benchmarks (only used if '-r' is not given)", subdividemin); + cmd.addInt("", "vsizemax", "Maximum number of unknowns in vector benchmarks (only used if '-v' is not given)", vsizemax); + cmd.addInt("", "vsizemin", "Mminimum number of unknowns in vector benchmarks (only used if '-v' is not given)", vsizemin); cmd.addMultiInt("b", "benchmarks", "List of benchmarks to be run", benchmarks); cmd.addMultiInt("m", "msizes", "Number of unknowns in matrix/vector benchmarks (auto-generated if not given)", msizes); + cmd.addMultiInt("p", "patches", "Number of patches in assembly benchmarks (auto-generated if not given)", patches); cmd.addMultiInt("r", "runs", "Number of runs over which the results are averaged (auto-generated if not given)", nruns); + cmd.addMultiInt("s", "subdivide", "Number of subdivisions (h-refinement) in assembly benchmarks (auto-generated if not given)", subdivides); cmd.addMultiInt("t", "threads", "Number of OpenMP threads to be used for the benchmark (auto-generated if not given)", nthreads); cmd.addMultiInt("v", "vsizes", "Number of unknowns in vector benchmarks (auto-generated if not given)", vsizes); cmd.addString("o", "output", "Name of the output file", fn); @@ -800,8 +918,19 @@ int main(int argc, char *argv[]) << "#06: " << benchmark_eigen_axpy::name() << "\n" << "#07: " << benchmark_c_array_dense_matmul::name() << "\n" << "#08: " << benchmark_eigen_dense_matmul::name() << "\n" - << "#09: " << benchmark_poisson2d_visitor::name() << "\n" - << "#10: " << benchmark_poisson3d_visitor::name() << "\n"; + << "#09: " << benchmark_poisson2d_visitor::name() + << " with increasing number of patches" << "\n" + << "#10: " << benchmark_poisson2d_visitor::name() + << " with increasing number of subdivisions" << "\n" + << 
"#11: " << benchmark_poisson3d_visitor::name() + << " with increasing number of patches" << "\n" + << "#12: " << benchmark_poisson3d_visitor::name() + << " with increasing number of subdivisions" << "\n" + << "#13: " << benchmark_poisson2d_expression_assembler::name() + << " with increasing number of patches" << "\n" + << "#14: " << benchmark_poisson2d_expression_assembler::name() + << " with increasing number of subdivisions" << "\n"; + return EXIT_SUCCESS; } //! [List benchmarks and exit] @@ -812,35 +941,41 @@ int main(int argc, char *argv[]) for(index_t i=1; i<=8; ++i) benchmarks.push_back(i); } - + // If empty fill with 1, 2, 4, ..., maximum number of OpenMP threads if (nthreads.empty()) { for(index_t i=1; i<=omp_get_max_threads(); i*=2) nthreads.push_back(i); } - // If empty fill with asizesmin, ..., asizesmax - if (asizes.empty()) { - for(index_t i=asizesmin; i::max()) / (msizesfactor*msizesfactor)) + if (i<=math::min(msizemax, std::numeric_limits::max()) / (msizesfactor*msizesfactor)) i*=msizesfactor; else break; } } - // If empty fill with vsizesmin*vsizesfactor^k, k=0, 1, 2, ..., vsizesmax + // If empty fill with patchesmin, ..., patchesmax + if (patches.empty()) { + for(index_t i=patchesmin; i<=patchesmax; i*=patchesfactor) + patches.push_back(i); + } + + // If empty fill with subdividemin, ..., subdividemax + if (subdivides.empty()) { + for(index_t i=subdividemin; i::max()) / vsizesfactor) + if (i<=math::min(vsizemax, std::numeric_limits::max()) / vsizesfactor) i*=vsizesfactor; else break; @@ -850,13 +985,14 @@ int main(int argc, char *argv[]) // If empty fill with nrunsmax/nrunsfactor^k, k=0, 1, 2, ..., nrunsmin if (nruns.empty()) { index_t k = nrunsmax; - for(index_t i=0; i<(index_t)std::max(msizes.size(),vsizes.size()); ++i) { + for(index_t i=0; i<(index_t)math::max(msizes.size(), patches.size(), + subdivides.size(), vsizes.size()); ++i) { nruns.push_back(k); - k = std::max(nrunsmin, (index_t)(k/nrunsfactor)); + k = math::max(nrunsmin, 
(index_t)(k/nrunsfactor)); } } - if (nruns.size() a = {0,3};//,4,7,8,5}; - std::vector b = {32,16};//,8,4,2,1}; - - // Benchmark: visitor-based Poisson 2d assembler + // Benchmark: visitor-based Poisson 2d assembler with increasing number of patches create_test > - ("assemblerVisitor", util::zip(b, - //make_vector(index_t(4), asizes.size()), - a, - make_vector(index_t(5), a.size())), - nruns, nthreads, benchmark); + ("assemblerVisitor", util::zip(patches, + make_vector(index_t(1), patches.size()), + make_vector(index_t(3), patches.size())), + nruns, nthreads, benchmark, " with increasing number of patches"); break; } - + case (10): { - // Benchmark: visitor-based Poisson 3d assembler + // Benchmark: visitor-based Poisson 2d assembler with increasing number of subdivisions + create_test > + ("assemblerVisitor", util::zip(make_vector(index_t(4), subdivides.size()), + subdivides, + make_vector(index_t(3), subdivides.size())), + nruns, nthreads, benchmark, " with increasing number of subdivisions"); + break; + } + + case (11): { + // Benchmark: visitor-based Poisson 3d assembler with increasing number of patches create_test > - ("assemblerVisitor", vsizes, nruns, nthreads, benchmark); + ("assemblerVisitor", util::zip(patches, + make_vector(index_t(0), patches.size()), + make_vector(index_t(1), patches.size())), + nruns, nthreads, benchmark, " with increasing number of patches"); break; } + + case (12): { + // Benchmark: visitor-based Poisson 3d assembler with increasing number of subdivisions + create_test > + ("assemblerVisitor", util::zip(make_vector(index_t(1), subdivides.size()), + subdivides, + make_vector(index_t(2), subdivides.size())), + nruns, nthreads, benchmark, " with increasing number of subdivisions"); + break; + } + + case (13): { + // Benchmark: expression assembler-based Poisson 2d assembler with increasing number of patches + create_test > + ("assemblerExpressionAssembler", util::zip(patches, + make_vector(index_t(1), patches.size()), + 
make_vector(index_t(3), patches.size())), + nruns, nthreads, benchmark, " with increasing number of patches"); + break; + } + + case (14): { + // Benchmark: expression assembler-based Poisson 2d assembler with increasing number of subdivision + create_test > + ("assemblerExpressionAssembler", util::zip(make_vector(index_t(4), subdivides.size()), + subdivides, + make_vector(index_t(3), subdivides.size())), + nruns, nthreads, benchmark, " with increasing number of subdivisions"); + break; + } + + default: GISMO_ERROR("Invalid benchmark"); } From 1dc64b48cc5e10452393d19b2a031136b385158e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Sun, 19 Dec 2021 09:44:40 +0100 Subject: [PATCH 103/174] fixed small bug --- src/gsParallel/gsOpenMP.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gsParallel/gsOpenMP.h b/src/gsParallel/gsOpenMP.h index 8f73d7a671..447adff76b 100644 --- a/src/gsParallel/gsOpenMP.h +++ b/src/gsParallel/gsOpenMP.h @@ -15,7 +15,7 @@ #ifdef _OPENMP -#ifcc _OPENMP >= 202111 +#ifdef _OPENMP >= 202111 #define GISMO_HAS_OPENMP52 #elif _OPENMP >= 202011 #define GISMO_HAS_OPENMP51 From 591661d25614cbd4a023b1e5bf15630d3087c373 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Mon, 20 Dec 2021 11:32:20 +0100 Subject: [PATCH 104/174] small fixes with GISMO_HAS_OPENMPxy --- src/gsParallel/gsOpenMP.h | 71 ++++++++++++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 16 deletions(-) diff --git a/src/gsParallel/gsOpenMP.h b/src/gsParallel/gsOpenMP.h index 447adff76b..d36f5bca87 100644 --- a/src/gsParallel/gsOpenMP.h +++ b/src/gsParallel/gsOpenMP.h @@ -15,28 +15,67 @@ #ifdef _OPENMP -#ifdef _OPENMP >= 202111 -#define GISMO_HAS_OPENMP52 -#elif _OPENMP >= 202011 -#define GISMO_HAS_OPENMP51 -#elif _OPENMP >= 201811 -#define GISMO_HAS_OPENMP50 -#elif _OPENMP >= 201511 -#define GISMO_HAS_OPENMP45 -#elif _OPENMP >= 201307 -#define GISMO_HAS_OPENMP40 -#elif _OPENMP >= 201107 -#define 
GISMO_HAS_OPENMP31 -#elif _OPENMP >= 200805 -#define GISMO_HAS_OPENMP30 -#elif _OPENMP >= 200505 -#define GISMO_HAS_OPENMP25 +#if _OPENMP >= 202111 +#define GISMO_HAS_OPENMP_52 1 +#else +#define GISMO_HAS_OPENMP_52 0 +#endif + +#if _OPENMP >= 202011 +#define GISMO_HAS_OPENMP_51 1 +#else +#define GISMO_HAS_OPENMP_51 0 +#endif + +#if _OPENMP >= 201811 +#define GISMO_HAS_OPENMP_50 1 +#else +#define GISMO_HAS_OPENMP_50 0 +#endif + +#if _OPENMP >= 201511 +#define GISMO_HAS_OPENMP_45 1 +#else +#define GISMO_HAS_OPENMP_45 0 +#endif + +#if _OPENMP >= 201307 +#define GISMO_HAS_OPENMP_40 1 +#else +#define GISMO_HAS_OPENMP_40 0 +#endif + +#if _OPENMP >= 201107 +#define GISMO_HAS_OPENMP_31 1 +#else +#define GISMO_HAS_OPENMP_31 0 +#endif + +#if _OPENMP >= 200805 +#define GISMO_HAS_OPENMP_30 1 +#else +#define GISMO_HAS_OPENMP_30 0 +#endif + +#if _OPENMP >= 200505 +#define GISMO_HAS_OPENMP_25 1 +#else +#define GISMO_HAS_OPENMP_25 0 #endif #include #else +#define GISMO_HAS_OPENMP_52 0 +#define GISMO_HAS_OPENMP_51 0 +#define GISMO_HAS_OPENMP_50 0 +#define GISMO_HAS_OPENMP_45 0 +#define GISMO_HAS_OPENMP_40 0 +#define GISMO_HAS_OPENMP_31 0 +#define GISMO_HAS_OPENMP_30 0 +#define GISMO_HAS_OPENMP_25 0 + #include void GISMO_EXPORT omp_set_num_threads(int num_threads); From c1f6f744ff444b924dc41c5fb9a77e8333891fb7 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Mon, 20 Dec 2021 11:35:46 +0100 Subject: [PATCH 105/174] small fixes --- examples/performance_benchmark.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 590713a5b4..02d99f085b 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -1061,8 +1061,8 @@ int main(int argc, char *argv[]) // Benchmark: visitor-based Poisson 2d assembler with increasing number of patches create_test > ("assemblerVisitor", util::zip(patches, - make_vector(index_t(1), patches.size()), - 
make_vector(index_t(3), patches.size())), + make_vector((index_t)1, patches.size()), + make_vector((index_t)3, patches.size())), nruns, nthreads, benchmark, " with increasing number of patches"); break; } @@ -1070,9 +1070,9 @@ int main(int argc, char *argv[]) case (10): { // Benchmark: visitor-based Poisson 2d assembler with increasing number of subdivisions create_test > - ("assemblerVisitor", util::zip(make_vector(index_t(4), subdivides.size()), + ("assemblerVisitor", util::zip(make_vector((index_t)4, subdivides.size()), subdivides, - make_vector(index_t(3), subdivides.size())), + make_vector((index_t)3, subdivides.size())), nruns, nthreads, benchmark, " with increasing number of subdivisions"); break; } @@ -1081,8 +1081,8 @@ int main(int argc, char *argv[]) // Benchmark: visitor-based Poisson 3d assembler with increasing number of patches create_test > ("assemblerVisitor", util::zip(patches, - make_vector(index_t(0), patches.size()), - make_vector(index_t(1), patches.size())), + make_vector((index_t)0, patches.size()), + make_vector((index_t)1, patches.size())), nruns, nthreads, benchmark, " with increasing number of patches"); break; } @@ -1090,9 +1090,9 @@ int main(int argc, char *argv[]) case (12): { // Benchmark: visitor-based Poisson 3d assembler with increasing number of subdivisions create_test > - ("assemblerVisitor", util::zip(make_vector(index_t(1), subdivides.size()), + ("assemblerVisitor", util::zip(make_vector((index_t)1, subdivides.size()), subdivides, - make_vector(index_t(2), subdivides.size())), + make_vector((index_t)2, subdivides.size())), nruns, nthreads, benchmark, " with increasing number of subdivisions"); break; } @@ -1101,8 +1101,8 @@ int main(int argc, char *argv[]) // Benchmark: expression assembler-based Poisson 2d assembler with increasing number of patches create_test > ("assemblerExpressionAssembler", util::zip(patches, - make_vector(index_t(1), patches.size()), - make_vector(index_t(3), patches.size())), + make_vector((index_t)1, 
patches.size()), + make_vector((index_t)3, patches.size())), nruns, nthreads, benchmark, " with increasing number of patches"); break; } @@ -1110,9 +1110,9 @@ int main(int argc, char *argv[]) case (14): { // Benchmark: expression assembler-based Poisson 2d assembler with increasing number of subdivision create_test > - ("assemblerExpressionAssembler", util::zip(make_vector(index_t(4), subdivides.size()), + ("assemblerExpressionAssembler", util::zip(make_vector((index_t)4, subdivides.size()), subdivides, - make_vector(index_t(3), subdivides.size())), + make_vector((index_t)3, subdivides.size())), nruns, nthreads, benchmark, " with increasing number of subdivisions"); break; } From 7c97ce17264d42722ff01deddcfc8bf2fe65daf9 Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Tue, 21 Dec 2021 08:22:10 -0500 Subject: [PATCH 106/174] small bug fix --- src/gsCore/gsSysInfo.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/gsCore/gsSysInfo.cpp b/src/gsCore/gsSysInfo.cpp index 42e0f26d9c..d3b188ecf3 100644 --- a/src/gsCore/gsSysInfo.cpp +++ b/src/gsCore/gsSysInfo.cpp @@ -654,7 +654,11 @@ namespace gismo char hostname[HOST_NAME_MAX + 1]; gethostname(hostname, HOST_NAME_MAX + 1); - return "Unknown-CPU ["+hostname+"]"; + std::string str = "Unknown-CPU ["; + str += hostname; + str += "]"; + + return str; # endif From 814d9adee34fac73dc75e1b0d7099af22174524f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 21 Dec 2021 14:24:37 +0100 Subject: [PATCH 107/174] Added support for Cray and Fujitsu --- external/Eigen/src/Core/util/Macros.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/external/Eigen/src/Core/util/Macros.h b/external/Eigen/src/Core/util/Macros.h index 986c3d44db..051cd64f9c 100644 --- a/external/Eigen/src/Core/util/Macros.h +++ b/external/Eigen/src/Core/util/Macros.h @@ -193,9 +193,23 @@ #define EIGEN_COMP_EMSCRIPTEN 0 #endif +/// \internal EIGEN_COMP_FCC set to FCC version if 
the compiler is Fujitsu Compiler +#if defined(__FUJITSU) + #define EIGEN_COMP_FCC (__FCC_major__*100+__FCC_minor__*10+__FCC_patchlevel__) +#else + #define EIGEN_COMP_FCC 0 +#endif + +/// \internal EIGEN_COMP_CRAY set to CARY version if the compiler is Cray Compiler +#if defined(__CRAYC) + #define EIGEN_COMP_CRAY (_RELEASE_MAJOR*100+_RELEASE_MINOR*10+_RELEASE_PATCHLEVEL) +#else + #define EIGEN_COMP_CRAY 0 +#endif + /// \internal EIGEN_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.) -#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN) +#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN || EIGEN_COMP_FCC) #define EIGEN_COMP_GNUC_STRICT 1 #else #define EIGEN_COMP_GNUC_STRICT 0 From 82c2c00bccfd41bca1860cf22e301b9ced02370e Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Tue, 21 Dec 2021 08:26:03 -0500 Subject: [PATCH 108/174] Fixed bugs in Fujitsu support --- src/gsCore/gsDebug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gsCore/gsDebug.h b/src/gsCore/gsDebug.h index 93048a72ec..38e65778da 100644 --- a/src/gsCore/gsDebug.h +++ b/src/gsCore/gsDebug.h @@ -234,7 +234,7 @@ static const int gismo_set_abort_behavior = _set_abort_behavior( #pragma clang diagnostic ignored "-Wconstant-logical-operand" #pragma clang diagnostic ignored "-Wbind-to-temporary-copy" -#elif defined __GNUC__ // major version >=4 +#elif defined __GNUC__ && !defined __FUJITSU // major version >=4 // typedef locally defined but not used [-Wunused-local-typedefs] #if ( __GNUC__>4 || (__GNUC__==4 && __GNUC_MINOR__>7) ) #pragma GCC diagnostic ignored "-Wunused-local-typedefs" From 22aaf99cd4d5ed406a8a08143e672b73812057de Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Tue, 21 Dec 2021 08:35:57 
-0500 Subject: [PATCH 109/174] Fixed small bugs in Fujitsu support --- external/Eigen/src/Core/util/DisableStupidWarnings.h | 2 +- external/Eigen/src/Core/util/ReenableStupidWarnings.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/external/Eigen/src/Core/util/DisableStupidWarnings.h b/external/Eigen/src/Core/util/DisableStupidWarnings.h index fe0cfec0bc..0bf08bd8b5 100755 --- a/external/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/external/Eigen/src/Core/util/DisableStupidWarnings.h @@ -53,7 +53,7 @@ #pragma clang diagnostic ignored "-Wc11-extensions" #endif -#elif defined __GNUC__ +#elif defined __GNUC__ && !defined __FUJITSU #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) #pragma GCC diagnostic push diff --git a/external/Eigen/src/Core/util/ReenableStupidWarnings.h b/external/Eigen/src/Core/util/ReenableStupidWarnings.h index 1ce6fd1b00..8e02ba4866 100644 --- a/external/Eigen/src/Core/util/ReenableStupidWarnings.h +++ b/external/Eigen/src/Core/util/ReenableStupidWarnings.h @@ -12,7 +12,7 @@ #pragma warning pop #elif defined __clang__ #pragma clang diagnostic pop - #elif defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) + #elif defined __GNUC__ && !defined __FUJITSU && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) #pragma GCC diagnostic pop #endif From e5558024f59440e6015768a16737636bdf889568 Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Tue, 21 Dec 2021 11:05:51 -0500 Subject: [PATCH 110/174] Fixed bug in Cray support --- external/Eigen/src/Core/arch/NEON/Complex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/Eigen/src/Core/arch/NEON/Complex.h b/external/Eigen/src/Core/arch/NEON/Complex.h index f40af7f87f..8439b4af6c 100644 --- a/external/Eigen/src/Core/arch/NEON/Complex.h +++ b/external/Eigen/src/Core/arch/NEON/Complex.h @@ -390,7 +390,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const 
Packet2cf& a) { #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG // See bug 1325, clang fails to call vld1q_u64. -#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML +#if EIGEN_COMP_CLANG || EIGEN_COMP_CRAY || EIGEN_COMP_CASTXML static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000}; #else const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 }; From f4949555bf73eea5885016192e2fa166794ec8e6 Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Tue, 21 Dec 2021 11:49:57 -0500 Subject: [PATCH 111/174] Fixed bug in Cray support --- external/Eigen/src/Core/util/Macros.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/external/Eigen/src/Core/util/Macros.h b/external/Eigen/src/Core/util/Macros.h index 051cd64f9c..385a497064 100644 --- a/external/Eigen/src/Core/util/Macros.h +++ b/external/Eigen/src/Core/util/Macros.h @@ -201,7 +201,7 @@ #endif /// \internal EIGEN_COMP_CRAY set to CARY version if the compiler is Cray Compiler -#if defined(__CRAYC) +#if defined(_CRAYC) #define EIGEN_COMP_CRAY (_RELEASE_MAJOR*100+_RELEASE_MINOR*10+_RELEASE_PATCHLEVEL) #else #define EIGEN_COMP_CRAY 0 @@ -946,7 +946,7 @@ #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE #endif -#if EIGEN_COMP_GNUC +#if EIGEN_COMP_GNUC && !defined(EIGEN_COMP_CRAY) #define EIGEN_DONT_INLINE __attribute__((noinline)) #elif EIGEN_COMP_MSVC #define EIGEN_DONT_INLINE __declspec(noinline) From 6047e7780662b0b35b9c4d16f07c23fc534072ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Sun, 26 Dec 2021 10:09:27 +0100 Subject: [PATCH 112/174] Made private classes public --- src/gsIO/gsBenchmark.cpp | 68 +++++++----- src/gsIO/gsBenchmark.h | 224 +++++++++++++++++++++------------------ 2 files changed, 159 insertions(+), 133 deletions(-) diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index 78cd715297..291690ccb5 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -19,8 +19,7 @@ namespace gismo { - - std::ostream 
&gsBenchmark::gsBenchmarkResultSet::print(std::ostream &os) const + std::ostream &gsBenchmarkResultSet::to_tikz(std::ostream &os) const { os << "\\pgfplotstableread[col sep=space]{\n" << label << "\n"; @@ -33,10 +32,10 @@ namespace gismo return os; } - std::ostream &gsBenchmark::gsBenchmarkSet::print(std::ostream &os) const + std::ostream &gsBenchmarkSet::to_tikz(std::ostream &os) const { for (auto it=results.cbegin(); it!=results.cend(); ++it) - (*it)->print(os); + (*it)->to_tikz(os); os << "\\begin{tikzpicture}\n" << "\\begin{axis}[\n" @@ -57,53 +56,53 @@ namespace gismo << "xtick=data,\n"; auto it = results.front()->get().cbegin(); - if ((metric)it->at(3) & metric::speedup) { - switch( (int)it->at(3) & ~metric::speedup ) { - case metric::bandwidth_kb_sec: - case metric::bandwidth_mb_sec: - case metric::bandwidth_gb_sec: - case metric::bandwidth_tb_sec: + if ((metric)it->at(3) & gismo::metric::speedup) { + switch( (int)it->at(3) & ~gismo::metric::speedup ) { + case gismo::metric::bandwidth_kb_sec: + case gismo::metric::bandwidth_mb_sec: + case gismo::metric::bandwidth_gb_sec: + case gismo::metric::bandwidth_tb_sec: os << "ylabel={Bandwidth [speedup]},\n"; break; - case metric::perf_kflop_sec: - case metric::perf_mflop_sec: - case metric::perf_gflop_sec: - case metric::perf_tflop_sec: + case gismo::metric::perf_kflop_sec: + case gismo::metric::perf_mflop_sec: + case gismo::metric::perf_gflop_sec: + case gismo::metric::perf_tflop_sec: os << "ylabel={Performance [speedup]},\n"; break; - case metric::runtime_sec: + case gismo::metric::runtime_sec: os << "ylabel={Runtime [speedup]},\n"; break; default: GISMO_ERROR("Unsupported metric"); } } else { - switch( (int)it->at(3) & ~metric::speedup ) { - case metric::bandwidth_kb_sec: + switch( (int)it->at(3) & ~gismo::metric::speedup ) { + case gismo::metric::bandwidth_kb_sec: os << "ylabel={Bandwidth in KB/s},\n"; break; - case metric::bandwidth_mb_sec: + case gismo::metric::bandwidth_mb_sec: os << "ylabel={Bandwidth in 
MB/s},\n"; break; - case metric::bandwidth_gb_sec: + case gismo::metric::bandwidth_gb_sec: os << "ylabel={Bandwidth in GB/s},\n"; break; - case metric::bandwidth_tb_sec: + case gismo::metric::bandwidth_tb_sec: os << "ylabel={Bandwidth in TB/s},\n"; break; - case metric::perf_kflop_sec: + case gismo::metric::perf_kflop_sec: os << "ylabel={Performance in kFLOP/s},\n"; break; - case metric::perf_mflop_sec: + case gismo::metric::perf_mflop_sec: os << "ylabel={Performance in mFLOP/s},\n"; break; - case metric::perf_gflop_sec: + case gismo::metric::perf_gflop_sec: os << "ylabel={Performance in gFLOP/s},\n"; break; - case metric::perf_tflop_sec: + case gismo::metric::perf_tflop_sec: os << "ylabel={Performance in tFLOP/s},\n"; break; - case metric::runtime_sec: + case gismo::metric::runtime_sec: os << "ylabel={Runtime in seconds},\n"; break; default: @@ -181,7 +180,7 @@ namespace gismo return os; } - std::ostream &gsBenchmark::print(std::ostream &os) const + std::ostream &gsBenchmark::to_tikz(std::ostream &os) const { os << "\\documentclass[tikz]{standalone}\n" << "\\usepackage{pgfplots}\n" @@ -199,10 +198,25 @@ namespace gismo << "\\usetikzlibrary{calc}\n"; for (auto it=benchmarks.cbegin(); it!=benchmarks.cend(); ++it) - (*it)->print(os); + (*it)->to_tikz(os); os << "\\end{document}\n"; return os; } + std::ostream &gsBenchmarkResultSet::print(std::ostream &os) const + { + return os; + } + + std::ostream &gsBenchmarkSet::print(std::ostream &os) const + { + return os; + } + + std::ostream &gsBenchmark::print(std::ostream &os) const + { + return os; + } + } // namespace gismo diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index d0ba3a655a..4b8bc2a533 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -18,115 +18,126 @@ namespace gismo { - /** - * Benchmark metrics - */ - enum metric { - speedup = 0x1, - bandwidth_kb_sec = 10, - bandwidth_mb_sec = 11, - bandwidth_gb_sec = 12, - bandwidth_tb_sec = 13, - perf_kflop_sec = 14, - perf_mflop_sec = 15, 
- perf_gflop_sec = 16, - perf_tflop_sec = 17, - runtime_sec = 18 - }; - + @brief Enumerator that defines the benchmark metrics. + + These definitions are used to control the output of the benchmark framework +*/ +enum metric : uint64_t { + speedup = 1<<0, + bandwidth_kb_sec = 1<<1, + bandwidth_mb_sec = 1<<2, + bandwidth_gb_sec = 1<<3, + bandwidth_tb_sec = 1<<4, + perf_kflop_sec = 1<<5, + perf_mflop_sec = 1<<6, + perf_gflop_sec = 1<<7, + perf_tflop_sec = 1<<8, + runtime_sec = 1<<9 +}; + /** - * Benchmark: driver function - */ - + @brief Struct that represents a single benchmark result +*/ +class Result1 +{ +public: + int threads; + gismo::metric metric; + double value; + double runtime; +}; + typedef std::array Result; /** - * Benchmark class - */ -class GISMO_EXPORT gsBenchmark + @brief Struct that represents a collection of benchmark results for + a single benchmark instance + + This struct can be used to hold a series of results of a single + benchmark instance (i.e. fixed problem size and problem + configuration) for different numbers of threads. +*/ +class gsBenchmarkResultSet { public: + gsBenchmarkResultSet(const std::string& label, + const std::string& title, + const std::vector& results) + : label(label), + title(title), + results(results) {} + + const std::string& get_label() const + { return label; } + + const std::string& get_title() const + { return title; } + + const std::vector& get() const + { return results; } + + std::ostream &to_tikz(std::ostream &os) const; + std::ostream &print(std::ostream &os) const; + +private: + const std::string label, title; + std::vector results; +}; /** - * Benchmark result - */ -typedef std::array Result; + @brief Struct that represents a collection of benchmark sets for a + series of benchmark instance -/** - * Benchmark result set class - */ - class gsBenchmarkResultSet + This struct can be used to hold a series of benchmark instances + (i.e. 
a series of problem sizes and configurations) +*/ +class gsBenchmarkSet +{ +public: + gsBenchmarkSet(const std::string& _label, + const std::string& _title) + : id('A'), + label(_label), + title(_title) + {} + + ~gsBenchmarkSet() { - public: - gsBenchmarkResultSet(const std::string& label, - const std::string& title, - const std::vector& results) - : label(label), - title(title), - results(results) - { - } - - const std::string& get_label() const - { return label; } - - const std::string& get_title() const - { return title; } - - const std::vector& get() const - { return results; } - - std::ostream &print(std::ostream &os) const; - - private: - const std::string label, title; - std::vector results; - }; - - /** - * Benchmark set class - */ - class gsBenchmarkSet + for (auto it=results.begin(); it!=results.end(); ++it) + delete (*it); + } + + void add(const std::string& _label, + const std::string& _title, + const std::vector& _results) { - public: - gsBenchmarkSet(const std::string& _label, - const std::string& _title) - : id('A'), - label(_label), - title(_title) - {} - - ~gsBenchmarkSet() - { - for (auto it=results.begin(); it!=results.end(); ++it) - delete (*it); - } - - void add(const std::string& _label, - const std::string& _title, - const std::vector& _results) - { - this->results.emplace_back(new gsBenchmarkResultSet(_label+std::string(1,id++), - _title, _results)); - } - - const std::string& get_label() const - { return label; } - - const std::string& get_title() const - { return title; } - - const std::vector& get() const - { return results; } - - std::ostream &print(std::ostream &os) const; - - private: - char id; - const std::string label,title; - std::vector results; - }; - + this->results.emplace_back(new gsBenchmarkResultSet(_label+std::string(1,id++), + _title, _results)); + } + + const std::string& get_label() const + { return label; } + + const std::string& get_title() const + { return title; } + + const std::vector& get() const + { return 
results; } + + std::ostream &to_tikz(std::ostream &os) const; + std::ostream &print(std::ostream &os) const; + +private: + char id; + const std::string label,title; + std::vector results; +}; + +/** + @brief Class that collects all benchmark results + */ +class GISMO_EXPORT gsBenchmark +{ public: ~gsBenchmark() { @@ -144,11 +155,12 @@ typedef std::array Result; const std::vector& get() const { return benchmarks; } + std::ostream &to_tikz(std::ostream &os) const; std::ostream &print(std::ostream &os) const; template static std::vector - run(const std::vector& nthreads, index_t nruns, T& benchmark, metric metric) + run(const std::vector& nthreads, index_t nruns, T& benchmark, gismo::metric metric) { gsStopwatch stopwatch; uint64_t benchmark_result(0); @@ -171,20 +183,20 @@ typedef std::array Result; stopwatch.stop(); benchmark_runtime = stopwatch.elapsed()/(real_t)nruns; - switch(metric & ~metric::speedup) { - case metric::bandwidth_kb_sec: case metric::perf_kflop_sec: + switch(metric & ~gismo::metric::speedup) { + case gismo::metric::bandwidth_kb_sec: case gismo::metric::perf_kflop_sec: benchmark_metric = 1e-3*benchmark_result/benchmark_runtime; break; - case metric::bandwidth_mb_sec: case metric::perf_mflop_sec: + case gismo::metric::bandwidth_mb_sec: case gismo::metric::perf_mflop_sec: benchmark_metric = 1e-6*benchmark_result/benchmark_runtime; break; - case metric::bandwidth_gb_sec: case metric::perf_gflop_sec: + case gismo::metric::bandwidth_gb_sec: case gismo::metric::perf_gflop_sec: benchmark_metric = 1e-9*benchmark_result/benchmark_runtime; break; - case metric::bandwidth_tb_sec: case metric::perf_tflop_sec: + case gismo::metric::bandwidth_tb_sec: case gismo::metric::perf_tflop_sec: benchmark_metric = 1e-12*benchmark_result/benchmark_runtime; break; - case metric::runtime_sec: + case gismo::metric::runtime_sec: benchmark_metric = benchmark_runtime; break; default: @@ -201,7 +213,7 @@ typedef std::array Result; } catch(...) 
{} // Convert to relative values (speedup relative to first entry) - if (metric & metric::speedup) { + if (metric & gismo::metric::speedup) { benchmark_runtime = results.front().at(1); benchmark_metric = results.front().at(2); From 40b1e06c4e877f80d57da6fbaa1eea351626ad60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Sun, 26 Dec 2021 10:09:49 +0100 Subject: [PATCH 113/174] small fixes --- examples/performance_benchmark.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 02d99f085b..24973952e9 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -154,7 +154,7 @@ class benchmark_c_array_memcopy static constexpr gismo::metric metric() { - return metric::bandwidth_gb_sec; + return gismo::metric::bandwidth_gb_sec; } }; //! [Implement benchmark native C array memcopy] @@ -220,7 +220,7 @@ class benchmark_c_array_dotproduct static constexpr gismo::metric metric() { - return metric::bandwidth_gb_sec; + return gismo::metric::bandwidth_gb_sec; } }; //! [Implement benchmark native C array dot-product] @@ -287,7 +287,7 @@ class benchmark_c_array_axpy static constexpr gismo::metric metric() { - return metric::bandwidth_gb_sec; + return gismo::metric::bandwidth_gb_sec; } }; //! [Implement benchmark native C array AXPY] @@ -360,7 +360,7 @@ class benchmark_c_array_dense_matmul static constexpr gismo::metric metric() { - return metric::bandwidth_gb_sec; + return gismo::metric::bandwidth_gb_sec; } }; //! [Implement benchmark native C array dense matrix-vector multiplication] @@ -412,7 +412,7 @@ class benchmark_eigen_memcopy static constexpr gismo::metric metric() { - return metric::bandwidth_gb_sec; + return gismo::metric::bandwidth_gb_sec; } }; //! 
[Implement benchmark eigen vector memcopy] @@ -462,7 +462,7 @@ class benchmark_eigen_dotproduct static constexpr gismo::metric metric() { - return metric::bandwidth_gb_sec; + return gismo::metric::bandwidth_gb_sec; } }; //! [Implement benchmark eigen vector dot-product] @@ -515,7 +515,7 @@ class benchmark_eigen_axpy static constexpr gismo::metric metric() { - return metric::bandwidth_gb_sec; + return gismo::metric::bandwidth_gb_sec; } }; //! [Implement benchmark eigen vector AXPY] @@ -569,7 +569,7 @@ class benchmark_eigen_dense_matmul static constexpr gismo::metric metric() { - return metric::bandwidth_gb_sec; + return gismo::metric::bandwidth_gb_sec; } }; //! [Implement benchmark eigen dense matrix-vector multiplication] @@ -649,7 +649,7 @@ class benchmark_poisson2d_visitor static constexpr gismo::metric metric() { - return (gismo::metric)(metric::runtime_sec + 0*metric::speedup); + return (gismo::metric)(gismo::metric::runtime_sec + gismo::metric::speedup); } }; //! [Implement benchmark Poisson 2d visitor] @@ -729,7 +729,7 @@ class benchmark_poisson3d_visitor static constexpr gismo::metric metric() { - return (gismo::metric)(metric::runtime_sec + metric::speedup); + return (gismo::metric)(gismo::metric::runtime_sec + gismo::metric::speedup); } }; //! [Implement benchmark Poisson 3d visitor] @@ -839,7 +839,7 @@ class benchmark_poisson2d_expression_assembler static constexpr gismo::metric metric() { - return (gismo::metric)(metric::runtime_sec + 0*metric::speedup); + return (gismo::metric)(gismo::metric::runtime_sec + gismo::metric::speedup); } }; //! [Implement benchmark Poisson 2d expression assembler] @@ -1129,7 +1129,7 @@ int main(int argc, char *argv[]) else { std::ofstream file; file.open(fn); - file << benchmark << "\n"; + benchmark.to_tikz(file); file.close(); } //! 
[Execute benchmarks] From 8e7c62449904774e0f7d371507562635098720e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 28 Dec 2021 09:28:29 +0100 Subject: [PATCH 114/174] Small fix for Fujitsu compiler in LLVM mode --- external/Eigen/src/Core/util/Macros.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/external/Eigen/src/Core/util/Macros.h b/external/Eigen/src/Core/util/Macros.h index 385a497064..556acb8cbb 100644 --- a/external/Eigen/src/Core/util/Macros.h +++ b/external/Eigen/src/Core/util/Macros.h @@ -194,7 +194,7 @@ #endif /// \internal EIGEN_COMP_FCC set to FCC version if the compiler is Fujitsu Compiler -#if defined(__FUJITSU) +#if defined(__FUJITSU) || defined(__CLANG_FUJITSU) #define EIGEN_COMP_FCC (__FCC_major__*100+__FCC_minor__*10+__FCC_patchlevel__) #else #define EIGEN_COMP_FCC 0 @@ -209,7 +209,7 @@ /// \internal EIGEN_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.) 
-#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN || EIGEN_COMP_FCC) +#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN || EIGEN_COMP_FCC || EIGEN_COMO_CRAY) #define EIGEN_COMP_GNUC_STRICT 1 #else #define EIGEN_COMP_GNUC_STRICT 0 From 25bb3d6903cd0d1e57040fb72094ae2dca01ed2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 28 Dec 2021 09:28:53 +0100 Subject: [PATCH 115/174] Small fixes in benchmark --- src/gsIO/gsBenchmark.cpp | 27 +++++++++++++++++------ src/gsIO/gsBenchmark.h | 46 ++++++++++++++++++++-------------------- 2 files changed, 43 insertions(+), 30 deletions(-) diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index 291690ccb5..5653217886 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -25,7 +25,7 @@ namespace gismo << label << "\n"; for (auto it=results.cbegin(); it!=results.cend(); ++it) - os << it->at(2) << "\n"; + os << it->value << "\n"; os << "}\\data" << label << "\n"; @@ -43,8 +43,8 @@ namespace gismo << "width=2\\textwidth,\n" << "height=.8\\textwidth,\n" << "legend pos=outer north east,\n" - << "ybar = 0.05cm,\n" - << "bar width = 3pt,\n" + << "ybar=0.05cm,\n" + << "bar width=3pt,\n" << "ymajorgrids=true,\n" << "xticklabel style={rotate=45,anchor=east},\n" << "xticklabels={"; @@ -56,8 +56,8 @@ namespace gismo << "xtick=data,\n"; auto it = results.front()->get().cbegin(); - if ((metric)it->at(3) & gismo::metric::speedup) { - switch( (int)it->at(3) & ~gismo::metric::speedup ) { + if (it->metric & gismo::metric::speedup) { + switch(it->metric & ~gismo::metric::speedup) { case gismo::metric::bandwidth_kb_sec: case gismo::metric::bandwidth_mb_sec: case gismo::metric::bandwidth_gb_sec: @@ -77,7 +77,7 @@ namespace gismo GISMO_ERROR("Unsupported metric"); } } else { - 
switch( (int)it->at(3) & ~gismo::metric::speedup ) { + switch(it->metric & ~gismo::metric::speedup) { case gismo::metric::bandwidth_kb_sec: os << "ylabel={Bandwidth in KB/s},\n"; break; @@ -139,7 +139,7 @@ namespace gismo it = results.front()->get().cbegin(); auto ite = results.front()->get().cend(); for (;it!=ite; ++it) - os << "Threads=" << it->at(0) << (it!=ite-1 ? "," : ""); + os << "Threads=" << it->threads << (it!=ite-1 ? "," : ""); os << "}\n" << "\\end{axis}\n" @@ -206,16 +206,29 @@ namespace gismo std::ostream &gsBenchmarkResultSet::print(std::ostream &os) const { + os << "... " << std::setw(6) << title << " : "; + for (auto it=results.cbegin(); it!=results.cend(); ++it) + os << std::setw(4) << it->threads << " : " + << std::setw(6) << std::scientific << std::setprecision(2) << it->value; + os << "\n"; return os; } std::ostream &gsBenchmarkSet::print(std::ostream &os) const { + os << "=== " << title << "\n" + << std::setw(10) << "size" + << std::setw(7) << "omp" + << std::setw(12) << "bw\n"; + for (auto it=results.cbegin(); it!=results.cend(); ++it) + (*it)->print(os); return os; } std::ostream &gsBenchmark::print(std::ostream &os) const { + for (auto it=benchmarks.cbegin(); it!=benchmarks.cend(); ++it) + (*it)->print(os); return os; } diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index 4b8bc2a533..82faff0900 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -39,15 +39,15 @@ enum metric : uint64_t { /** @brief Struct that represents a single benchmark result */ -class Result1 +class Result { public: - int threads; + int threads; + double runtime; + double value; gismo::metric metric; - double value; - double runtime; }; - typedef std::array Result; + //typedef std::array Result; /** @brief Struct that represents a collection of benchmark results for @@ -163,8 +163,8 @@ class GISMO_EXPORT gsBenchmark run(const std::vector& nthreads, index_t nruns, T& benchmark, gismo::metric metric) { gsStopwatch stopwatch; - uint64_t 
benchmark_result(0); - real_t benchmark_metric, benchmark_runtime; + uint64_t result(0); + real_t value, runtime; std::vector results; @@ -172,54 +172,54 @@ class GISMO_EXPORT gsBenchmark for (auto it=nthreads.cbegin(); it!=nthreads.cend(); ++it) { omp_set_num_threads(*it); - benchmark_runtime = 0.0; + runtime = 0.0; stopwatch.restart(); for (index_t run=0; run(*it); // number of OpenMP threads - res[1]= benchmark_runtime; // averaged elapsed time in seconds - res[2]= benchmark_metric; // averaged benchmark metric - res[3]= (real_t)metric; // benchmark metric + res.threads = static_cast(*it); // number of OpenMP threads + res.runtime = runtime; // averaged elapsed time in seconds + res.value = value; // averaged benchmark value + res.metric = metric; // benchmark metric results.push_back( give(res) ); } } catch(...) {} // Convert to relative values (speedup relative to first entry) if (metric & gismo::metric::speedup) { - benchmark_runtime = results.front().at(1); - benchmark_metric = results.front().at(2); + runtime = results.front().runtime; + value = results.front().value; for (auto &it : results) { - it.at(1) = benchmark_runtime / it.at(1); - it.at(2) = benchmark_metric / it.at(2); + it.runtime = runtime / it.runtime; + it.value = value / it.value; } } From 94f73cfa3f4c7522e07e55b13b30b38115645c67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 28 Dec 2021 09:39:12 +0100 Subject: [PATCH 116/174] Small fixes --- external/Eigen/src/Core/util/Macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/Eigen/src/Core/util/Macros.h b/external/Eigen/src/Core/util/Macros.h index 556acb8cbb..b4c358ce94 100644 --- a/external/Eigen/src/Core/util/Macros.h +++ b/external/Eigen/src/Core/util/Macros.h @@ -209,7 +209,7 @@ /// \internal EIGEN_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.) 
-#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN || EIGEN_COMP_FCC || EIGEN_COMO_CRAY) +#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN || EIGEN_COMP_FCC || EIGEN_COMP_CRAY) #define EIGEN_COMP_GNUC_STRICT 1 #else #define EIGEN_COMP_GNUC_STRICT 0 From e74ca421b3541c594db390d1bfbcd71ad683da11 Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Tue, 28 Dec 2021 03:42:42 -0500 Subject: [PATCH 117/174] Small fix for Fujitsu compiler in LLVM mode --- external/Eigen/src/Core/arch/NEON/Complex.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/external/Eigen/src/Core/arch/NEON/Complex.h b/external/Eigen/src/Core/arch/NEON/Complex.h index 8439b4af6c..3d575590b8 100644 --- a/external/Eigen/src/Core/arch/NEON/Complex.h +++ b/external/Eigen/src/Core/arch/NEON/Complex.h @@ -18,7 +18,7 @@ namespace internal { inline uint32x4_t p4ui_CONJ_XOR() { // See bug 1325, clang fails to call vld1q_u64. -#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML +#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || EIGEN_COMP_CRAY || defined(__CLANG_FUJITSU) uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; return ret; #else @@ -390,7 +390,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) { #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG // See bug 1325, clang fails to call vld1q_u64. 
-#if EIGEN_COMP_CLANG || EIGEN_COMP_CRAY || EIGEN_COMP_CASTXML +#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || EIGEN_COMP_CRAY || defined(__CLANG_FUJITSU) static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000}; #else const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 }; From 1abb9324ab066a6e266a226e6f8adfc1bb060871 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Fri, 31 Dec 2021 13:30:22 +0100 Subject: [PATCH 118/174] small fixes --- src/gsIO/gsBenchmark.cpp | 70 +++++++++-- src/gsIO/gsBenchmark.h | 243 ++++++++++++++++++++++++++++----------- 2 files changed, 240 insertions(+), 73 deletions(-) diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index 5653217886..2ba38a4ea5 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -50,7 +50,7 @@ namespace gismo << "xticklabels={"; for (auto rit=results.cbegin(); rit!=results.cend(); ++rit) - os << (*rit)->get_title() << (rit!=results.cend()-1 ? "," : ""); + os << (*rit)->get_descr() << (rit!=results.cend()-1 ? "," : ""); os << "},\n" << "xtick=data,\n"; @@ -110,7 +110,7 @@ namespace gismo } } - os << "title={" << title + os << "title={" << descr << " [real\\_t:" << util::type::name() << ", index\\_t:" << util::type::name() << ", short\\_t:" << util::type::name()<< "]},\n" @@ -206,7 +206,7 @@ namespace gismo std::ostream &gsBenchmarkResultSet::print(std::ostream &os) const { - os << "... 
" << std::setw(6) << title << " : "; + os << std::setw(8) << descr << " | "; for (auto it=results.cbegin(); it!=results.cend(); ++it) os << std::setw(4) << it->threads << " : " << std::setw(6) << std::scientific << std::setprecision(2) << it->value; @@ -216,10 +216,66 @@ namespace gismo std::ostream &gsBenchmarkSet::print(std::ostream &os) const { - os << "=== " << title << "\n" - << std::setw(10) << "size" - << std::setw(7) << "omp" - << std::setw(12) << "bw\n"; + os << "[" << label << "] " << descr << "\n" + << std::setw(8) << "memsize" + << " | " + << util::to_string(results.front()->get().size()) + << "x (#Threads : "; + + if (results.front()->get().cbegin()->metric & gismo::metric::speedup) { + switch(results.front()->get().cbegin()->metric & ~gismo::metric::speedup) { + case gismo::metric::bandwidth_kb_sec: + case gismo::metric::bandwidth_mb_sec: + case gismo::metric::bandwidth_gb_sec: + case gismo::metric::bandwidth_tb_sec: + os << "Bandwidth [speedup])\n"; + break; + case gismo::metric::perf_kflop_sec: + case gismo::metric::perf_mflop_sec: + case gismo::metric::perf_gflop_sec: + case gismo::metric::perf_tflop_sec: + os << "Performance [speedup])\n"; + break; + case gismo::metric::runtime_sec: + os << "Runtime [speedup])\n"; + break; + default: + GISMO_ERROR("Unsupported metric"); + } + } else { + switch(results.front()->get().cbegin()->metric & ~gismo::metric::speedup) { + case gismo::metric::bandwidth_kb_sec: + os << "Bandwidth in KB/s)\n"; + break; + case gismo::metric::bandwidth_mb_sec: + os << "Bandwidth in MB/s)\n"; + break; + case gismo::metric::bandwidth_gb_sec: + os << "Bandwidth in GB/s)\n"; + break; + case gismo::metric::bandwidth_tb_sec: + os << "Bandwidth in TB/s)\n"; + break; + case gismo::metric::perf_kflop_sec: + os << "Performance in kFLOP/s)\n"; + break; + case gismo::metric::perf_mflop_sec: + os << "Performance in mFLOP/s)\n"; + break; + case gismo::metric::perf_gflop_sec: + os << "Performance in gFLOP/s)\n"; + break; + case 
gismo::metric::perf_tflop_sec: + os << "Performance in tFLOP/s)\n"; + break; + case gismo::metric::runtime_sec: + os << "Runtime in seconds)\n"; + break; + default: + GISMO_ERROR("Unsupported metric"); + } + } + for (auto it=results.cbegin(); it!=results.cend(); ++it) (*it)->print(os); return os; diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index 82faff0900..de26f8168f 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -24,22 +24,22 @@ namespace gismo These definitions are used to control the output of the benchmark framework */ enum metric : uint64_t { - speedup = 1<<0, - bandwidth_kb_sec = 1<<1, - bandwidth_mb_sec = 1<<2, - bandwidth_gb_sec = 1<<3, - bandwidth_tb_sec = 1<<4, - perf_kflop_sec = 1<<5, - perf_mflop_sec = 1<<6, - perf_gflop_sec = 1<<7, - perf_tflop_sec = 1<<8, - runtime_sec = 1<<9 + speedup = 1 << 0, + bandwidth_kb_sec = 1 << 1, + bandwidth_mb_sec = 1 << 2, + bandwidth_gb_sec = 1 << 3, + bandwidth_tb_sec = 1 << 4, + perf_kflop_sec = 1 << 5, + perf_mflop_sec = 1 << 6, + perf_gflop_sec = 1 << 7, + perf_tflop_sec = 1 << 8, + runtime_sec = 1 << 9 }; /** @brief Struct that represents a single benchmark result */ -class Result +class gsBenchmarkResult { public: int threads; @@ -47,7 +47,6 @@ class Result double value; gismo::metric metric; }; - //typedef std::array Result; /** @brief Struct that represents a collection of benchmark results for @@ -60,33 +59,40 @@ class Result class gsBenchmarkResultSet { public: + /// \brief Constructor gsBenchmarkResultSet(const std::string& label, - const std::string& title, - const std::vector& results) + const std::string& descr, + const std::vector& results) : label(label), - title(title), + descr(descr), results(results) {} - + + /// \brief Returns the label const std::string& get_label() const { return label; } - - const std::string& get_title() const - { return title; } - - const std::vector& get() const + + /// \brief Returns the descr + const std::string& get_descr() const + { 
return descr; } + + /// \brief Returns constant reference to the results + const std::vector& get() const { return results; } + /// \brief Serializes the content to LaTeX TIKZ std::ostream &to_tikz(std::ostream &os) const; + + /// \brief Pretty-prints the content std::ostream &print(std::ostream &os) const; private: - const std::string label, title; - std::vector results; + const std::string label, descr; + std::vector results; }; /** @brief Struct that represents a collection of benchmark sets for a - series of benchmark instance + series of benchmark instances This struct can be used to hold a series of benchmark instances (i.e. a series of problem sizes and configurations) @@ -94,42 +100,51 @@ class gsBenchmarkResultSet class gsBenchmarkSet { public: + /// \brief Constructor gsBenchmarkSet(const std::string& _label, - const std::string& _title) + const std::string& _descr) : id('A'), label(_label), - title(_title) + descr(_descr) {} - + + /// \brief Destructor ~gsBenchmarkSet() { for (auto it=results.begin(); it!=results.end(); ++it) delete (*it); } - + + /// \brief Adds a benchmark to the benchmark set void add(const std::string& _label, - const std::string& _title, - const std::vector& _results) + const std::string& _descr, + const std::vector& _results) { this->results.emplace_back(new gsBenchmarkResultSet(_label+std::string(1,id++), - _title, _results)); + _descr, _results)); } - + + /// \brief Returns the label const std::string& get_label() const { return label; } - - const std::string& get_title() const - { return title; } - + + /// \brief Returns the descr + const std::string& get_descr() const + { return descr; } + + /// \brief Returns constant reference to the result sets const std::vector& get() const { return results; } + /// \brief Serializes the content to LaTeX TIKZ std::ostream &to_tikz(std::ostream &os) const; + + /// \brief Pretty-prints the content std::ostream &print(std::ostream &os) const; private: char id; - const std::string label,title; 
+ const std::string label,descr; std::vector results; }; @@ -139,62 +154,158 @@ class gsBenchmarkSet class GISMO_EXPORT gsBenchmark { public: + /// \brief Destructor ~gsBenchmark() - { - for (auto it=benchmarks.begin(); it!=benchmarks.end(); ++it) - delete (*it); - } - - gsBenchmarkSet* add(const std::string& _label, - const std::string& _title) { - benchmarks.emplace_back(new gsBenchmarkSet(_label, _title)); - return benchmarks.back(); + for (auto&& it : benchmarks) + delete (it); } + /// \brief Returns constant reference to the benchmarks const std::vector& get() const { return benchmarks; } + /// \brief Serializes the content to LaTeX TIKZ std::ostream &to_tikz(std::ostream &os) const; + + /// \brief Pretty-prints the content std::ostream &print(std::ostream &os) const; - template - static std::vector - run(const std::vector& nthreads, index_t nruns, T& benchmark, gismo::metric metric) -{ + /// \brief Returns iterator to benchmark set + const gsBenchmarkSet* find(const std::string& label) const + { + for (const auto& it : benchmarks) + if (it->get_label() == label) + return it; + return nullptr; + } + + /// \brief Creates a new benchmark set, adds it to the benchmark and + /// returns a pointer to the benchmark set to the calling routine + gsBenchmarkSet* create(const std::string& _label, + const std::string& _descr) + { + benchmarks.emplace_back(new gsBenchmarkSet(_label, _descr)); + return benchmarks.back(); + } + + /// \brief Creates a new benchmark set, adds it to the benchmark and + /// returns a pointer to the benchmark set to the calling routine + template + gsBenchmarkSet* create(const Iterator & sizes, + const std::vector & runs, + const std::vector & threads, + const std::string & extra_name="") + { + GISMO_ASSERT(sizes.size()==runs.size(), "Problem sizes and number of runs must have the same length"); + + auto benchmark = create(Test::label(), Test::name()+extra_name); + gsInfo << "[" << benchmark->get_label() << "] " << benchmark->get_descr() << 
"\n"; + + auto riter = runs.begin(); + for (auto it : sizes) { + gsInfo << util::to_string(it) << "(" << *riter << ")"<< std::flush; + try { + Test test(it); + auto results = gsBenchmark::run_test(test, Test::metric(), threads, *riter++); + std::string meminfo; + uint64_t memsize = test.size(); + if (memsize<1024) + meminfo = util::to_string(memsize)+" B"; + else if (memsize<1024*1024) + meminfo = util::to_string(memsize/1024)+" KB"; + else if (memsize<1024*1024*1024) + meminfo = util::to_string(memsize/(1024*1024))+" MB"; + else + meminfo = util::to_string(memsize/(1024*1024*1024))+" GB"; + benchmark->add(Test::label(), meminfo, results); + } catch(...) { gsInfo << "[failed!]"; } + gsInfo << "..."; + } + gsInfo << "\n"; + return benchmark; + } + + /// \brief Creates a new benchmark set, adds it to the benchmark and + /// returns a pointer to the benchmark set to the calling routine + template + gsBenchmarkSet* create(const util::zip_helper& sizes, + const std::vector & runs, + const std::vector & threads, + const std::string & extra_name="") + { + GISMO_ASSERT(sizes.size()==runs.size(), "Problem sizes and number of runs must have the same length"); + + auto benchmark = create(Test::label(), Test::name()+extra_name); + gsInfo << "[" << benchmark->get_label() << "] " << benchmark->get_descr() << "\n"; + + auto riter = runs.begin(); + for (auto it : sizes) { + gsInfo << util::to_string(it) << "(" << *riter << ")"<< std::flush; + try { + Test test(it); + auto results = gsBenchmark::run_test(test, Test::metric(), threads, *riter++); + std::string meminfo; + uint64_t memsize = test.size(); + if (memsize<1024) + meminfo = util::to_string(memsize)+" B"; + else if (memsize<1024*1024) + meminfo = util::to_string(memsize/1024)+" KB"; + else if (memsize<1024*1024*1024) + meminfo = util::to_string(memsize/(1024*1024))+" MB"; + else + meminfo = util::to_string(memsize/(1024*1024*1024))+" GB"; + benchmark->add(Test::label(), meminfo, results); + } catch(...) 
{ gsInfo << "[failed!]"; } + gsInfo << "..."; + } + gsInfo << "\n"; + return benchmark; + } + +private: + /// \brief Runs the benchmark instance \a benchmark for the + /// specified number of \a threads and \a runs and returns an \a + /// std::vector of \a gsBenchmarkResult that represent the + /// respective benchmark results measured in the specified \a metric + template + static std::vector + run_test(Test& test, gismo::metric metric, + const std::vector& threads, index_t runs) + { gsStopwatch stopwatch; uint64_t result(0); real_t value, runtime; - - std::vector results; + + std::vector results; try { - for (auto it=nthreads.cbegin(); it!=nthreads.cend(); ++it) { + for (auto it=threads.cbegin(); it!=threads.cend(); ++it) { omp_set_num_threads(*it); runtime = 0.0; stopwatch.restart(); - for (index_t run=0; run(*it); // number of OpenMP threads - res.runtime = runtime; // averaged elapsed time in seconds - res.value = value; // averaged benchmark value - res.metric = metric; // benchmark metric - results.push_back( give(res) ); + gsBenchmarkResult result; + result.threads = static_cast(*it); // number of OpenMP threads + result.runtime = runtime; // averaged elapsed time in seconds + result.value = value; // averaged benchmark value + result.metric = metric; // benchmark metric + results.push_back( give(result) ); } } catch(...) 
{} From bf4d01b6cebb3e3963030236e7133f059217ff67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Wed, 5 Jan 2022 15:00:27 +0100 Subject: [PATCH 119/174] Added XML input/output for benchmark framework --- src/gsIO/gsBenchmark.cpp | 113 +++++---- src/gsIO/gsBenchmark.h | 491 +++++++++++++++++++++++++++++---------- 2 files changed, 429 insertions(+), 175 deletions(-) diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index 2ba38a4ea5..136e530855 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -24,8 +24,8 @@ namespace gismo os << "\\pgfplotstableread[col sep=space]{\n" << label << "\n"; - for (auto it=results.cbegin(); it!=results.cend(); ++it) - os << it->value << "\n"; + for (const auto& it : results) + os << it.value << "\n"; os << "}\\data" << label << "\n"; @@ -34,8 +34,8 @@ namespace gismo std::ostream &gsBenchmarkSet::to_tikz(std::ostream &os) const { - for (auto it=results.cbegin(); it!=results.cend(); ++it) - (*it)->to_tikz(os); + for (const auto& it : results) + it.to_tikz(os); os << "\\begin{tikzpicture}\n" << "\\begin{axis}[\n" @@ -49,35 +49,44 @@ namespace gismo << "xticklabel style={rotate=45,anchor=east},\n" << "xticklabels={"; - for (auto rit=results.cbegin(); rit!=results.cend(); ++rit) - os << (*rit)->get_descr() << (rit!=results.cend()-1 ? "," : ""); + for (const auto& it : results) + os << it.get_descr() << (&it != &results.back() ? 
"," : ""); os << "},\n" << "xtick=data,\n"; - auto it = results.front()->get().cbegin(); - if (it->metric & gismo::metric::speedup) { - switch(it->metric & ~gismo::metric::speedup) { + auto metric = results.front().get().cbegin()->metric; + if (metric & gismo::metric::speedup || metric & gismo::metric::ratio) { + switch(metric & ~gismo::metric::speedup & ~gismo::metric::ratio) { case gismo::metric::bandwidth_kb_sec: case gismo::metric::bandwidth_mb_sec: case gismo::metric::bandwidth_gb_sec: case gismo::metric::bandwidth_tb_sec: - os << "ylabel={Bandwidth [speedup]},\n"; + os << "ylabel={Bandwidth [" + << (metric & gismo::metric::speedup ? "speedup" : "") + << (metric & gismo::metric::ratio ? "ratio" : "") + << "]},\n"; break; case gismo::metric::perf_kflop_sec: case gismo::metric::perf_mflop_sec: case gismo::metric::perf_gflop_sec: case gismo::metric::perf_tflop_sec: - os << "ylabel={Performance [speedup]},\n"; + os << "ylabel={Performance [" + << (metric & gismo::metric::speedup ? "speedup" : "") + << (metric & gismo::metric::ratio ? "ratio" : "") + << "]},\n"; break; case gismo::metric::runtime_sec: - os << "ylabel={Runtime [speedup]},\n"; + os << "ylabel={Runtime [" + << (metric & gismo::metric::speedup ? "speedup" : "") + << (metric & gismo::metric::ratio ? 
"ratio" : "") + << "]},\n"; break; default: GISMO_ERROR("Unsupported metric"); - } + } } else { - switch(it->metric & ~gismo::metric::speedup) { + switch(metric & ~gismo::metric::speedup & ~gismo::metric::ratio) { case gismo::metric::bandwidth_kb_sec: os << "ylabel={Bandwidth in KB/s},\n"; break; @@ -112,34 +121,32 @@ namespace gismo os << "title={" << descr << " [real\\_t:" << util::type::name() - << ", index\\_t:" << util::type::name() - << ", short\\_t:" << util::type::name()<< "]},\n" + << ", index\\_t:" << util::type::name() + << ", short\\_t:" << util::type::name()<< "]},\n" << "]\n"; for (auto rit=results.cbegin()+1; rit!=results.cend(); ++rit) os << "\\pgfplotstablecreatecol[copy column from " << "table={\\data" - << (*rit)->get_label() + << rit->get_label() << "}{[index] 0}] {" - << (*rit)->get_label() + << rit->get_label() << "} {\\data" - << (*results.cbegin())->get_label() + << results.cbegin()->get_label() << "}\n"; os << "\\pgfplotstabletranspose[rows/threads/.style={string type}]\\mytable{" << "\\data" - << (*results.cbegin())->get_label() + << results.cbegin()->get_label() << "}\n"; - for (std::size_t i=1; i<=results.front()->get().size(); ++i) + for (std::size_t i=1; i<=results.front().get().size(); ++i) os << "\\addplot table[x expr=\\coordindex, y index=" << util::to_string(i) << "]{\\mytable};\n"; os << "\\legend{"; - it = results.front()->get().cbegin(); - auto ite = results.front()->get().cend(); - for (;it!=ite; ++it) - os << "Threads=" << it->threads << (it!=ite-1 ? "," : ""); + for (const auto& it : results.front().get()) + os << "Threads=" << it.threads << (&it!=&results.front().get().back() ? 
"," : ""); os << "}\n" << "\\end{axis}\n" @@ -166,9 +173,9 @@ namespace gismo std::string flags = jit.getFlags(); os << "Compiler flags "; - for (auto token=strtok(&flags[0], " "); token!=NULL; token=strtok(NULL, " ")) { - if (token[0]=='-') { - if (token[1]=='I' || token[1]=='L' || token[1]=='l' || token[1]=='W') + for (auto token = strtok(&flags[0], " "); token != NULL; token = strtok(NULL, " ")) { + if (token[0] == '-') { + if (token[1] == 'I' || token[1] == 'L' || token[1] == 'l' || token[1] == 'W') continue; os << "\\verb!" << token << "! "; } @@ -197,8 +204,8 @@ namespace gismo << "\\begin{document}\n" << "\\usetikzlibrary{calc}\n"; - for (auto it=benchmarks.cbegin(); it!=benchmarks.cend(); ++it) - (*it)->to_tikz(os); + for (const auto& it : benchmarks) + it.to_tikz(os); os << "\\end{document}\n"; return os; @@ -207,43 +214,57 @@ namespace gismo std::ostream &gsBenchmarkResultSet::print(std::ostream &os) const { os << std::setw(8) << descr << " | "; - for (auto it=results.cbegin(); it!=results.cend(); ++it) - os << std::setw(4) << it->threads << " : " - << std::setw(6) << std::scientific << std::setprecision(2) << it->value; + for (const auto& it : results) + os << std::setw(4) << it.threads << " : " + << std::setw(6) << std::scientific << std::setprecision(2) << it.value; os << "\n"; return os; } std::ostream &gsBenchmarkSet::print(std::ostream &os) const { - os << "[" << label << "] " << descr << "\n" - << std::setw(8) << "memsize" + os << "[" << label << "] " << descr << "\n"; + + if (results.size() == 0) + return os; + + os << std::setw(8) << "memsize" << " | " - << util::to_string(results.front()->get().size()) + << util::to_string(results.front().get().size()) << "x (#Threads : "; - if (results.front()->get().cbegin()->metric & gismo::metric::speedup) { - switch(results.front()->get().cbegin()->metric & ~gismo::metric::speedup) { + auto metric = results.front().get().cbegin()->metric; + if (metric & gismo::metric::speedup || metric & 
gismo::metric::ratio) { + switch(metric & ~gismo::metric::speedup & ~gismo::metric::ratio) { case gismo::metric::bandwidth_kb_sec: case gismo::metric::bandwidth_mb_sec: case gismo::metric::bandwidth_gb_sec: case gismo::metric::bandwidth_tb_sec: - os << "Bandwidth [speedup])\n"; + os << "Bandwidth [" + << (metric & gismo::metric::speedup ? "speedup" : "") + << (metric & gismo::metric::ratio ? "ratio" : "") + << "])\n"; break; case gismo::metric::perf_kflop_sec: case gismo::metric::perf_mflop_sec: case gismo::metric::perf_gflop_sec: case gismo::metric::perf_tflop_sec: - os << "Performance [speedup])\n"; + os << "Performance [" + << (metric & gismo::metric::speedup ? "speedup" : "") + << (metric & gismo::metric::ratio ? "ratio" : "") + << "])\n"; break; case gismo::metric::runtime_sec: - os << "Runtime [speedup])\n"; + os << "Runtime [" + << (metric & gismo::metric::speedup ? "speedup" : "") + << (metric & gismo::metric::ratio ? "ratio" : "") + << "])\n"; break; default: GISMO_ERROR("Unsupported metric"); } } else { - switch(results.front()->get().cbegin()->metric & ~gismo::metric::speedup) { + switch(metric & ~gismo::metric::speedup & ~gismo::metric::ratio) { case gismo::metric::bandwidth_kb_sec: os << "Bandwidth in KB/s)\n"; break; @@ -276,15 +297,15 @@ namespace gismo } } - for (auto it=results.cbegin(); it!=results.cend(); ++it) - (*it)->print(os); + for (const auto& it : results) + it.print(os); return os; } std::ostream &gsBenchmark::print(std::ostream &os) const { - for (auto it=benchmarks.cbegin(); it!=benchmarks.cend(); ++it) - (*it)->print(os); + for (const auto& it : benchmarks) + it.print(os); return os; } diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index de26f8168f..d2e5435809 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -14,6 +14,7 @@ #pragma once #include +#include #include namespace gismo @@ -24,20 +25,27 @@ namespace gismo These definitions are used to control the output of the benchmark framework */ enum metric 
: uint64_t { - speedup = 1 << 0, - bandwidth_kb_sec = 1 << 1, - bandwidth_mb_sec = 1 << 2, - bandwidth_gb_sec = 1 << 3, - bandwidth_tb_sec = 1 << 4, - perf_kflop_sec = 1 << 5, - perf_mflop_sec = 1 << 6, - perf_gflop_sec = 1 << 7, - perf_tflop_sec = 1 << 8, - runtime_sec = 1 << 9 + speedup = 1 << 0, + ratio = 1 << 1, + bandwidth_kb_sec = 1 << 2, + bandwidth_mb_sec = 1 << 3, + bandwidth_gb_sec = 1 << 4, + bandwidth_tb_sec = 1 << 5, + perf_kflop_sec = 1 << 6, + perf_mflop_sec = 1 << 7, + perf_gflop_sec = 1 << 8, + perf_tflop_sec = 1 << 9, + runtime_sec = 1 << 10 }; /** - @brief Struct that represents a single benchmark result + @brief Class that represents a single benchmark result + + A \a gsBenchmarkResult object is the most atomic unit of the + benchmark framework. It represents the result of a single run for a + fixed problem size and configuration and a fixed number of + threads. A series of runs for different numbers of threads is + collected in a \a gsBenchmarkResultSet object. */ class gsBenchmarkResult { @@ -48,25 +56,82 @@ class gsBenchmarkResult gismo::metric metric; }; +namespace internal +{ +/// @brief Get a gsBenchmarkResult from XML data +template<> +class gsXml< gsBenchmarkResult > +{ +private: + gsXml() { } + typedef gsBenchmarkResult Object; +public: + GSXML_COMMON_FUNCTIONS(Object); + static std::string tag () { return "BenchmarkResult"; } + static std::string type () { return "BenchmarkResult"; } + + GSXML_GET_POINTER(Object); + + static void get_into (gsXmlNode * node, Object & obj) + { + gsXmlNode * child; + + child = node->first_node("threads"); + if (child != NULL) obj.threads = atoi(child->value()); + + child = node->first_node("runtime"); + if (child != NULL) obj.runtime = atof(child->value()); + + child = node->first_node("value"); + if (child != NULL) obj.value = atof(child->value()); + + child = node->first_node("metric"); + if (child != NULL) obj.metric = (gismo::metric)atol(child->value()); + } + + static gsXmlNode * put (const Object & 
obj, gsXmlTree & data ) + { + gsXmlNode * result = makeNode("BenchmarkResult", data); + + result->append_node( makeNode("threads", util::to_string(obj.threads), data) ); + result->append_node( makeNode("runtime", util::to_string(obj.runtime), data) ); + result->append_node( makeNode("value", util::to_string(obj.value), data) ); + result->append_node( makeNode("metric", util::to_string(obj.metric), data) ); + + return result; + } +}; +} // namespace internal + /** - @brief Struct that represents a collection of benchmark results for - a single benchmark instance + @brief Class that represents a set of benchmark results - This struct can be used to hold a series of results of a single - benchmark instance (i.e. fixed problem size and problem - configuration) for different numbers of threads. + A \a gsBenchmarkResultSet object holds a set of benchmark results + (\a gsBenchmarkResult) for a fixed problem size and configuration + but for different numbers of threads. */ class gsBenchmarkResultSet { public: + /// \brief Default constructor + gsBenchmarkResultSet() = default; + /// \brief Constructor gsBenchmarkResultSet(const std::string& label, const std::string& descr, const std::vector& results) : label(label), descr(descr), - results(results) {} - + results( give(std::vector(results)) ) {} + + /// \brief Constructor + gsBenchmarkResultSet(const std::string& label, + const std::string& descr, + std::vector&& results) + : label(label), + descr(descr), + results( give(results) ) {} + /// \brief Returns the label const std::string& get_label() const { return label; } @@ -79,6 +144,10 @@ class gsBenchmarkResultSet const std::vector& get() const { return results; } + /// \brief Returns non-constant reference to the results + std::vector& get() + { return results; } + /// \brief Serializes the content to LaTeX TIKZ std::ostream &to_tikz(std::ostream &os) const; @@ -86,44 +155,98 @@ class gsBenchmarkResultSet std::ostream &print(std::ostream &os) const; private: - const 
std::string label, descr; + std::string label, descr; std::vector results; }; +/// Print (as string) operator +inline std::ostream &operator<<(std::ostream &os, const gsBenchmarkResultSet& obj) +{ return obj.print(os); } + +namespace internal +{ +/// @brief Get a gsBenchmarkResultSet from XML data +template<> +class gsXml< gsBenchmarkResultSet > +{ +private: + gsXml() { } + typedef gsBenchmarkResultSet Object; +public: + GSXML_COMMON_FUNCTIONS(Object); + static std::string tag () { return "BenchmarkResultSet"; } + static std::string type () { return "BenchmarkResultSet"; } + + GSXML_GET_POINTER(Object); + + static void get_into (gsXmlNode * node, Object & obj) + { + gsXmlNode * child; + std::string label, descr; + + child = node->first_node("label"); + if (child != NULL) label = child->value(); + + child = node->first_node("descr"); + if (child != NULL) descr = child->value(); + + std::vector results; + + child = node->first_node(gsXml< gsBenchmarkResult >::tag().c_str()); + for (; child; child = child->next_sibling() ) { + gsBenchmarkResult result; + gsXml< gsBenchmarkResult >::get_into(child, result); + results.push_back( give(result) ); + } + + obj = gsBenchmarkResultSet(label, descr, give(results)); + } + + static gsXmlNode * put (const Object & obj, gsXmlTree & data ) + { + gsXmlNode * results = makeNode("BenchmarkResultSet", data); + + results->append_node( makeNode("label", obj.get_label(), data) ); + results->append_node( makeNode("descr", obj.get_descr(), data) ); + + for (const auto& it : obj.get()) { + results->append_node( gsXml< gsBenchmarkResult >::put(it, data) ); + } + + return results; + } +}; +} // namespace internal + /** - @brief Struct that represents a collection of benchmark sets for a + @brief Class that represents a collection of benchmark sets for a series of benchmark instances This struct can be used to hold a series of benchmark instances - (i.e. a series of problem sizes and configurations) + (i.e. 
a series of problem sizes and configurations)< */ class gsBenchmarkSet { public: + /// \brief Default Constructor + gsBenchmarkSet() = default; + /// \brief Constructor - gsBenchmarkSet(const std::string& _label, - const std::string& _descr) - : id('A'), - label(_label), - descr(_descr) - {} - - /// \brief Destructor - ~gsBenchmarkSet() - { - for (auto it=results.begin(); it!=results.end(); ++it) - delete (*it); - } - - /// \brief Adds a benchmark to the benchmark set - void add(const std::string& _label, - const std::string& _descr, - const std::vector& _results) - { - this->results.emplace_back(new gsBenchmarkResultSet(_label+std::string(1,id++), - _descr, _results)); - } + gsBenchmarkSet(const std::string& label, + const std::string& descr, + const std::vector& results) + : label(label), + descr(descr), + results( give(std::vector(results)) ) {} + /// \brief Constructor + gsBenchmarkSet(const std::string& label, + const std::string& descr, + std::vector&& results) + : label(label), + descr(descr), + results( give(results) ) {} + /// \brief Returns the label const std::string& get_label() const { return label; } @@ -133,7 +256,11 @@ class gsBenchmarkSet { return descr; } /// \brief Returns constant reference to the result sets - const std::vector& get() const + const std::vector& get() const + { return results; } + + /// \brief Returns non-constant reference to the result sets + std::vector& get() { return results; } /// \brief Serializes the content to LaTeX TIKZ @@ -143,26 +270,84 @@ class gsBenchmarkSet std::ostream &print(std::ostream &os) const; private: - char id; - const std::string label,descr; - std::vector results; + std::string label, descr; + std::vector results; }; + +/// Print (as string) operator +inline std::ostream &operator<<(std::ostream &os, const gsBenchmarkSet& obj) +{ return obj.print(os); } + +namespace internal +{ +/// @brief Get a gsBenchmarkSet from XML data +template<> +class gsXml< gsBenchmarkSet > +{ +private: + gsXml() { } + typedef 
gsBenchmarkSet Object; +public: + GSXML_COMMON_FUNCTIONS(Object); + static std::string tag () { return "BenchmarkSet"; } + static std::string type () { return "BenchmarkSet"; } + GSXML_GET_POINTER(Object); + + static void get_into (gsXmlNode * node, Object & obj) + { + gsXmlNode * child; + std::string label, descr; + + child = node->first_node("label"); + if (child != NULL) label = child->value(); + + child = node->first_node("descr"); + if (child != NULL) descr = child->value(); + + std::vector results; + + child = node->first_node(gsXml< gsBenchmarkResultSet >::tag().c_str()); + for (; child; child = child->next_sibling() ) { + gsBenchmarkResultSet _results; + gsXml< gsBenchmarkResultSet >::get_into(child, _results); + results.push_back( give(_results) ); + } + + obj = gsBenchmarkSet(label, descr, give(results) ); + } + + static gsXmlNode * put (const Object & obj, gsXmlTree & data ) + { + gsXmlNode * results = makeNode("BenchmarkSet", data); + + results->append_node( makeNode("label", obj.get_label(), data) ); + results->append_node( makeNode("descr", obj.get_descr(), data) ); + + for (const auto& it : obj.get()) { + results->append_node( gsXml< gsBenchmarkResultSet >::put(it, data) ); + } + + return results; + } +}; +} // namespace internal + /** - @brief Class that collects all benchmark results + @brief Class that represents a collection of benchmarks + + This is the top-level class of the benchmark framework and the only + one that should be used by the user directly. 
*/ class GISMO_EXPORT gsBenchmark { public: - /// \brief Destructor - ~gsBenchmark() - { - for (auto&& it : benchmarks) - delete (it); - } - /// \brief Returns constant reference to the benchmarks - const std::vector& get() const + const std::vector& get() const + { return benchmarks; } + + /// \brief Returns non-constant reference to the benchmarks + std::vector& get() { return benchmarks; } /// \brief Serializes the content to LaTeX TIKZ @@ -172,81 +357,37 @@ class GISMO_EXPORT gsBenchmark std::ostream &print(std::ostream &os) const; /// \brief Returns iterator to benchmark set - const gsBenchmarkSet* find(const std::string& label) const + const std::vector::const_iterator find(const std::string& label) const { - for (const auto& it : benchmarks) + for (auto it = benchmarks.cbegin(); it != benchmarks.cend(); ++it) if (it->get_label() == label) return it; - return nullptr; - } - - /// \brief Creates a new benchmark set, adds it to the benchmark and - /// returns a pointer to the benchmark set to the calling routine - gsBenchmarkSet* create(const std::string& _label, - const std::string& _descr) - { - benchmarks.emplace_back(new gsBenchmarkSet(_label, _descr)); - return benchmarks.back(); + return benchmarks.cend(); } /// \brief Creates a new benchmark set, adds it to the benchmark and /// returns a pointer to the benchmark set to the calling routine template - gsBenchmarkSet* create(const Iterator & sizes, - const std::vector & runs, - const std::vector & threads, - const std::string & extra_name="") + const gsBenchmarkSet& create(const Iterator & sizes, + const std::vector & runs, + const std::vector & threads, + const std::string & extra_descr="") { GISMO_ASSERT(sizes.size()==runs.size(), "Problem sizes and number of runs must have the same length"); + + gsInfo << "[" << Test::label() << "] " + << Test::descr()+extra_descr << "\n"; - auto benchmark = create(Test::label(), Test::name()+extra_name); - gsInfo << "[" << benchmark->get_label() << "] " << 
benchmark->get_descr() << "\n"; + std::vector results; + char id('A'); auto riter = runs.begin(); - for (auto it : sizes) { + for (const auto& it : sizes) { gsInfo << util::to_string(it) << "(" << *riter << ")"<< std::flush; try { Test test(it); - auto results = gsBenchmark::run_test(test, Test::metric(), threads, *riter++); - std::string meminfo; uint64_t memsize = test.size(); - if (memsize<1024) - meminfo = util::to_string(memsize)+" B"; - else if (memsize<1024*1024) - meminfo = util::to_string(memsize/1024)+" KB"; - else if (memsize<1024*1024*1024) - meminfo = util::to_string(memsize/(1024*1024))+" MB"; - else - meminfo = util::to_string(memsize/(1024*1024*1024))+" GB"; - benchmark->add(Test::label(), meminfo, results); - } catch(...) { gsInfo << "[failed!]"; } - gsInfo << "..."; - } - gsInfo << "\n"; - return benchmark; - } - - /// \brief Creates a new benchmark set, adds it to the benchmark and - /// returns a pointer to the benchmark set to the calling routine - template - gsBenchmarkSet* create(const util::zip_helper& sizes, - const std::vector & runs, - const std::vector & threads, - const std::string & extra_name="") - { - GISMO_ASSERT(sizes.size()==runs.size(), "Problem sizes and number of runs must have the same length"); - - auto benchmark = create(Test::label(), Test::name()+extra_name); - gsInfo << "[" << benchmark->get_label() << "] " << benchmark->get_descr() << "\n"; - - auto riter = runs.begin(); - for (auto it : sizes) { - gsInfo << util::to_string(it) << "(" << *riter << ")"<< std::flush; - try { - Test test(it); - auto results = gsBenchmark::run_test(test, Test::metric(), threads, *riter++); std::string meminfo; - uint64_t memsize = test.size(); if (memsize<1024) meminfo = util::to_string(memsize)+" B"; else if (memsize<1024*1024) @@ -255,14 +396,19 @@ class GISMO_EXPORT gsBenchmark meminfo = util::to_string(memsize/(1024*1024))+" MB"; else meminfo = util::to_string(memsize/(1024*1024*1024))+" GB"; - benchmark->add(Test::label(), meminfo, 
results); + + results.push_back( give(gsBenchmarkResultSet(Test::label()+std::string(1,id++), meminfo, + give(gsBenchmark::run(test, Test::metric(), threads, *riter++)))) ); } catch(...) { gsInfo << "[failed!]"; } gsInfo << "..."; } gsInfo << "\n"; - return benchmark; - } + gsBenchmarkSet benchmark(Test::label(), Test::descr()+extra_descr, give(results) ); + benchmarks.push_back( give(benchmark) ); + return benchmarks.back(); + } + private: /// \brief Runs the benchmark instance \a benchmark for the /// specified number of \a threads and \a runs and returns an \a @@ -270,19 +416,17 @@ class GISMO_EXPORT gsBenchmark /// respective benchmark results measured in the specified \a metric template static std::vector - run_test(Test& test, gismo::metric metric, - const std::vector& threads, index_t runs) + run(Test& test, gismo::metric metric, const std::vector& threads, index_t runs) { + std::vector results; gsStopwatch stopwatch; uint64_t result(0); real_t value, runtime; - std::vector results; - try { - for (auto it=threads.cbegin(); it!=threads.cend(); ++it) { + for (const auto& it : threads) { - omp_set_num_threads(*it); + omp_set_num_threads(it); runtime = 0.0; stopwatch.restart(); @@ -294,7 +438,7 @@ class GISMO_EXPORT gsBenchmark stopwatch.stop(); runtime = stopwatch.elapsed()/(real_t)runs; - switch(metric & ~gismo::metric::speedup) { + switch(metric & ~gismo::metric::speedup & ~gismo::metric::ratio) { case gismo::metric::bandwidth_kb_sec: case gismo::metric::perf_kflop_sec: value = 1e-03 * result / runtime; break; @@ -315,10 +459,10 @@ class GISMO_EXPORT gsBenchmark } gsBenchmarkResult result; - result.threads = static_cast(*it); // number of OpenMP threads - result.runtime = runtime; // averaged elapsed time in seconds - result.value = value; // averaged benchmark value - result.metric = metric; // benchmark metric + result.threads = static_cast(it); // number of OpenMP threads + result.runtime = runtime; // averaged elapsed time in seconds + result.value = 
value; // averaged benchmark value + result.metric = metric; // benchmark metric results.push_back( give(result) ); } } catch(...) {} @@ -328,7 +472,7 @@ class GISMO_EXPORT gsBenchmark runtime = results.front().runtime; value = results.front().value; - for (auto &it : results) { + for (auto& it : results) { it.runtime = runtime / it.runtime; it.value = value / it.value; } @@ -338,11 +482,100 @@ class GISMO_EXPORT gsBenchmark } private: - std::vector benchmarks; + std::vector benchmarks; }; /// Print (as string) operator inline std::ostream &operator<<(std::ostream &os, const gsBenchmark& obj) { return obj.print(os); } +namespace internal +{ +/// @brief Get a gsBenchmark from XML data +template<> +class gsXml< gsBenchmark > +{ +private: + gsXml() { } + typedef gsBenchmark Object; +public: + GSXML_COMMON_FUNCTIONS(Object); + static std::string tag () { return "Benchmark"; } + static std::string type () { return "Benchmark"; } + + GSXML_GET_POINTER(Object); + + static void get_into (gsXmlNode * node, Object & obj) + { + gsXmlNode * child; + + child = node->first_node(gsXml< gsBenchmarkSet >::tag().c_str()); + for (; child; child = child->next_sibling() ) { + gsBenchmarkSet benchmark; + gsXml< gsBenchmarkSet >::get_into(child, benchmark); + obj.get().push_back( give(benchmark) ); + } + } + + static gsXmlNode * put (const Object & obj, gsXmlTree & data ) + { + gsXmlNode * results = makeNode("Benchmark", data); + + for (const auto& it : obj.get()) { + results->append_node( gsXml< gsBenchmarkSet >::put(it, data) ); + } + + return results; + } +}; +} // namespace internal + +namespace benchmark { + + /// \brief Returns the ratio of the two given benchmark result sets + gsBenchmarkResultSet ratio(const std::string& label, + const std::string& descr, + const gsBenchmarkResultSet objA, + const gsBenchmarkResultSet objB) + { + GISMO_ASSERT(objA.get().size() == objB.get().size(), + "Benchmark result sets must have the same size"); + + std::vector results; + for (const auto& it 
: util::zip(objA.get(), objB.get())) { + gsBenchmarkResult result; + result.threads = std::get<0>(it).threads; + result.runtime = std::get<0>(it).runtime / std::get<1>(it).runtime; + result.value = std::get<0>(it).value / std::get<1>(it).value; + result.metric = (gismo::metric)(std::get<0>(it).metric + gismo::metric::ratio); + results.push_back( give(result) ); + } + + return gsBenchmarkResultSet(label, descr, give(results) ); + } + + /// \brief Returns the ratio of the two given benchmark sets + gsBenchmarkSet ratio(const std::string& label, + const std::string& descr, + const gsBenchmarkSet objA, + const gsBenchmarkSet objB) + { + GISMO_ASSERT(objA.get().size() == objB.get().size(), + "Benchmark sets must have the same size"); + + std::vector results; + char id('A'); + + for (const auto& it : util::zip(objA.get(), objB.get())) { + results.push_back( give(benchmark::ratio(std::get<0>(it).get_label()+std::string(1,id++), + std::get<0>(it).get_descr(), + std::get<0>(it), + std::get<1>(it))) ); + } + + gsBenchmarkSet benchmark(label, descr, give(results) ); + return benchmark; + } +} + } // namespace gismo From aac53af76a43d82dd904398295714206c6f29e06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Wed, 5 Jan 2022 16:12:25 +0100 Subject: [PATCH 120/174] Updated performance benchmark application --- examples/performance_benchmark.cpp | 361 ++++++++++++++++------------- 1 file changed, 202 insertions(+), 159 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 24973952e9..7631159f7e 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -17,71 +17,16 @@ using namespace gismo; //! [Include namespace] -//! 
[Implement test creator] -template -void create_test(const std::string& label, - const Iterator& sizes, - const std::vector& nruns, - const std::vector& nthreads, - gsBenchmark& benchmark, - const std::string& extra_name="") -{ - gsInfo << "=== " << Test::name() << extra_name << "\n"; - auto bmark = benchmark.add(label, Test::name()+extra_name); - auto riter = nruns.begin(); - for (auto it : sizes) { - gsInfo << "... " << util::to_string(it) << "(" << *riter << ")"<< std::flush; - try { - Test test(it); - auto results = gsBenchmark::run(nthreads, *riter++, test, Test::metric()); - std::string meminfo; - uint64_t memsize = test.size(); - if (memsize<1024) - meminfo = util::to_string(memsize)+" B"; - else if (memsize<1024*1024) - meminfo = util::to_string(memsize/1024)+" KB"; - else if (memsize<1024*1024*1024) - meminfo = util::to_string(memsize/(1024*1024))+" MB"; - else - meminfo = util::to_string(memsize/(1024*1024*1024))+" GB"; - bmark->add(label, meminfo, results); - } catch(...) { gsInfo << "[failed!]"; } - gsInfo << "\n"; - } -} - -template -void create_test(const std::string& label, - const util::zip_helper& sizes, - const std::vector& nruns, - const std::vector& nthreads, - gsBenchmark& benchmark, - const std::string& extra_name="") +//! [Implement make_vector] +template +std::vector make_vector(T value, std::size_t size) { - gsInfo << "=== " << Test::name() << extra_name << "\n"; - auto bmark = benchmark.add(label, Test::name()+extra_name); - auto riter = nruns.begin(); - for (auto it : sizes) { - gsInfo << "... 
" << util::to_string(it) << "(" << *riter << ")"<< std::flush; - try { - Test test(it); - auto results = gsBenchmark::run(nthreads, *riter++, test, Test::metric()); - std::string meminfo; - uint64_t memsize = test.size(); - if (memsize<1024) - meminfo = util::to_string(memsize)+" B"; - else if (memsize<1024*1024) - meminfo = util::to_string(memsize/1024)+" KB"; - else if (memsize<1024*1024*1024) - meminfo = util::to_string(memsize/(1024*1024))+" MB"; - else - meminfo = util::to_string(memsize/(1024*1024*1024))+" GB"; - bmark->add(label, meminfo, results); - } catch(...) { gsInfo << "[failed!]"; } - gsInfo << "\n"; - } + std::vector v; + for (std::size_t i=0; i @@ -113,7 +58,7 @@ class benchmark_c_array_memcopy benchmark_c_array_memcopy(index_t n) : _msg(n), n(n), m_x(new T[n]), m_y(new T[n]) { -#pragma omp parallel for simd +#pragma omp parallel for simd schedule(static) for (index_t i=0; i -std::vector make_vector(T value, std::size_t size) -{ - std::vector v; - for (std::size_t i=0; i::name() << "\n" - << "#02: " << benchmark_eigen_memcopy::name() << "\n" - << "#03: " << benchmark_c_array_dotproduct::name() << "\n" - << "#04: " << benchmark_eigen_dotproduct::name() << "\n" - << "#05: " << benchmark_c_array_axpy::name() << "\n" - << "#06: " << benchmark_eigen_axpy::name() << "\n" - << "#07: " << benchmark_c_array_dense_matmul::name() << "\n" - << "#08: " << benchmark_eigen_dense_matmul::name() << "\n" - << "#09: " << benchmark_poisson2d_visitor::name() + << "#01: " << benchmark_c_array_memcopy::descr() << "\n" + << "#02: " << benchmark_eigen_memcopy::descr() << "\n" + << "#03: " << benchmark_c_array_dotproduct::descr() << "\n" + << "#04: " << benchmark_eigen_dotproduct::descr() << "\n" + << "#05: " << benchmark_c_array_axpy::descr() << "\n" + << "#06: " << benchmark_eigen_axpy::descr() << "\n" + << "#07: " << benchmark_c_array_dense_matmul::descr() << "\n" + << "#08: " << benchmark_eigen_dense_matmul::descr() << "\n" + << "#09: " << 
benchmark_poisson2d_visitor::descr() << " with increasing number of patches" << "\n" - << "#10: " << benchmark_poisson2d_visitor::name() + << "#10: " << benchmark_poisson2d_visitor::descr() << " with increasing number of subdivisions" << "\n" - << "#11: " << benchmark_poisson3d_visitor::name() + << "#11: " << benchmark_poisson3d_visitor::descr() << " with increasing number of patches" << "\n" - << "#12: " << benchmark_poisson3d_visitor::name() + << "#12: " << benchmark_poisson3d_visitor::descr() << " with increasing number of subdivisions" << "\n" - << "#13: " << benchmark_poisson2d_expression_assembler::name() + << "#13: " << benchmark_poisson2d_expression_assembler::descr() << " with increasing number of patches" << "\n" - << "#14: " << benchmark_poisson2d_expression_assembler::name() + << "#14: " << benchmark_poisson2d_expression_assembler::descr() << " with increasing number of subdivisions" << "\n"; return EXIT_SUCCESS; @@ -1003,120 +994,119 @@ int main(int argc, char *argv[]) case (1): { // Benchmark: memcopy native C arrays - create_test > - ("memcopyCarray", vsizes, nruns, nthreads, benchmark); + benchmark.create > + (vsizes, nruns, nthreads); break; } case (2): { // Benchmark: memcopy gsVector - create_test > - ("memcopyEigen", vsizes, nruns, nthreads, benchmark); + benchmark.create > + (vsizes, nruns, nthreads); break; } case (3): { // Benchmark: dot-product native C array - create_test > - ("dotproductCarray", vsizes, nruns, nthreads, benchmark); + benchmark.create > + (vsizes, nruns, nthreads); break; } case (4): { // Benchmark: dot-product gsVector - create_test > - ("dotproductEigen", vsizes, nruns, nthreads, benchmark); + benchmark.create > + (vsizes, nruns, nthreads); break; } case (5): { // Benchmark: axpy native C array - create_test > - ("axpyCarray", vsizes, nruns, nthreads, benchmark); + benchmark.create > + (vsizes, nruns, nthreads); break; } case (6): { // Benchmark: axpy gsVector - create_test > - ("axpyEigen", vsizes, nruns, nthreads, 
benchmark); + benchmark.create > + (vsizes, nruns, nthreads); break; } case (7): { // Benchmark: dense matrix-vector multiplication native C array - create_test > - ("densematmulCarray", msizes, nruns, nthreads, benchmark); + benchmark.create > + (msizes, nruns, nthreads); break; } case (8): { // Benchmark: dense matrix-vector multiplication gsMatrix/gsVector - create_test > - ("densematmulEigen", msizes, nruns, nthreads, benchmark); + benchmark.create > + (msizes, nruns, nthreads); break; } case (9): { // Benchmark: visitor-based Poisson 2d assembler with increasing number of patches - create_test > - ("assemblerVisitor", util::zip(patches, - make_vector((index_t)1, patches.size()), - make_vector((index_t)3, patches.size())), - nruns, nthreads, benchmark, " with increasing number of patches"); + benchmark.create > + (util::zip(patches, + make_vector((index_t)1, patches.size()), + make_vector((index_t)3, patches.size())), + nruns, nthreads, " with increasing number of patches"); break; } case (10): { // Benchmark: visitor-based Poisson 2d assembler with increasing number of subdivisions - create_test > - ("assemblerVisitor", util::zip(make_vector((index_t)4, subdivides.size()), - subdivides, - make_vector((index_t)3, subdivides.size())), - nruns, nthreads, benchmark, " with increasing number of subdivisions"); + benchmark.create > + (util::zip(make_vector((index_t)4, subdivides.size()), + subdivides, + make_vector((index_t)3, subdivides.size())), + nruns, nthreads, " with increasing number of subdivisions"); break; } case (11): { // Benchmark: visitor-based Poisson 3d assembler with increasing number of patches - create_test > - ("assemblerVisitor", util::zip(patches, - make_vector((index_t)0, patches.size()), - make_vector((index_t)1, patches.size())), - nruns, nthreads, benchmark, " with increasing number of patches"); + benchmark.create > + (util::zip(patches, + make_vector((index_t)0, patches.size()), + make_vector((index_t)1, patches.size())), + nruns, 
nthreads, " with increasing number of patches"); break; } case (12): { // Benchmark: visitor-based Poisson 3d assembler with increasing number of subdivisions - create_test > - ("assemblerVisitor", util::zip(make_vector((index_t)1, subdivides.size()), - subdivides, - make_vector((index_t)2, subdivides.size())), - nruns, nthreads, benchmark, " with increasing number of subdivisions"); + benchmark.create > + (util::zip(make_vector((index_t)1, subdivides.size()), + subdivides, + make_vector((index_t)2, subdivides.size())), + nruns, nthreads, " with increasing number of subdivisions"); break; } case (13): { // Benchmark: expression assembler-based Poisson 2d assembler with increasing number of patches - create_test > - ("assemblerExpressionAssembler", util::zip(patches, - make_vector((index_t)1, patches.size()), - make_vector((index_t)3, patches.size())), - nruns, nthreads, benchmark, " with increasing number of patches"); + benchmark.create > + (util::zip(patches, + make_vector((index_t)1, patches.size()), + make_vector((index_t)3, patches.size())), + nruns, nthreads, " with increasing number of patches"); break; } case (14): { // Benchmark: expression assembler-based Poisson 2d assembler with increasing number of subdivision - create_test > - ("assemblerExpressionAssembler", util::zip(make_vector((index_t)4, subdivides.size()), - subdivides, - make_vector((index_t)3, subdivides.size())), - nruns, nthreads, benchmark, " with increasing number of subdivisions"); + benchmark.create > + (util::zip(make_vector((index_t)4, subdivides.size()), + subdivides, + make_vector((index_t)3, subdivides.size())), + nruns, nthreads, " with increasing number of subdivisions"); break; } - default: GISMO_ERROR("Invalid benchmark"); @@ -1124,15 +1114,68 @@ int main(int argc, char *argv[]) } // benchmark loop + { // Memory copy ratio + auto bmA = benchmark.find(benchmark_c_array_memcopy::label()); + auto bmB = benchmark.find(benchmark_eigen_memcopy::label()); + + if (bmA != 
std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { + auto bm = benchmark::ratio("memcopyRatio", + "Memory copy (gsVector : native C array)", *bmB, *bmA); + benchmark.get().push_back( give(bm) ); + } + } + + { // Dot product ratio + auto bmA = benchmark.find(benchmark_c_array_dotproduct::label()); + auto bmB = benchmark.find(benchmark_eigen_dotproduct::label()); + + if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { + auto bm = benchmark::ratio("dotproductRatio", + "Dot product (gsVector : native C array)", *bmB, *bmA); + benchmark.get().push_back( give(bm) ); + } + } + + { // AXPY ratio + auto bmA = benchmark.find(benchmark_c_array_axpy::label()); + auto bmB = benchmark.find(benchmark_eigen_axpy::label()); + + if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { + auto bm = benchmark::ratio("axpyRatio", + "AXPY (gsVector : native C array)", *bmB, *bmA); + benchmark.get().push_back( give(bm) ); + } + } + + { // Dense matrix-vector multiplication ratio + auto bmA = benchmark.find(benchmark_c_array_dense_matmul::label()); + auto bmB = benchmark.find(benchmark_eigen_dense_matmul::label()); + + if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { + auto bm = benchmark::ratio("densematmulRatio", + "Dense matrix-vector multiplication (gsMatrix/gsVector : native C array)", + *bmB, *bmA); + benchmark.get().push_back( give(bm) ); + } + } + if (fn.empty()) gsInfo << benchmark << "\n"; - else { + else if (gsFileManager::getExtension(fn) == "tex") { std::ofstream file; file.open(fn); benchmark.to_tikz(file); file.close(); } + else if (gsFileManager::getExtension(fn) == "xml") { + gsFileData<> file; + file << benchmark; + file.save("result.xml"); + } + else { + GISMO_ERROR("Unsupported file extension"); + } //! 
[Execute benchmarks] - + return EXIT_SUCCESS; } From 571a9790701836fe7420100b7d44baac57fed0e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Mon, 10 Jan 2022 09:41:06 +0100 Subject: [PATCH 121/174] code cleanup --- extensions/gsXBraid/CMakeLists.txt | 2 +- src/gsIO/gsBenchmark.cpp | 8 +-- src/gsIO/gsBenchmark.h | 104 ++++++++++++++--------------- 3 files changed, 57 insertions(+), 57 deletions(-) diff --git a/extensions/gsXBraid/CMakeLists.txt b/extensions/gsXBraid/CMakeLists.txt index 146ace445d..2a490a97ff 100644 --- a/extensions/gsXBraid/CMakeLists.txt +++ b/extensions/gsXBraid/CMakeLists.txt @@ -129,7 +129,7 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR} DESTINATION include/gismo/gsXBraid FILES_MATCHING PATTERN "*.h") -# add filedata folder +# Add filedata folder add_definitions(-DXBRAID_DATA_DIR="${CMAKE_CURRENT_SOURCE_DIR}/filedata/") # Add example files diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index 136e530855..1c1bb41623 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -84,7 +84,7 @@ namespace gismo break; default: GISMO_ERROR("Unsupported metric"); - } + } } else { switch(metric & ~gismo::metric::speedup & ~gismo::metric::ratio) { case gismo::metric::bandwidth_kb_sec: @@ -227,7 +227,7 @@ namespace gismo if (results.size() == 0) return os; - + os << std::setw(8) << "memsize" << " | " << util::to_string(results.front().get().size()) @@ -296,7 +296,7 @@ namespace gismo GISMO_ERROR("Unsupported metric"); } } - + for (const auto& it : results) it.print(os); return os; @@ -308,5 +308,5 @@ namespace gismo it.print(os); return os; } - + } // namespace gismo diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index d2e5435809..17be951ac0 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -21,7 +21,7 @@ namespace gismo { /** @brief Enumerator that defines the benchmark metrics. 
- + These definitions are used to control the output of the benchmark framework */ enum metric : uint64_t { @@ -37,7 +37,7 @@ enum metric : uint64_t { perf_tflop_sec = 1 << 9, runtime_sec = 1 << 10 }; - + /** @brief Class that represents a single benchmark result @@ -59,7 +59,7 @@ class gsBenchmarkResult namespace internal { /// @brief Get a gsBenchmarkResult from XML data -template<> +template<> class gsXml< gsBenchmarkResult > { private: @@ -69,16 +69,16 @@ class gsXml< gsBenchmarkResult > GSXML_COMMON_FUNCTIONS(Object); static std::string tag () { return "BenchmarkResult"; } static std::string type () { return "BenchmarkResult"; } - + GSXML_GET_POINTER(Object); - + static void get_into (gsXmlNode * node, Object & obj) { gsXmlNode * child; child = node->first_node("threads"); if (child != NULL) obj.threads = atoi(child->value()); - + child = node->first_node("runtime"); if (child != NULL) obj.runtime = atof(child->value()); @@ -88,7 +88,7 @@ class gsXml< gsBenchmarkResult > child = node->first_node("metric"); if (child != NULL) obj.metric = (gismo::metric)atol(child->value()); } - + static gsXmlNode * put (const Object & obj, gsXmlTree & data ) { gsXmlNode * result = makeNode("BenchmarkResult", data); @@ -97,12 +97,12 @@ class gsXml< gsBenchmarkResult > result->append_node( makeNode("runtime", util::to_string(obj.runtime), data) ); result->append_node( makeNode("value", util::to_string(obj.value), data) ); result->append_node( makeNode("metric", util::to_string(obj.metric), data) ); - + return result; } }; } // namespace internal - + /** @brief Class that represents a set of benchmark results @@ -115,7 +115,7 @@ class gsBenchmarkResultSet public: /// \brief Default constructor gsBenchmarkResultSet() = default; - + /// \brief Constructor gsBenchmarkResultSet(const std::string& label, const std::string& descr, @@ -123,7 +123,7 @@ class gsBenchmarkResultSet : label(label), descr(descr), results( give(std::vector(results)) ) {} - + /// \brief Constructor 
gsBenchmarkResultSet(const std::string& label, const std::string& descr, @@ -131,7 +131,7 @@ class gsBenchmarkResultSet : label(label), descr(descr), results( give(results) ) {} - + /// \brief Returns the label const std::string& get_label() const { return label; } @@ -153,7 +153,7 @@ class gsBenchmarkResultSet /// \brief Pretty-prints the content std::ostream &print(std::ostream &os) const; - + private: std::string label, descr; std::vector results; @@ -166,7 +166,7 @@ inline std::ostream &operator<<(std::ostream &os, const gsBenchmarkResultSet& ob namespace internal { /// @brief Get a gsBenchmarkResultSet from XML data -template<> +template<> class gsXml< gsBenchmarkResultSet > { private: @@ -176,9 +176,9 @@ class gsXml< gsBenchmarkResultSet > GSXML_COMMON_FUNCTIONS(Object); static std::string tag () { return "BenchmarkResultSet"; } static std::string type () { return "BenchmarkResultSet"; } - + GSXML_GET_POINTER(Object); - + static void get_into (gsXmlNode * node, Object & obj) { gsXmlNode * child; @@ -186,7 +186,7 @@ class gsXml< gsBenchmarkResultSet > child = node->first_node("label"); if (child != NULL) label = child->value(); - + child = node->first_node("descr"); if (child != NULL) descr = child->value(); @@ -201,23 +201,23 @@ class gsXml< gsBenchmarkResultSet > obj = gsBenchmarkResultSet(label, descr, give(results)); } - + static gsXmlNode * put (const Object & obj, gsXmlTree & data ) { gsXmlNode * results = makeNode("BenchmarkResultSet", data); results->append_node( makeNode("label", obj.get_label(), data) ); results->append_node( makeNode("descr", obj.get_descr(), data) ); - + for (const auto& it : obj.get()) { results->append_node( gsXml< gsBenchmarkResult >::put(it, data) ); } - + return results; } }; } // namespace internal - + /** @brief Class that represents a collection of benchmark sets for a series of benchmark instances @@ -230,7 +230,7 @@ class gsBenchmarkSet public: /// \brief Default Constructor gsBenchmarkSet() = default; - + /// \brief 
Constructor gsBenchmarkSet(const std::string& label, const std::string& descr, @@ -246,7 +246,7 @@ class gsBenchmarkSet : label(label), descr(descr), results( give(results) ) {} - + /// \brief Returns the label const std::string& get_label() const { return label; } @@ -268,7 +268,7 @@ class gsBenchmarkSet /// \brief Pretty-prints the content std::ostream &print(std::ostream &os) const; - + private: std::string label, descr; std::vector results; @@ -281,7 +281,7 @@ inline std::ostream &operator<<(std::ostream &os, const gsBenchmarkSet& obj) namespace internal { /// @brief Get a gsBenchmarkSet from XML data -template<> +template<> class gsXml< gsBenchmarkSet > { private: @@ -291,22 +291,22 @@ class gsXml< gsBenchmarkSet > GSXML_COMMON_FUNCTIONS(Object); static std::string tag () { return "BenchmarkSet"; } static std::string type () { return "BenchmarkSet"; } - + GSXML_GET_POINTER(Object); - + static void get_into (gsXmlNode * node, Object & obj) { gsXmlNode * child; std::string label, descr; - + child = node->first_node("label"); if (child != NULL) label = child->value(); - + child = node->first_node("descr"); if (child != NULL) descr = child->value(); std::vector results; - + child = node->first_node(gsXml< gsBenchmarkResultSet >::tag().c_str()); for (; child; child = child->next_sibling() ) { gsBenchmarkResultSet _results; @@ -316,18 +316,18 @@ class gsXml< gsBenchmarkSet > obj = gsBenchmarkSet(label, descr, give(results) ); } - + static gsXmlNode * put (const Object & obj, gsXmlTree & data ) { gsXmlNode * results = makeNode("BenchmarkSet", data); results->append_node( makeNode("label", obj.get_label(), data) ); results->append_node( makeNode("descr", obj.get_descr(), data) ); - + for (const auto& it : obj.get()) { results->append_node( gsXml< gsBenchmarkResultSet >::put(it, data) ); } - + return results; } }; @@ -364,7 +364,7 @@ class GISMO_EXPORT gsBenchmark return it; return benchmarks.cend(); } - + /// \brief Creates a new benchmark set, adds it to the 
benchmark and /// returns a pointer to the benchmark set to the calling routine template @@ -377,10 +377,10 @@ class GISMO_EXPORT gsBenchmark gsInfo << "[" << Test::label() << "] " << Test::descr()+extra_descr << "\n"; - - std::vector results; + + std::vector results; char id('A'); - + auto riter = runs.begin(); for (const auto& it : sizes) { gsInfo << util::to_string(it) << "(" << *riter << ")"<< std::flush; @@ -404,11 +404,11 @@ class GISMO_EXPORT gsBenchmark } gsInfo << "\n"; - gsBenchmarkSet benchmark(Test::label(), Test::descr()+extra_descr, give(results) ); + gsBenchmarkSet benchmark(Test::label(), Test::descr()+extra_descr, give(results) ); benchmarks.push_back( give(benchmark) ); return benchmarks.back(); } - + private: /// \brief Runs the benchmark instance \a benchmark for the /// specified number of \a threads and \a runs and returns an \a @@ -422,7 +422,7 @@ class GISMO_EXPORT gsBenchmark gsStopwatch stopwatch; uint64_t result(0); real_t value, runtime; - + try { for (const auto& it : threads) { @@ -492,7 +492,7 @@ inline std::ostream &operator<<(std::ostream &os, const gsBenchmark& obj) namespace internal { /// @brief Get a gsBenchmark from XML data -template<> +template<> class gsXml< gsBenchmark > { private: @@ -502,13 +502,13 @@ class gsXml< gsBenchmark > GSXML_COMMON_FUNCTIONS(Object); static std::string tag () { return "Benchmark"; } static std::string type () { return "Benchmark"; } - + GSXML_GET_POINTER(Object); - + static void get_into (gsXmlNode * node, Object & obj) { gsXmlNode * child; - + child = node->first_node(gsXml< gsBenchmarkSet >::tag().c_str()); for (; child; child = child->next_sibling() ) { gsBenchmarkSet benchmark; @@ -516,7 +516,7 @@ class gsXml< gsBenchmark > obj.get().push_back( give(benchmark) ); } } - + static gsXmlNode * put (const Object & obj, gsXmlTree & data ) { gsXmlNode * results = makeNode("Benchmark", data); @@ -524,7 +524,7 @@ class gsXml< gsBenchmark > for (const auto& it : obj.get()) { results->append_node( 
gsXml< gsBenchmarkSet >::put(it, data) ); } - + return results; } }; @@ -540,7 +540,7 @@ namespace benchmark { { GISMO_ASSERT(objA.get().size() == objB.get().size(), "Benchmark result sets must have the same size"); - + std::vector results; for (const auto& it : util::zip(objA.get(), objB.get())) { gsBenchmarkResult result; @@ -550,7 +550,7 @@ namespace benchmark { result.metric = (gismo::metric)(std::get<0>(it).metric + gismo::metric::ratio); results.push_back( give(result) ); } - + return gsBenchmarkResultSet(label, descr, give(results) ); } @@ -562,20 +562,20 @@ namespace benchmark { { GISMO_ASSERT(objA.get().size() == objB.get().size(), "Benchmark sets must have the same size"); - + std::vector results; char id('A'); - + for (const auto& it : util::zip(objA.get(), objB.get())) { results.push_back( give(benchmark::ratio(std::get<0>(it).get_label()+std::string(1,id++), std::get<0>(it).get_descr(), std::get<0>(it), std::get<1>(it))) ); } - + gsBenchmarkSet benchmark(label, descr, give(results) ); return benchmark; - } + } } } // namespace gismo From 727142a4fb4810506fefc6b1bb1bb30678c3d558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Mon, 10 Jan 2022 09:41:27 +0100 Subject: [PATCH 122/174] more performance benchmarks --- examples/performance_benchmark.cpp | 189 ++++++++++++++++++++++++----- 1 file changed, 156 insertions(+), 33 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 7631159f7e..ec29f998e0 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -703,7 +703,7 @@ class benchmark_poisson3d_visitor // // The factor 1.33 is used because Eigen shows better performance // if 33% more memory is allocated during the step-by-step assembly - return sizeof(T) * (numPatches * ((1< +class benchmark_poisson3d_expression_assembler +{ +private: + memory_safeguard _msg; + int numPatches, numRefine, degree; + gsMultiPatch geo; + gsMultiBasis bases; + 
gsBoundaryConditions bc; + + gsExprAssembler A; + typename gsExprAssembler<>::geometryMap G; + typename gsExprAssembler<>::space u; + + gsFunctionExpr f; + expr::gsComposition ff; + +public: + template + benchmark_poisson3d_expression_assembler(std::tuple args) + : benchmark_poisson3d_expression_assembler(std::get<0>(args), std::get<1>(args), std::get<2>(args)) + {} + + benchmark_poisson3d_expression_assembler(int numPatches, int numRefine=0, int degree=1) + : _msg(numPatches, numRefine, degree), + numPatches(numPatches), numRefine(numRefine), degree(degree), + geo(gsNurbsCreator<>::BSplineCubeGrid(numPatches, numPatches, numPatches, 1.0)), + bases(geo, true), A(1,1), G(A.getMap(geo)), u(A.getSpace(bases)), + f("0.0", 3), ff(A.getCoeff(f, G)) + { + // h-refine each basis + for (int i = 0; i < numRefine; ++i) + bases.uniformRefine(); + + // k-refinement (set degree) + for (std::size_t i = 0; i < bases.nBases(); ++ i) + bases[i].setDegreePreservingMultiplicity(degree); + + // set the geometry map to boundary conditions + bc.setGeoMap(geo); + + // setup boundary conditions + u.setup(bc, dirichlet::l2Projection, 0); + + // set elements used for numerical integration + A.setIntegrationElements(bases); + + // initialize the system + A.initSystem(); + } + + uint64_t operator()() + { + // Compute the system matrix and right-hand side + A.assemble( + igrad(u, G) * igrad(u, G).tr() * meas(G) //matrix + , + u * ff * meas(G) //rhs vector + ); + + return sizeof(T) * (A.matrix().nonZeros() + A.rhs().rows()); + } + + constexpr uint64_t size() const + { + return size(numPatches, numRefine, degree); + } + + static constexpr uint64_t size(index_t numPatches, index_t numRefine, index_t degree) + { + // Estimated memory + // system matrix : 1.33 * ndofs * (2*p+1)^3 + // r.h.s. 
vector : ndofs + // + // The factor 1.33 is used because Eigen shows better performance + // if 33% more memory is allocated during the step-by-step assembly + return sizeof(T) * 1.33 * (numPatches * ((1< benchmarks, msizes, nruns, nthreads, patches, subdivides, vsizes; index_t msizemin = 10; index_t nrunsmax = 100; @@ -894,7 +991,8 @@ int main(int argc, char *argv[]) cmd.addMultiInt("v", "vsizes", "Number of unknowns in vector benchmarks (auto-generated if not given)", vsizes); cmd.addString("o", "output", "Name of the output file", fn); cmd.addSwitch("list", "List all benchmarks and exit", list); - + cmd.addSwitch("all", "Run all benchmarks", all); + try { cmd.getValues(argc,argv); } catch (int rv) { return rv; } //! [Parse command line] @@ -920,6 +1018,10 @@ int main(int argc, char *argv[]) << "#13: " << benchmark_poisson2d_expression_assembler::descr() << " with increasing number of patches" << "\n" << "#14: " << benchmark_poisson2d_expression_assembler::descr() + << " with increasing number of subdivisions" << "\n" + << "#15: " << benchmark_poisson3d_expression_assembler::descr() + << " with increasing number of patches" << "\n" + << "#16: " << benchmark_poisson3d_expression_assembler::descr() << " with increasing number of subdivisions" << "\n"; return EXIT_SUCCESS; @@ -928,8 +1030,9 @@ int main(int argc, char *argv[]) //! [Default configuration] // If empty fill with all benchmarks 1, 2, ... 
- if (benchmarks.empty()) { - for(index_t i=1; i<=8; ++i) + if (all) { + benchmarks.clear(); + for(index_t i=1; i<=16; ++i) benchmarks.push_back(i); } @@ -1052,19 +1155,19 @@ int main(int argc, char *argv[]) // Benchmark: visitor-based Poisson 2d assembler with increasing number of patches benchmark.create > (util::zip(patches, - make_vector((index_t)1, patches.size()), - make_vector((index_t)3, patches.size())), - nruns, nthreads, " with increasing number of patches"); + make_vector((index_t)0, patches.size()), // subdivisions : 0 + make_vector((index_t)3, patches.size())), // degree : 3 + nruns, nthreads, " with increasing number of patches (#subdivisions=0, degree=3)"); break; } case (10): { // Benchmark: visitor-based Poisson 2d assembler with increasing number of subdivisions benchmark.create > - (util::zip(make_vector((index_t)4, subdivides.size()), + (util::zip(make_vector((index_t)1, subdivides.size()), // patches : 1 subdivides, - make_vector((index_t)3, subdivides.size())), - nruns, nthreads, " with increasing number of subdivisions"); + make_vector((index_t)3, subdivides.size())), // degree : 3 + nruns, nthreads, " with increasing number of subdivisions (#patches=1, degree=3)"); break; } @@ -1072,19 +1175,19 @@ int main(int argc, char *argv[]) // Benchmark: visitor-based Poisson 3d assembler with increasing number of patches benchmark.create > (util::zip(patches, - make_vector((index_t)0, patches.size()), - make_vector((index_t)1, patches.size())), - nruns, nthreads, " with increasing number of patches"); + make_vector((index_t)0, patches.size()), // subdivisions : 0 + make_vector((index_t)2, patches.size())), // degree : 2 + nruns, nthreads, " with increasing number of patches (#subdivisions=0, degree=2)"); break; } case (12): { // Benchmark: visitor-based Poisson 3d assembler with increasing number of subdivisions benchmark.create > - (util::zip(make_vector((index_t)1, subdivides.size()), + (util::zip(make_vector((index_t)1, subdivides.size()), // 
patches : 1 subdivides, - make_vector((index_t)2, subdivides.size())), - nruns, nthreads, " with increasing number of subdivisions"); + make_vector((index_t)2, subdivides.size())), // degree : 2 + nruns, nthreads, " with increasing number of subdivisions (#patches=1, degree=2)"); break; } @@ -1092,19 +1195,39 @@ int main(int argc, char *argv[]) // Benchmark: expression assembler-based Poisson 2d assembler with increasing number of patches benchmark.create > (util::zip(patches, - make_vector((index_t)1, patches.size()), - make_vector((index_t)3, patches.size())), - nruns, nthreads, " with increasing number of patches"); + make_vector((index_t)0, patches.size()), // subdivisions : 0 + make_vector((index_t)3, patches.size())), // degree : 3 + nruns, nthreads, " with increasing number of patches (#subdivisions=0, degree=3)"); break; } case (14): { // Benchmark: expression assembler-based Poisson 2d assembler with increasing number of subdivision benchmark.create > - (util::zip(make_vector((index_t)4, subdivides.size()), + (util::zip(make_vector((index_t)1, subdivides.size()), // patches : 1 + subdivides, + make_vector((index_t)3, subdivides.size())), // degree : 3 + nruns, nthreads, " with increasing number of subdivisions (#patches=1, degree=3)"); + break; + } + + case (15): { + // Benchmark: expression assembler-based Poisson 3d assembler with increasing number of patches + benchmark.create > + (util::zip(patches, + make_vector((index_t)0, patches.size()), // subdivisions : 0 + make_vector((index_t)2, patches.size())), // degree : 2 + nruns, nthreads, " with increasing number of patches (#subdivisions=0, degree=2)"); + break; + } + + case (16): { + // Benchmark: expression assembler-based Poisson 3d assembler with increasing number of subdivision + benchmark.create > + (util::zip(make_vector((index_t)1, subdivides.size()), // patches : 1 subdivides, - make_vector((index_t)3, subdivides.size())), - nruns, nthreads, " with increasing number of subdivisions"); + 
make_vector((index_t)2, subdivides.size())), // degree : 2 + nruns, nthreads, " with increasing number of subdivisions (#patches=1, degree=2)"); break; } From 57f1f6e1e1e6802e255d31b8126a30ddf0f36e26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Mon, 10 Jan 2022 10:02:03 +0100 Subject: [PATCH 123/174] fixed duplicate symbol error --- src/gsIO/gsBenchmark.cpp | 46 ++++++++++++++++++++++++++++++++++++++++ src/gsIO/gsBenchmark.h | 41 ++++------------------------------- 2 files changed, 50 insertions(+), 37 deletions(-) diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index 1c1bb41623..84d294413b 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -309,4 +309,50 @@ namespace gismo return os; } +namespace benchmark { + + gsBenchmarkResultSet ratio(const std::string& label, + const std::string& descr, + const gsBenchmarkResultSet objA, + const gsBenchmarkResultSet objB) + { + GISMO_ASSERT(objA.get().size() == objB.get().size(), + "Benchmark result sets must have the same size"); + + std::vector results; + for (const auto& it : util::zip(objA.get(), objB.get())) { + gsBenchmarkResult result; + result.threads = std::get<0>(it).threads; + result.runtime = std::get<0>(it).runtime / std::get<1>(it).runtime; + result.value = std::get<0>(it).value / std::get<1>(it).value; + result.metric = (gismo::metric)(std::get<0>(it).metric + gismo::metric::ratio); + results.push_back( give(result) ); + } + + return gsBenchmarkResultSet(label, descr, give(results) ); + } + + gsBenchmarkSet ratio(const std::string& label, + const std::string& descr, + const gsBenchmarkSet objA, + const gsBenchmarkSet objB) + { + GISMO_ASSERT(objA.get().size() == objB.get().size(), + "Benchmark sets must have the same size"); + + std::vector results; + char id('A'); + + for (const auto& it : util::zip(objA.get(), objB.get())) { + results.push_back( give(benchmark::ratio(std::get<0>(it).get_label()+std::string(1,id++), + std::get<0>(it).get_descr(), 
+ std::get<0>(it), + std::get<1>(it))) ); + } + + gsBenchmarkSet benchmark(label, descr, give(results) ); + return benchmark; + } +} // namespace benchmark + } // namespace gismo diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index 17be951ac0..b60acdaa06 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -536,46 +536,13 @@ namespace benchmark { gsBenchmarkResultSet ratio(const std::string& label, const std::string& descr, const gsBenchmarkResultSet objA, - const gsBenchmarkResultSet objB) - { - GISMO_ASSERT(objA.get().size() == objB.get().size(), - "Benchmark result sets must have the same size"); - - std::vector results; - for (const auto& it : util::zip(objA.get(), objB.get())) { - gsBenchmarkResult result; - result.threads = std::get<0>(it).threads; - result.runtime = std::get<0>(it).runtime / std::get<1>(it).runtime; - result.value = std::get<0>(it).value / std::get<1>(it).value; - result.metric = (gismo::metric)(std::get<0>(it).metric + gismo::metric::ratio); - results.push_back( give(result) ); - } - - return gsBenchmarkResultSet(label, descr, give(results) ); - } + const gsBenchmarkResultSet objB); - /// \brief Returns the ratio of the two given benchmark sets + /// \brief Returns the ratio of the two given benchmark sets gsBenchmarkSet ratio(const std::string& label, const std::string& descr, const gsBenchmarkSet objA, - const gsBenchmarkSet objB) - { - GISMO_ASSERT(objA.get().size() == objB.get().size(), - "Benchmark sets must have the same size"); - - std::vector results; - char id('A'); - - for (const auto& it : util::zip(objA.get(), objB.get())) { - results.push_back( give(benchmark::ratio(std::get<0>(it).get_label()+std::string(1,id++), - std::get<0>(it).get_descr(), - std::get<0>(it), - std::get<1>(it))) ); - } - - gsBenchmarkSet benchmark(label, descr, give(results) ); - return benchmark; - } -} + const gsBenchmarkSet objB); +} // namespace benchmark } // namespace gismo From 28ba25cb419d16f5ef8d9aae8e7481087be814a9 Mon 
Sep 17 00:00:00 2001 From: Angelos Mantzaflaris Date: Tue, 11 Jan 2022 11:50:20 +0100 Subject: [PATCH 124/174] small fix --- src/gsIO/gsBenchmark.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index b60acdaa06..5dda7f6ca4 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -373,7 +373,7 @@ class GISMO_EXPORT gsBenchmark const std::vector & threads, const std::string & extra_descr="") { - GISMO_ASSERT(sizes.size()==runs.size(), "Problem sizes and number of runs must have the same length"); + //GISMO_ASSERT(sizes.size()==runs.size(), "Problem sizes and number of runs must have the same length"); gsInfo << "[" << Test::label() << "] " << Test::descr()+extra_descr << "\n"; @@ -533,16 +533,16 @@ class gsXml< gsBenchmark > namespace benchmark { /// \brief Returns the ratio of the two given benchmark result sets - gsBenchmarkResultSet ratio(const std::string& label, - const std::string& descr, - const gsBenchmarkResultSet objA, - const gsBenchmarkResultSet objB); + GISMO_EXPORT gsBenchmarkResultSet ratio(const std::string& label, + const std::string& descr, + const gsBenchmarkResultSet objA, + const gsBenchmarkResultSet objB); /// \brief Returns the ratio of the two given benchmark sets - gsBenchmarkSet ratio(const std::string& label, - const std::string& descr, - const gsBenchmarkSet objA, - const gsBenchmarkSet objB); + GISMO_EXPORT gsBenchmarkSet ratio(const std::string& label, + const std::string& descr, + const gsBenchmarkSet objA, + const gsBenchmarkSet objB); } // namespace benchmark } // namespace gismo From 2a81fc72e9a336178720c146214ecae02c759db0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 11 Jan 2022 14:41:24 +0100 Subject: [PATCH 125/174] fixed small bug --- src/gsCore/gsConfig.h.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gsCore/gsConfig.h.in b/src/gsCore/gsConfig.h.in index 
ed4e21315f..a683c1fd65 100644 --- a/src/gsCore/gsConfig.h.in +++ b/src/gsCore/gsConfig.h.in @@ -37,7 +37,7 @@ /** Define default dimension type. */ #define GISMO_SHORT_TYPE @GISMO_SHORT_TYPE@ #ifndef short_t -#define short_t GISMO_INDEX_TYPE +#define short_t GISMO_SHORT_TYPE #endif /** Define the file data directory. */ From f8461959ee0c885f82098961d813f446192e973b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 11 Jan 2022 16:41:41 +0100 Subject: [PATCH 126/174] small bug in Circle CI configuration --- .circleci/config.yml | 39 +++++++++++++++++++++++++++++---------- README.md | 4 ++-- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a0ad125fd1..910f426930 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,7 +1,7 @@ version: 2.0 jobs: - macos_x86_64_xcode10_cxx11_release: + x86_64:macos:xcode-10.3:cxx11:release: macos: xcode: "10.3.0" working_directory: ~/gismo @@ -14,14 +14,14 @@ jobs: - checkout - run: name: Configure G+Smo on MacOS - command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="macos_x86_64_xcode11_cxx11_release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=11 -DGISMO_WITH_ONURBS=ON + command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="x86_64:maxos:xcode-11.3:cxx11:release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=11 -DGISMO_WITH_ONURBS=ON - run: name: Build and test G+Smo on MacOS command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS - macos_x86_64_xcode12_cxx14_release: + x86_64:macos:xcode-11.7:cxx14:release: macos: - xcode: "12.5.1" + xcode: "11.7.0" working_directory: ~/gismo environment: MAKEJOBS: 4 @@ -32,12 +32,12 @@ jobs: - checkout - run: name: Configure G+Smo on MacOS - command: cmake . 
-DCMAKE_QUIET=ON -DBUILDNAME="macos_x86_64_xcode12_cxx14_release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=14 -DGISMO_WITH_ONURBS=ON + command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="x86_64:macos:xcod1.7:cxx14:release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=14 -DGISMO_WITH_ONURBS=ON - run: name: Build and test G+Smo on MacOS command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS - macos_x86_64_xcode13_cxx17_release: + x86_64:macos:xcode-12.5:cxx17:release: macos: xcode: "12.5.1" working_directory: ~/gismo @@ -50,15 +50,34 @@ jobs: - checkout - run: name: Configure G+Smo on MacOS - command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="macos_x86_64_xcode13_cxx17_release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=17 -DGISMO_WITH_ONURBS=ON + command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="x86_64:macos:xcode-12.5:cxx17:release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=17 -DGISMO_WITH_ONURBS=ON - run: name: Build and test G+Smo on MacOS command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS + x86_64:macos:xcode-13.2:cxx20:release: + macos: + xcode: "13.2.1" + working_directory: ~/gismo + environment: + MAKEJOBS: 4 + steps: + - run: + name: Install dependencies + command: brew install cmake + - checkout + - run: + name: Configure G+Smo on MacOS + command: cmake . 
-DCMAKE_QUIET=ON -DBUILDNAME="x86_64:macos:xcode-13.2:cxx20:release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=20 -DGISMO_WITH_ONURBS=ON + - run: + name: Build and test G+Smo on MacOS + command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS + workflows: version: 2 build: jobs: - - macos_x86_64_xcode10_cxx11_release - - macos_x86_64_xcode11_cxx14_release - - macos_x86_64_xcode12_cxx17_release + - x86_64:macos:xcode-10.3:cxx11:release + - x86_64:macos:xcode-11.7:cxx14:release + - x86_64:macos:xcode-12.5:cxx17:release + - x86_64:macos:xcode-13.2:cxx20:release diff --git a/README.md b/README.md index b85fe0dd5e..4ccc6bdeb0 100644 --- a/README.md +++ b/README.md @@ -19,11 +19,11 @@ |------------|------------|----------------------| | [CDash](https://cdash-ci.inria.fr/index.php?project=Gismo) | [![cdash](https://img.shields.io/website?down_color=lightgrey&down_message=offline&label=CDash&up_color=green&up_message=up&url=https%3A%2F%2Fcdash-ci.inria.fr%2Findex.php%3Fproject%3DGismo)](https://cdash-ci.inria.fr/index.php?project=Gismo) | Report results from all builds | | [Appveyor](https://ci.appveyor.com/project/gismo/gismo) | [![Appveyor status](https://ci.appveyor.com/api/projects/status/abps59xbt1gjwci1/branch/stable?svg=true)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[appVeyor]) | Windows MSVC 14.0 | -| [Circle CI](https://circleci.com/gh/gismo/gismo) | [![Circle CI](https://circleci.com/gh/gismo/gismo.svg?style=svg)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[cci]) | MacOS XCode9-12 | +| [Circle CI](https://circleci.com/gh/gismo/gismo) | [![Circle CI](https://circleci.com/gh/gismo/gismo.svg?style=svg)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[cci]) | MacOS XCode 10-13 | | 
[Codeship](https://app.codeship.com/projects/123289) | [![Codeship Status](https://app.codeship.com/projects/2aa19360-8998-0133-39fd-66416d65b267/status?branch=stable)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[codeship]) | | | [GitLab](https://gitlab.com/gismo-ci/gismo/-/pipelines) | [![pipeline status](https://gitlab.com/gismo-ci/gismo/badges/gitlab_ci/pipeline.svg)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[gitlab-ci]) | Linux non-default configurations | | [GitHub Actions](https://github.com/gismo/gismo/actions) | [![Build Status](https://github.com/gismo/gismo/workflows/gismo/badge.svg?branch=stable)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[actions]) | Latest Linux/MacOS/Windows | -| [Jenkins](https://ci.inria.fr/gismo/job/gismo/job/gismo/job/stable) | [![Build Status](https://ci.inria.fr/gismo/buildStatus/icon?job=gismo%2Fgismo%2Fstable)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[jenkins]) |VMs for Linux/MacOS/Windows | +| [Jenkins](https://ci.inria.fr/gismo/job/gismo/job/gismo/job/stable) | [![Build Status](https://ci.inria.fr/gismo/buildStatus/icon?job=gismo%2Fgismo%2Fstable)](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[jenkins]) | VMs for Linux/MacOS/Windows | | GCC Farm | [Status](https://cdash-ci.inria.fr/index.php?project=Gismo&filtercount=1&field1=site&compare1=63&value1=[gccfarm]) | Builders from the GCC Farm | | [OBS](https://build.opensuse.org/package/show/home:filiatra/gismo) | [binaries](https://software.opensuse.org/download/package?project=home:filiatra&package=gismo) | Upstream package builds for many Linux distributions | | [Launchpad](https://code.launchpad.net/~g+smo/+recipe/g+smo-daily) |[binaries](https://launchpad.net/~g+smo/+archive/ubuntu/upstream/+packages) | Upstream 
package builds for Ubuntu distributions | From d0675ef017e1deae7786327d3e369ec0bf1f330e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 11 Jan 2022 16:55:58 +0100 Subject: [PATCH 127/174] Aarch64 builds in Circle CI configuration --- .circleci/config.yml | 51 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 910f426930..ebb343f305 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,6 +1,11 @@ -version: 2.0 +version: 2.1 jobs: + + ### + ### XCode 10.3.0, macOS 10.14.4 (Mojave), x86_64 + ### + x86_64:macos:xcode-10.3:cxx11:release: macos: xcode: "10.3.0" @@ -19,6 +24,10 @@ jobs: name: Build and test G+Smo on MacOS command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS + ### + ### XCode 11.7.0, macOS 10.15.5 (Catalina), x86_64 + ### + x86_64:macos:xcode-11.7:cxx14:release: macos: xcode: "11.7.0" @@ -37,6 +46,10 @@ jobs: name: Build and test G+Smo on MacOS command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS + ### + ### XCode 12.5.1, macOS 11.4.0 (Big Sur), x86_64 + ### + x86_64:macos:xcode-12.5:cxx17:release: macos: xcode: "12.5.1" @@ -55,6 +68,10 @@ jobs: name: Build and test G+Smo on MacOS command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS + ### + ### XCode 13.2.1, macOS 11.6.2 (Big Sur), x86_64 + ### + x86_64:macos:xcode-13.2:cxx20:release: macos: xcode: "13.2.1" @@ -73,11 +90,35 @@ jobs: name: Build and test G+Smo on MacOS command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS + ### + ### GCC-latest, linux Ubuntu 20.04, aarch64 + ### + + aarch64:linux:gcc-latest:cxx11:release: + machine: + image: ubuntu-2004:202101-01 + resource_class: arm.medium + working_directory: ~/gismo + environment: + MAKEJOBS: 4 + steps: + - run: + name: Install dependencies + command: apt-get 
update -y && apt-get install cmake gcc g++ + - checkout + - run: + name: Configure G+Smo on Linux + command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="aarch64:linux:gcc-latest:cxx11:release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=11 -DGISMO_WITH_ONURBS=ON + - run: + name: Build and test G+Smo on Linux + command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS + workflows: version: 2 build: jobs: - - x86_64:macos:xcode-10.3:cxx11:release - - x86_64:macos:xcode-11.7:cxx14:release - - x86_64:macos:xcode-12.5:cxx17:release - - x86_64:macos:xcode-13.2:cxx20:release + - aarch64:linux:gcc-latest:cxx11:release +# - x86_64:macos:xcode-10.3:cxx11:release +# - x86_64:macos:xcode-11.7:cxx14:release +# - x86_64:macos:xcode-12.5:cxx17:release +# - x86_64:macos:xcode-13.2:cxx20:release From 7cdec6f3f6332f9aab8c3706cbcab84c932d7401 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 11 Jan 2022 17:01:34 +0100 Subject: [PATCH 128/174] Updated config.yml --- .circleci/config.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ebb343f305..8ba2460d4f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,7 +6,7 @@ jobs: ### XCode 10.3.0, macOS 10.14.4 (Mojave), x86_64 ### - x86_64:macos:xcode-10.3:cxx11:release: + x86_64_macos_xcode-103_cxx11_release: macos: xcode: "10.3.0" working_directory: ~/gismo @@ -28,7 +28,7 @@ jobs: ### XCode 11.7.0, macOS 10.15.5 (Catalina), x86_64 ### - x86_64:macos:xcode-11.7:cxx14:release: + x86_64_macos_xcode-117_cxx14_release: macos: xcode: "11.7.0" working_directory: ~/gismo @@ -50,7 +50,7 @@ jobs: ### XCode 12.5.1, macOS 11.4.0 (Big Sur), x86_64 ### - x86_64:macos:xcode-12.5:cxx17:release: + x86_64_macos_xcode-125_cxx17_release: macos: xcode: "12.5.1" working_directory: ~/gismo @@ -72,7 +72,7 @@ jobs: ### XCode 13.2.1, macOS 
11.6.2 (Big Sur), x86_64 ### - x86_64:macos:xcode-13.2:cxx20:release: + x86_64_macos_xcode-132_cxx20_release: macos: xcode: "13.2.1" working_directory: ~/gismo @@ -94,7 +94,7 @@ jobs: ### GCC-latest, linux Ubuntu 20.04, aarch64 ### - aarch64:linux:gcc-latest:cxx11:release: + aarch64_linux_gcc-latest_cxx11_release: machine: image: ubuntu-2004:202101-01 resource_class: arm.medium @@ -117,7 +117,7 @@ workflows: version: 2 build: jobs: - - aarch64:linux:gcc-latest:cxx11:release + - aarch64_linux_gcc-latest_cxx11_release # - x86_64:macos:xcode-10.3:cxx11:release # - x86_64:macos:xcode-11.7:cxx14:release # - x86_64:macos:xcode-12.5:cxx17:release From 1ae907a334788f978fa9e60f16414aad9eeca418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 11 Jan 2022 17:04:32 +0100 Subject: [PATCH 129/174] Aarch64 builds in Circle CI configuration --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8ba2460d4f..d5c6d18237 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -104,7 +104,7 @@ jobs: steps: - run: name: Install dependencies - command: apt-get update -y && apt-get install cmake gcc g++ + command: apt-get update -y && apt-get install cmake gcc g++ -y - checkout - run: name: Configure G+Smo on Linux From aad67726abbf50afb1ec6ef547997dc0c9be74d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 11 Jan 2022 17:08:55 +0100 Subject: [PATCH 130/174] Aarch64 builds in Circle CI configuration --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index d5c6d18237..bc6b1201eb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -104,7 +104,7 @@ jobs: steps: - run: name: Install dependencies - command: apt-get update -y && apt-get install cmake gcc g++ -y + command: sudo apt-get update -y && sudo apt-get install cmake gcc g++ -y - checkout - run: 
name: Configure G+Smo on Linux From cc7311715554252fc35fe59d794f36df432608a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 11 Jan 2022 18:42:41 +0100 Subject: [PATCH 131/174] Cleanup Circle CI configuration --- .circleci/config.yml | 78 +++++++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bc6b1201eb..fefe9da46f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -3,10 +3,10 @@ version: 2.1 jobs: ### - ### XCode 10.3.0, macOS 10.14.4 (Mojave), x86_64 + ### macOS 10.14.4 (Mojave), XCode 10.3.0 (x86_64) ### - - x86_64_macos_xcode-103_cxx11_release: + + macos_x_64_86_xcode-103_cxx11_release: macos: xcode: "10.3.0" working_directory: ~/gismo @@ -19,16 +19,16 @@ jobs: - checkout - run: name: Configure G+Smo on MacOS - command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="x86_64:maxos:xcode-11.3:cxx11:release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=11 -DGISMO_WITH_ONURBS=ON + command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="MacOS-x86_64-XCode10.3-cxx11-Release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=11 -DGISMO_WITH_ONURBS=ON - run: name: Build and test G+Smo on MacOS command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS ### - ### XCode 11.7.0, macOS 10.15.5 (Catalina), x86_64 + ### macOS 10.15.5 (Catalina), XCode 11.7.0 (x86_64) ### - - x86_64_macos_xcode-117_cxx14_release: + + macos_x86_64_xcode-117_cxx14_release: macos: xcode: "11.7.0" working_directory: ~/gismo @@ -41,16 +41,16 @@ jobs: - checkout - run: name: Configure G+Smo on MacOS - command: cmake . 
-DCMAKE_QUIET=ON -DBUILDNAME="x86_64:macos:xcod1.7:cxx14:release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=14 -DGISMO_WITH_ONURBS=ON + command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="MacOS-x86_64-XCode1.7-cxx14-Release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=14 -DGISMO_WITH_ONURBS=ON - run: name: Build and test G+Smo on MacOS command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS ### - ### XCode 12.5.1, macOS 11.4.0 (Big Sur), x86_64 + ### macOS 11.4.0 (Big Sur), XCode 12.5.1 (x86_64) ### - - x86_64_macos_xcode-125_cxx17_release: + + macos_x86_64_xcode-125_cxx17_release: macos: xcode: "12.5.1" working_directory: ~/gismo @@ -63,16 +63,16 @@ jobs: - checkout - run: name: Configure G+Smo on MacOS - command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="x86_64:macos:xcode-12.5:cxx17:release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=17 -DGISMO_WITH_ONURBS=ON + command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="MacOS-x86_64-XCode-12.5-cxx17-Release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=17 -DGISMO_WITH_ONURBS=ON - run: name: Build and test G+Smo on MacOS command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS ### - ### XCode 13.2.1, macOS 11.6.2 (Big Sur), x86_64 + ### macOS 11.6.2 (Big Sur), XCode 13.2.1 (x86_64) ### - - x86_64_macos_xcode-132_cxx20_release: + + macos_x86_64_xcode-132_cxx20_release: macos: xcode: "13.2.1" working_directory: ~/gismo @@ -85,16 +85,16 @@ jobs: - checkout - run: name: Configure G+Smo on MacOS - command: cmake . 
-DCMAKE_QUIET=ON -DBUILDNAME="x86_64:macos:xcode-13.2:cxx20:release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=20 -DGISMO_WITH_ONURBS=ON + command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="MacOS-x86_64-XCode-13.2-cxx20-Release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=20 -DGISMO_WITH_ONURBS=ON - run: name: Build and test G+Smo on MacOS - command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS + command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS ### - ### GCC-latest, linux Ubuntu 20.04, aarch64 + ### Ubuntu 20.04 Linux, GCC 9.3 (aarch64) ### - aarch64_linux_gcc-latest_cxx11_release: + linux_aarch64_gcc9_cxx11_release: machine: image: ubuntu-2004:202101-01 resource_class: arm.medium @@ -108,17 +108,41 @@ jobs: - checkout - run: name: Configure G+Smo on Linux - command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="aarch64:linux:gcc-latest:cxx11:release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=11 -DGISMO_WITH_ONURBS=ON + command: cmake . 
-DCMAKE_QUIET=ON -DBUILDNAME="Linux-aarch64-gcc9-cxx11-Release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=11 -DGISMO_WITH_ONURBS=ON - run: name: Build and test G+Smo on Linux command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS - + + ### + ### Ubuntu 20.04 Linux, Clang 10 (aarch64) + ### + + linux_aarch64_clang10_cxx11_release: + machine: + image: ubuntu-2004:202101-01 + resource_class: arm.medium + working_directory: ~/gismo + environment: + MAKEJOBS: 4 + steps: + - run: + name: Install dependencies + command: sudo apt-get update -y && sudo apt-get install cmake clang -y + - checkout + - run: + name: Configure G+Smo on Linux + command: cmake . -DCMAKE_QUIET=ON -DBUILDNAME="Linux-aarch64-clang10-cxx11-Release" -DSITE="$CIRCLE_USERNAME-$CIRCLE_BRANCH [cci]" -DGISMO_INSOURCE_BUILD=ON -DGISMO_BUILD_UNITTESTS=ON -DCMAKE_CXX_STANDARD=11 -DGISMO_WITH_ONURBS=ON + - run: + name: Build and test G+Smo on Linux + command: ctest -S cmake/ctest_script.cmake -D KEEPCONFIG=ON -D CTEST_BUILD_JOBS=$MAKEJOBS + workflows: - version: 2 + version: 2.1 build: jobs: - - aarch64_linux_gcc-latest_cxx11_release -# - x86_64:macos:xcode-10.3:cxx11:release -# - x86_64:macos:xcode-11.7:cxx14:release -# - x86_64:macos:xcode-12.5:cxx17:release -# - x86_64:macos:xcode-13.2:cxx20:release + - macos_x_64_86_xcode-103_cxx11_release + - macos_x86_64_xcode-117_cxx14_release + - macos_x86_64_xcode-125_cxx17_release + - macos_x86_64_xcode-132_cxx20_release + - linux_aarch64_gcc9_cxx11_release + - linux_aarch64_clang10_cxx11_release From 54fac62309a07d84a994aecc73509cc5197d0e79 Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Thu, 13 Jan 2022 16:52:10 +0100 Subject: [PATCH 132/174] Updated OFA --- cmake/OptimizeForArchitecture.cmake | 692 ++++++++++++++++++---------- 1 file changed, 459 insertions(+), 233 deletions(-) diff --git a/cmake/OptimizeForArchitecture.cmake 
b/cmake/OptimizeForArchitecture.cmake index 6554933958..6c19e59be0 100644 --- a/cmake/OptimizeForArchitecture.cmake +++ b/cmake/OptimizeForArchitecture.cmake @@ -163,12 +163,15 @@ macro(OFA_AutodetectX86) # 3C | Haswell # # Latest updates taken from https://en.wikichip.org/wiki/intel/cpuid - if(_cpu_model EQUAL 133) # 85 + + # MIC architecture + if(_cpu_model EQUAL 133) set(TARGET_ARCHITECTURE "knm") # Knights Mill - elseif(_cpu_model EQUAL 87) # 57 + elseif(_cpu_model EQUAL 87) set(TARGET_ARCHITECTURE "knl") # Knights Landing + # Small cores elseif(_cpu_model EQUAL 134) set(TARGET_ARCHITECTURE "tremont") @@ -184,28 +187,29 @@ macro(OFA_AutodetectX86) elseif(_cpu_model EQUAL 28 OR _cpu_model EQUAL 38 OR _cpu_model EQUAL 39 OR _cpu_model EQUAL 53 OR _cpu_model EQUAL 54) set(TARGET_ARCHITECTURE "bonnell") - # elseif(_cpu_model EQUAL X) - # set(TARGET_ARCHITECTURE "sapphirerapids") + # Big cores + elseif(_cpu_model EQUAL 151 OR _cpu_model EQUAL 154) + set(TARGET_ARCHITECTURE "alderlake") + + elseif(_cpu_model EQUAL 143) + set(TARGET_ARCHITECTURE "sapphirerapids") - # elseif(_cpu_model EQUAL X) - # set(TARGET_ARCHITECTURE "alderlake") + elseif(_cpu_model EQUAL 142 OR _cpu_model EQUAL 158 OR _cpu_model EQUAL 165) + set(TARGET_ARCHITECTURE "kabylake") elseif(_cpu_model EQUAL 140) set(TARGET_ARCHITECTURE "tigerlake") + elseif(_cpu_model EQUAL 125 OR _cpu_model EQUAL 126) + set(TARGET_ARCHITECTURE "icelake") + elseif(_cpu_model EQUAL 106 OR _cpu_model EQUAL 108) set(TARGET_ARCHITECTURE "icelake-avx512") - elseif(_cpu_model EQUAL 125 OR _cpu_model EQUAL 126) - set(TARGET_ARCHITECTURE "icelake") - elseif(_cpu_model EQUAL 102) set(TARGET_ARCHITECTURE "cannonlake") - elseif(_cpu_model EQUAL 142 OR _cpu_model EQUAL 158 OR _cpu_model EQUAL 165) # 8E, 9E - set(TARGET_ARCHITECTURE "kabylake") - - elseif(_cpu_model EQUAL 85) # 55 + elseif(_cpu_model EQUAL 85) if(_cpu_stepping LESS 5) set(TARGET_ARCHITECTURE "skylake-avx512") elseif(_cpu_stepping LESS 8) @@ -214,10 +218,10 @@ 
macro(OFA_AutodetectX86) set(TARGET_ARCHITECTURE "cooperlake") endif() - elseif(_cpu_model EQUAL 78 OR _cpu_model EQUAL 94) # 4E, 5E + elseif(_cpu_model EQUAL 78 OR _cpu_model EQUAL 94) set(TARGET_ARCHITECTURE "skylake") - elseif(_cpu_model EQUAL 61 OR _cpu_model EQUAL 71 OR _cpu_model EQUAL 79 OR _cpu_model EQUAL 86) # 3D, 47, 4F, 56 + elseif(_cpu_model EQUAL 61 OR _cpu_model EQUAL 71 OR _cpu_model EQUAL 79 OR _cpu_model EQUAL 86) set(TARGET_ARCHITECTURE "broadwell") elseif(_cpu_model EQUAL 60 OR _cpu_model EQUAL 69 OR _cpu_model EQUAL 70 OR _cpu_model EQUAL 63) @@ -284,13 +288,13 @@ macro(OFA_AutodetectX86) # 25 19h | Zen 3 if(_cpu_family EQUAL 25) # 19h - set(TARGET_ARCHITECTURE "zen3") # planned + set(TARGET_ARCHITECTURE "zen3") elseif(_cpu_family EQUAL 24) # 18h set(TARGET_ARCHITECTURE "zen") elseif(_cpu_family EQUAL 23) # 17h - if(_cpu_model LESS 64) + if(_cpu_model LESS 49) set(TARGET_ARCHITECTURE "zen") else() set(TARGET_ARCHITECTURE "zen2") @@ -456,8 +460,26 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "cortex-m23") elseif(_cpu_part STREQUAL "0xd21") set(TARGET_ARCHITECTURE "cortex-m33") + elseif(_cpu_part STREQUAL "0xd40") + set(TARGET_ARCHITECTURE "neoverse-v1") + elseif(_cpu_part STREQUAL "0xd41") + set(TARGET_ARCHITECTURE "cortex-a78") + elseif(_cpu_part STREQUAL "0xd42") + set(TARGET_ARCHITECTURE "cortex-a78ae") + elseif(_cpu_part STREQUAL "0xd44") + set(TARGET_ARCHITECTURE "cortex-x1") + elseif(_cpu_part STREQUAL "0xd46") + set(TARGET_ARCHITECTURE "cortex-a510") + elseif(_cpu_part STREQUAL "0xd47") + set(TARGET_ARCHITECTURE "cortex-a710") + elseif(_cpu_part STREQUAL "0xd48") + set(TARGET_ARCHITECTURE "cortex-x2") + elseif(_cpu_part STREQUAL "0xd49") + set(TARGET_ARCHITECTURE "neoverse-n2") elseif(_cpu_part STREQUAL "0xd4a") set(TARGET_ARCHITECTURE "neoverse-e1") + elseif(_cpu_part STREQUAL "0xd4b") + set(TARGET_ARCHITECTURE "cortex-a78c") endif() elseif(_cpu_implementer STREQUAL "0x42") # Broadcom @@ -499,6 +521,10 @@ macro(OFA_AutodetectArm) 
set(TARGET_ARCHITECTURE "tsv110") endif() + elseif(_cpu_implementer STREQUAL "0x49") # Infineon + + elseif(_cpu_implementer STREQUAL "0x4d") # Motorola/Freescale + elseif(_cpu_implementer STREQUAL "0x4e") # Nvidia if(_cpu_part STREQUAL "0x000") set(TARGET_ARCHITECTURE "denver") @@ -552,6 +578,13 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "marvell-pj4") endif() + elseif(_cpu_implementer STREQUAL "0x61") # Apple + if(_cpu_part STREQUAL "0x022") + set(TARGET_ARCHITECTURE "icestorm") + elseif(_cpu_part STREQUAL "0x023") + set(TARGET_ARCHITECTURE "firestorm") + endif() + elseif(_cpu_implementer STREQUAL "0x66") # Faraday if(_cpu_part STREQUAL "0x526") set(TARGET_ARCHITECTURE "fa526") @@ -604,6 +637,15 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "ipx1200") endif() + elseif(_cpu_implementer STREQUAL "0x70") # Phytium + if(_cpu_part STREQUAL "0x662") + set(TARGET_ARCHITECTURE "ftc662") + elseif(_cpu_part STREQUAL "0x663") + set(TARGET_ARCHITECTURE "ftc663") + endif() + + elseif(_cpu_implementer STREQUAL "0xc0") # Ampere + # Taken from /Library/Developer/CommandLineTools/SDKs/MacOSX11.1.sdk/System/Library/Frameworks/Kernel.framework/Versions/A/Headers/mach/machine.h elseif(_cpu_implementer STREQUAL "16777228" OR _cpu_implementer STREQUAL "0x100000C") # Apple ARM64 if(_cpu_part STREQUAL "0x1e2d6381" OR _cpu_part STREQUAL "506291073") # Swift (A6) @@ -625,7 +667,7 @@ macro(OFA_AutodetectArm) elseif(_cpu_part STREQUAL "0x1b588bb3" OR _cpu_part STREQUAL "458787763") # Firestorm Icestorm (M1) set(TARGET_ARCHITECTURE "apple-m1") endif() - endif() + endif() endmacro(OFA_AutodetectArm) macro(OFA_AutodetectPpc) @@ -662,6 +704,8 @@ endmacro(OFA_AutodetectHostArchitecture) macro(OFA_HandleX86Options) set(_march_flag_list) set(_available_vector_units_list) + + # Define macros for Intel macro(_nehalem) list(APPEND _march_flag_list "nehalem") list(APPEND _march_flag_list "corei7") @@ -772,6 +816,7 @@ macro(OFA_HandleX86Options) _goldmont_plus() endmacro() + # Intel 
if(TARGET_ARCHITECTURE STREQUAL "core") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3") @@ -845,6 +890,8 @@ macro(OFA_HandleX86Options) list(APPEND _march_flag_list "atom") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") + + # AMD elseif(TARGET_ARCHITECTURE STREQUAL "k8") list(APPEND _march_flag_list "k8") list(APPEND _available_vector_units_list "sse" "sse2") @@ -921,34 +968,46 @@ macro(OFA_HandleX86Options) list(APPEND _march_flag_list "barcelona") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") + + # Others elseif(TARGET_ARCHITECTURE STREQUAL "generic") list(APPEND _march_flag_list "generic") elseif(TARGET_ARCHITECTURE STREQUAL "native") list(APPEND _march_flag_list "native") elseif(TARGET_ARCHITECTURE STREQUAL "none") # add this clause to remove it from the else clause + else() message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". 
Please set TARGET_ARCHITECTURE to a supported value.") endif() if(TARGET_ARCHITECTURE STREQUAL "native") if(MSVC) + # MSVC (on Windows) message(FATAL_ERROR, "MSVC does not support \"native\" flag.") - elseif(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") - # ICC (on Linux) - AddCompilerFlag("-xHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(CMAKE_CXX_COMPILER MATCHES "/(icl\\.exe)$") - # ICC (on Windows) - AddCompilerFlag("/QxHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(CMAKE_CXX_COMPILER MATCHES "/(pgcc|pgc\\+\\+)$") - # PGI (on Linux) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") + if(WIN32) + # Intel (on Windows) + AddCompilerFlag("/QxHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + else() + # Intel (on Linux) + AddCompilerFlag("-xHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC" + OR CMAKE_CXX_COMPILER_ID MATCHES "PGI") + # NVidia HPC / PGI (on Linux/Windows AddCompilerFlag("-tp=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(CMAKE_CXX_COMPILER MATCHES "/(suncc|sunCC)$") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "SunPro") # Sun/Oracle Studio (on Linux/Sun OS) AddCompilerFlag("-native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") + # Cray (on Linux) + message(FATAL_ERROR, "Cray compiler does not support \"native\" flag.") else() + # Others: GNU, Clang and variants AddCompilerFlag("-march=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) endif() + elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") set(_disable_vector_unit_list) set(_enable_vector_unit_list) @@ -1064,29 +1123,57 @@ macro(OFA_HandleX86Options) string(REPLACE "." 
"_" _flag "__${_flag}__") add_definitions("-D${_flag}") endforeach(_flag) - elseif(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") # ICC (on Linux) - set(OFA_map_knl "-xMIC-AVX512") - set(OFA_map_knm "-xMIC-AVX512") - set(OFA_map_sapphirerapids "-xCORE-AVX512") - set(OFA_map_alderlake "-xCORE-AVX512") - set(OFA_map_tigerlake "-xCORE-AVX512") - set(OFA_map_icelake-avx512 "-xCORE-AVX512") - set(OFA_map_icelake "-xCORE-AVX512") - set(OFA_map_cannonlake "-xCORE-AVX512") - set(OFA_map_cooperlake "-xCORE-AVX512") - set(OFA_map_cascadelake "-xCORE-AVX512") - set(OFA_map_skylake-avx512 "-xCORE-AVX512") - set(OFA_map_skylake "-xCORE-AVX2") - set(OFA_map_broadwell "-xCORE-AVX2") - set(OFA_map_haswell "-xCORE-AVX2") - set(OFA_map_ivybridge "-xCORE-AVX-I") - set(OFA_map_sandybridge "-xAVX") - set(OFA_map_westmere "-xSSE4.2") - set(OFA_map_nehalem "-xSSE4.2") - set(OFA_map_penryn "-xSSSE3") - set(OFA_map_merom "-xSSSE3") - set(OFA_map_core2 "-xSSE3") - set(_ok FALSE) + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") # TEST ADVANCED KEYWORDS!!! 
+ if(WIN32) + # Intel (on Windows) + set(OFA_map_knl "-QxMIC-AVX512") + set(OFA_map_knm "-QxMIC-AVX512") + set(OFA_map_sapphirerapids "-QxCORE-AVX512") + set(OFA_map_alderlake "-QxCORE-AVX512") + set(OFA_map_tigerlake "-QxCORE-AVX512") + set(OFA_map_icelake-avx512 "-QxCORE-AVX512") + set(OFA_map_icelake "-QxCORE-AVX512") + set(OFA_map_cannonlake "-QxCORE-AVX512") + set(OFA_map_cooperlake "-QxCORE-AVX512") + set(OFA_map_cascadelake "-QxCORE-AVX512") + set(OFA_map_skylake-avx512 "-QxCORE-AVX512") + set(OFA_map_skylake "-QxCORE-AVX2") + set(OFA_map_broadwell "-QxCORE-AVX2") + set(OFA_map_haswell "-QxCORE-AVX2") + set(OFA_map_ivybridge "-QxCORE-AVX-I") + set(OFA_map_sandybridge "-QxAVX") + set(OFA_map_westmere "-QxSSE4.2") + set(OFA_map_nehalem "-QxSSE4.2") + set(OFA_map_penryn "-QxSSSE3") + set(OFA_map_merom "-QxSSSE3") + set(OFA_map_core2 "-QxSSE3") + set(_ok FALSE) + else() + # Intel (in Linux) + set(OFA_map_knl "-xMIC-AVX512") + set(OFA_map_knm "-xMIC-AVX512") + set(OFA_map_sapphirerapids "-xCORE-AVX512") + set(OFA_map_alderlake "-xCORE-AVX512") + set(OFA_map_tigerlake "-xCORE-AVX512") + set(OFA_map_icelake-avx512 "-xCORE-AVX512") + set(OFA_map_icelake "-xCORE-AVX512") + set(OFA_map_cannonlake "-xCORE-AVX512") + set(OFA_map_cooperlake "-xCORE-AVX512") + set(OFA_map_cascadelake "-xCORE-AVX512") + set(OFA_map_skylake-avx512 "-xCORE-AVX512") + set(OFA_map_skylake "-xCORE-AVX2") + set(OFA_map_broadwell "-xCORE-AVX2") + set(OFA_map_haswell "-xCORE-AVX2") + set(OFA_map_ivybridge "-xCORE-AVX-I") + set(OFA_map_sandybridge "-xAVX") + set(OFA_map_westmere "-xSSE4.2") + set(OFA_map_nehalem "-xSSE4.2") + set(OFA_map_penryn "-xSSSE3") + set(OFA_map_merom "-xSSSE3") + set(OFA_map_core2 "-xSSE3") + set(_ok FALSE) + endif() foreach(arch ${_march_flag_list}) if(DEFINED OFA_map_${arch}) AddCompilerFlag(${OFA_map_${arch}} CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _ok) @@ -1098,9 +1185,17 @@ macro(OFA_HandleX86Options) if(NOT _ok) # This is the Intel compiler, so SSE2 is a very 
reasonable baseline. message(STATUS "Did not recognize the requested architecture flag, falling back to SSE2") - AddCompilerFlag("-xSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + if(WIN32) + AddCompilerFlag("-QxSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + else() + AddCompilerFlag("-xSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() endif() - else() # not MSVC and not ICC => GCC, Clang, Open64 + + # TEST PGI/Cray/SunPro ... + + else() + # Others: GNU, Clang and variants foreach(_flag ${_march_flag_list}) AddCompilerFlag("-march=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) if(_good) @@ -1169,14 +1264,17 @@ macro(OFA_HandleX86Options) # Compile code with profiling instrumentation if(TARGET_PROFILER STREQUAL "gprof") AddCompilerFlag("-pg" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(TARGET_PROFILER STREQUAL "vtune" AND CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") - AddCompilerFlag("-g" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-debug inline-debug-info" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-D TBB_USE_THREADING_TOOLS" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-parallel-source-info=2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-gline-tables-only" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-fdebug-info-for-profiling" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-Xsprofile" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + elseif(TARGET_PROFILER STREQUAL "vtune") + if (CMAKE_CXX_COMPILER_ID MATCHES "Intel") + # Need to check if this also works on Windows + AddCompilerFlag("-g" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-debug inline-debug-info" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-D TBB_USE_THREADING_TOOLS" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-parallel-source-info=2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-gline-tables-only" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-fdebug-info-for-profiling" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + 
AddCompilerFlag("-Xsprofile" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() endif() endmacro(OFA_HandleX86Options) @@ -1185,7 +1283,8 @@ macro(OFA_HandleArmOptions) set(_mtune_flag_list) set(_available_vector_units_list) - if(TARGET_ARCHITECTURE STREQUAL "strongarm") # ARM + # ARM + if(TARGET_ARCHITECTURE STREQUAL "strongarm") list(APPEND _mtune_flag_list "strongarm") list(APPEND _march_flag_list "armv4") elseif(TARGET_ARCHITECTURE STREQUAL "arm8") @@ -1330,6 +1429,14 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "mp" "sec" "vfpv3-d16" "vfpv3" "vfpv3-d16-fp16" "vfpv3-fp16" "vfpv4-d16" "vfpv4" "simd" "neon-fp16" "neon-vfpv4") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78") + list(APPEND _mtune_flag_list "cortex-a78") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a5") list(APPEND _mtune_flag_list "cortex-a5") list(APPEND _march_flag_list "armv7-a") @@ -1469,30 +1576,51 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "fp16" "dotprod") - - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4") - list(APPEND _mtune_flag_list "cortex-r4") - list(APPEND _march_flag_list "armv7-r") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4f") - list(APPEND _mtune_flag_list "cortex-r4f") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r5") - list(APPEND _mtune_flag_list "cortex-r5") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "idiv" "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r7") - list(APPEND _mtune_flag_list "cortex-r7") - list(APPEND _march_flag_list 
"armv7-r") - list(APPEND _available_vector_units_list "idiv" "vfpv3-d16-fp16") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r8") - list(APPEND _mtune_flag_list "cortex-r8") - list(APPEND _march_flag_list "armv7-r") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r52") - list(APPEND _mtune_flag_list "cortex-r52") - list(APPEND _march_flag_list "armv8-r") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "crc" "simd" "idiv" "vfpv3-d16-fp16") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78") + list(APPEND _mtune_flag_list "cortex-a78") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78ae") + list(APPEND _mtune_flag_list "cortex-a78ae") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78c") + list(APPEND _mtune_flag_list "cortex-a78c") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a710") + list(APPEND _mtune_flag_list "cortex-a510") + list(APPEND _march_flag_list "armv9-a") + list(APPEND _march_flag_list "armv8.6-a") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND 
_available_vector_units_list "bf16" "fp16" "i8mm") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a710") + list(APPEND _mtune_flag_list "cortex-a710") + list(APPEND _march_flag_list "armv9-a") + list(APPEND _march_flag_list "armv8.6-a") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m0") list(APPEND _mtune_flag_list "cortex-m0") @@ -1535,22 +1663,85 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-m") list(APPEND _available_vector_units_list "mve.fp" "fp.dp") - elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n1") - list(APPEND _mtune_flag_list "neoverse-n1") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4") + list(APPEND _mtune_flag_list "cortex-r4") + list(APPEND _march_flag_list "armv7-r") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4f") + list(APPEND _mtune_flag_list "cortex-r4f") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_vector_units_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r5") + list(APPEND _mtune_flag_list "cortex-r5") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_vector_units_list "idiv" "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r7") + list(APPEND _mtune_flag_list "cortex-r7") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_vector_units_list "idiv" "vfpv3-d16-fp16") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r8") + list(APPEND _mtune_flag_list "cortex-r8") + list(APPEND _march_flag_list "armv7-r") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r52") + list(APPEND _mtune_flag_list "cortex-r52") + list(APPEND _march_flag_list "armv8-r") + list(APPEND _march_flag_list "armv7-r") + 
list(APPEND _available_vector_units_list "crc" "simd" "idiv" "vfpv3-d16-fp16") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-x1") + list(APPEND _mtune_flag_list "cortex-x1") list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-x2") + list(APPEND _march_flag_list "armv9-a") + list(APPEND _march_flag_list "armv8.6-a") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-e1") list(APPEND _mtune_flag_list "neoverse-e1") list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dorprod") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n1") + list(APPEND _mtune_flag_list "neoverse-n1") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n2") + list(APPEND _mtune_flag_list "neoverse-n2") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND 
_march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-v1") + list(APPEND _mtune_flag_list "neoverse-v1") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") - elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b15") # Broadcom + # Broadcom + elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b15") list(APPEND _mtune_flag_list "brahma-b15") elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b53") list(APPEND _mtune_flag_list "brahma-b53") @@ -1561,7 +1752,8 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderx") # Cavium + # Cavium + elseif(TARGET_ARCHITECTURE STREQUAL "thunderx") list(APPEND _mtune_flag_list "thunderx") list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") @@ -1588,14 +1780,16 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "crc" "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "strongarm110") # DEC + # DEC + elseif(TARGET_ARCHITECTURE STREQUAL "strongarm110") list(APPEND _mtune_flag_list "strongarm110") list(APPEND _march_flag_list "armv4") elseif(TARGET_ARCHITECTURE STREQUAL "strongarm1100") list(APPEND _mtune_flag_list "strongarm1100") list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "a64fx") # FUJITSU + # FUJITSU + elseif(TARGET_ARCHITECTURE STREQUAL "a64fx") list(APPEND _mtune_flag_list "a64fx") list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") @@ -1603,7 +1797,8 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND 
_available_vector_units_list "fp16" "sve") - elseif(TARGET_ARCHITECTURE STREQUAL "tsv110") # HiSilicon + # HiSilicon + elseif(TARGET_ARCHITECTURE STREQUAL "tsv110") list(APPEND _mtune_flag_list "tsv110") list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") @@ -1611,7 +1806,8 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "aes" "crypto" "fp16" "sha2") - elseif(TARGET_ARCHITECTURE STREQUAL "denver") # Nvidia + # Nvidia + elseif(TARGET_ARCHITECTURE STREQUAL "denver") list(APPEND _mtune_flag_list "denver") list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") @@ -1629,12 +1825,14 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - elseif(TARGET_ARCHITECTURE STREQUAL "xgene1") # APM + # APM + elseif(TARGET_ARCHITECTURE STREQUAL "xgene1") list(APPEND _mtune_flag_list "xgene1") list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "scorpion") # Qualcomm + # Qualcomm + elseif(TARGET_ARCHITECTURE STREQUAL "scorpion") list(APPEND _mtune_flag_list "scorpion") list(APPEND _march_flag_list "armv7-a") elseif(TARGET_ARCHITECTURE STREQUAL "krait") @@ -1665,13 +1863,15 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - elseif(TARGET_ARCHITECTURE STREQUAL "exynos-m1") # Samsung + # Samsung + elseif(TARGET_ARCHITECTURE STREQUAL "exynos-m1") list(APPEND _mtune_flag_list "exynos-m1") list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "marvell-f") # Marvell + # Marvell + elseif(TARGET_ARCHITECTURE STREQUAL "marvell-f") list(APPEND _mtune_flag_list "marvell-f") list(APPEND 
_march_flag_list "armv5te") elseif(TARGET_ARCHITECTURE STREQUAL "marvell-pj4") @@ -1679,7 +1879,8 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "mp" "sec" "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "i80200") # Intel + # Intel + elseif(TARGET_ARCHITECTURE STREQUAL "i80200") list(APPEND _mtune_flag_list "i80200") elseif(TARGET_ARCHITECTURE STREQUAL "pxa250a") list(APPEND _mtune_flag_list "pxa250a") @@ -1722,13 +1923,14 @@ macro(OFA_HandleArmOptions) elseif(TARGET_ARCHITECTURE STREQUAL "ipx1200") list(APPEND _mtune_flag_list "ipx1200") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a6") # Apple + # Apple + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a6") list(APPEND _mtune_flag_list "apple-a6") list(APPEND _march_flag_list "armv7-a") elseif(TARGET_ARCHITECTURE STREQUAL "apple-a7") list(APPEND _mtune_flag_list "apple-a7") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") + list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") elseif(TARGET_ARCHITECTURE STREQUAL "apple-a8") list(APPEND _mtune_flag_list "apple-a8") list(APPEND _march_flag_list "armv8-a") @@ -1747,14 +1949,14 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "lse" "neon" "ras" "rdm" "sha2" "zcm" "zcz") + list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "lse" "neon" "ras" "rdm" "sha2" "zcm" "zcz") elseif(TARGET_ARCHITECTURE STREQUAL "apple-a12") list(APPEND _mtune_flag_list "apple-a12") list(APPEND _march_flag_list "armv8.3-a") list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" 
"rcpc" "rdm" "sha2" "zcm" "zcz") + list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") elseif(TARGET_ARCHITECTURE STREQUAL "apple-a13") list(APPEND _mtune_flag_list "apple-a13") list(APPEND _march_flag_list "armv8.4-a") @@ -1762,7 +1964,7 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "dotprod" "fp" "fp16" "fp16fml" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "sha3" "sm4" "zcm" "zcz") + list(APPEND _available_vector_units_list "aes" "crc" "crypto" "dotprod" "fp" "fp16" "fp16fml" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "sha3" "sm4" "zcm" "zcz") elseif(TARGET_ARCHITECTURE STREQUAL "apple-m1") list(APPEND _mtune_flag_list "vortex") list(APPEND _march_flag_list "armv8.3-a") @@ -1770,13 +1972,15 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") - + + # Others + elseif(TARGET_ARCHITECTURE STREQUAL "generic") + list(APPEND _march_flag_list "generic") elseif(TARGET_ARCHITECTURE STREQUAL "native") list(APPEND _march_flag_list "native") - elseif(TARGET_ARCHITECTURE STREQUAL "none") # add this clause to remove it from the else clause - + else() message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.") endif() @@ -1788,133 +1992,153 @@ macro(OFA_HandleArmOptions) # set the -march and -mtune flags as fallback option. 
if(TARGET_ARCHITECTURE STREQUAL "native") - AddCompilerFlag("-mcpu=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + if(MSVC) + # MSVC (on Windows) + message(FATAL_ERROR, "MSVC does not support \"native\" flag.") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC" + OR CMAKE_CXX_COMPILER_ID MATCHES "PGI") + # NVidia HPC / PGI (on Linux/Windows + AddCompilerFlag("-tp=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") + # Cray (on Linux) + message(FATAL_ERROR, "Cray compiler does not support \"native\" flag.") + else() + # Others: GNU, Clang and variants + AddCompilerFlag("-mcpu=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") - foreach(_flag ${_mtune_flag_list}) - - AddCompilerFlag("-mcpu=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - if(_good) - break() - endif(_good) - endforeach(_flag) - - if(NOT _good) - set(_disable_vector_unit_list) - set(_enable_vector_unit_list) - - set(_aes_broken false) - set(_bf16_broken false) - set(_crc_broken false) - set(_crypto_broken false) - set(_dotprod_broken false) - set(_dsp_broken false) - set(_fp16_broken false) - set(_fp16fml_broken false) - set(_fp_broken false) - set(_fp_dp_broken false) - set(_fp_sp_broken false) - set(_i8mm_broken false) - set(_idiv_broken false) - set(_lse_broken false) - set(_mve_broken false) - set(_mve_fp_broken false) - set(_neon_broken false) - set(_neon_fp16_broken false) - set(_neon_vfpv4_broken false) - set(_ras_broken false) - set(_rcpc_broken false) - set(_rdm_broken false) - set(_rdma_broken false) - set(_sec_broken false) - set(_sha2_broken false) - set(_sha3_broken false) - set(_simd_broken false) - set(_sm4_broken false) - set(_sve_broken false) - set(_vfpv3_broken false) - set(_vfpv3_d16_broken false) - set(_vfpv3_d16_fp16_broken false) - set(_vfpv3_fp16_broken false) - set(_vfpv4_broken false) - set(_vfpv4_d16_broken false) - set(_zcm_broken false) - set(_zcz_broken false) - - 
macro(_enable_or_disable _name _flag _documentation _broken) - if(_broken) - set(_found false) - else() - _my_find(_available_vector_units_list "${_flag}" _found) - endif() - set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) - mark_as_advanced(USE_${_name}) - if(USE_${_name}) - list(APPEND _enable_vector_unit_list "${_flag}") - else() - list(APPEND _disable_vector_unit_list "${_flag}") - endif() - endmacro() - - _enable_or_disable(AES "aes" "Use AES. This will enable the aes and pmull crypto extension." _aes_broken) - _enable_or_disable(BF16 "bf16" "Use BF16. This will enable the brain half-precision floating-point instructions." _bf16_broken) - _enable_or_disable(CRC "crc" "Use CRC. This will enable the Cyclic Redundancy Check (CRC) instructions." _crc_broken) - _enable_or_disable(CRYPTO "crypto" "Use CRYPTO. This will enable the cryptographic instructions." _crypto_broken) - _enable_or_disable(DOTPROD "dotprod" "Use DOTPROD. This will enable the Dot Product extension. This also enables Advanced SIMD instructions." _dotprod_broken) - _enable_or_disable(DSP "dsp" "Use DSP. This will enable the DSP instruction." _dsp_broken) - _enable_or_disable(FP "fp" "Use FP. This will enable the floating-point data processing instructions." _fp_broken) - _enable_or_disable(FP16 "fp16" "Use FP16. This will enable the half-precision floating-point data processing instructions." _fp16_broken) - _enable_or_disable(FP16FML "fp16fml" "Use FP16FML. This will enable the FP16 fmla extension." _fp16fml_broken) - _enable_or_disable(FP_DP "fp.dp" "Use FP.DP. This will enable the single- and double-precision floating-point instructions." _fp_dp_broken) - _enable_or_disable(FP_SP "fp.sp" "Use FP.SP. This will enable the single-precision floating-point instructions." _fp_sp_broken) - _enable_or_disable(I8MM "i8mm" "Use I8MM. This will enable the 8-bit Integer Matrix Multiply instructions." _i8mm_broken) - _enable_or_disable(IDIV "idiv" "Use IDIV. 
This will enable the ARM-state integer division instructions." _idiv_broken) - _enable_or_disable(LSE "lse" "Use LSE. This will enable the Large System Extension instructions." _lse_broken) - _enable_or_disable(MVE "mve" "Use MVE. This will enable the M-Profile Vector Extension (MVE) integer instructions." _mve_broken) - _enable_or_disable(MVE_FP "mve.fp" "Use MVE.FP. This will enable the M-Profile Vector Extension (MVE) integer and single precision floating-point instructions." _mve_fp_broken) - _enable_or_disable(NEON "neon" "Use NEON. This will enable the Advanced SIMD (Neon) v1." _neon_broken) - _enable_or_disable(NEON_FP16 "neon-fp16" "Use NEON-FP16. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions, with the half-precision floating-point conversion operations." _neon_fp16_broken) - _enable_or_disable(NEON_VFPV4 "neon-vfpv4" "Use NEON-VFPV4. This will enable the Advanced SIMD (Neon) v2 and the VFPv4 floating-point instructions." _neon_vfpv4_broken) - _enable_or_disable(RAS "ras" "Use RAS. This will enable the Reliability, Availability, and Serviceability extension." _ras_broken) - _enable_or_disable(RCPC "rcpc" "Use RCPC. This will enable the RcPc extension." _rcpc_broken) - _enable_or_disable(RDM "rdm" "Use RDM. This will enable the RDM extension." _rdm_broken) - _enable_or_disable(RDMA "rdma" "Use RDMA. This will enable the Round Double Multiply Accumulate instructions." _rdma_broken) - _enable_or_disable(SEC "sec" "Use SEC. This will enable the security extension." _sec_broken) - _enable_or_disable(SHA2 "sha2" "Use SHA2. This will enable the sha2 crypto extension." _sha2_broken) - _enable_or_disable(SHA3 "sha3" "Use SHA3. This will enable the sha512 and sha3 crypto extension." _sha3_broken) - _enable_or_disable(SIMD "simd" "Use SIMD. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions." _simd_broken) - _enable_or_disable(SM4 "sm4" "Use SM4. 
This will enable the the sm3 and sm4 crypto extension." _sm4_broken) - _enable_or_disable(SVE "sve" "Use SVE. This will enable the Scalable Vector Extension instructions." _sve_broken) - _enable_or_disable(VFPV3 "vfpv3" "Use VPFV3. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers." _vfpv3_broken) - _enable_or_disable(VFPV3_D16 "vfpv3-d16" "Use VPFV3-16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_broken) - _enable_or_disable(VFPV3_D16_FP16 "vfpv3-d16-fp16" "Use VPFV3-D16-FP16. This will enable VFPv3 floating-point instructions, with 16 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_fp16_broken) - _enable_or_disable(VFPV3_FP16 "vfpv3-fp16" "Use VPFV3-FP16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_fp16_broken) - _enable_or_disable(VFPV4 "vfpv4" "Use VPFV4. This will enable the VFPv4 floating-point instructions, with 32 double-precision registers." _vfpv4_broken) - _enable_or_disable(VFPV4_D16 "vfpv4-d16" "Use VPFV4-D16. This will enable the VFPv4 floating-point instructions, with 16 double-precision registers." _vfpv4_dp16_broken) - _enable_or_disable(ZCM "zcm" "Use ZCM. This will enable the ZCM extension." _zcm_broken) - _enable_or_disable(ZCZ "zcz" "Use ZCZ. This will enable the ZCZ extension." 
_zcz_broken) - foreach(_march ${_march_flag_list}) - - AddCompilerFlag("-march=${_march}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) - if(_good) - set(_march_plus_extensions "${_march}") - foreach(_flag ${_enable_vector_unit_list}) - AddCompilerFlag("-march=${_march_plus_extensions}+${_flag}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) - if(_good) - set(_march_plus_extensions "${_march_plus_extensions}+${_flag}") - endif(_good) - endforeach(_flag) - AddCompilerFlag("-march=${_march_plus_extensions}" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - break() - endif(_good) - endforeach(_march) - foreach(_mtune ${_mtune_flag_list}) - AddCompilerFlag("-mtune=${_mtune}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - if(_good) + if(MSVC) + # MSVC on ARM64 needs to be done + + else() + # Others: GNU, Clang and variants + foreach(_flag ${_mtune_flag_list}) + AddCompilerFlag("-mcpu=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + if(_good) break() - endif(_good) - endforeach(_mtune) - endif(NOT _good) + endif(_good) + endforeach(_flag) + + if(NOT _good) + set(_disable_vector_unit_list) + set(_enable_vector_unit_list) + + set(_aes_broken false) + set(_bf16_broken false) + set(_crc_broken false) + set(_crypto_broken false) + set(_dotprod_broken false) + set(_dsp_broken false) + set(_fp16_broken false) + set(_fp16fml_broken false) + set(_fp_broken false) + set(_fp_dp_broken false) + set(_fp_sp_broken false) + set(_i8mm_broken false) + set(_idiv_broken false) + set(_lse_broken false) + set(_mve_broken false) + set(_mve_fp_broken false) + set(_neon_broken false) + set(_neon_fp16_broken false) + set(_neon_vfpv4_broken false) + set(_ras_broken false) + set(_rcpc_broken false) + set(_rdm_broken false) + set(_rdma_broken false) + set(_sec_broken false) + set(_sha2_broken false) + set(_sha3_broken false) + set(_simd_broken false) + set(_sm4_broken false) + set(_sve_broken false) + set(_vfpv3_broken false) + set(_vfpv3_d16_broken false) + set(_vfpv3_d16_fp16_broken false) + 
set(_vfpv3_fp16_broken false) + set(_vfpv4_broken false) + set(_vfpv4_d16_broken false) + set(_zcm_broken false) + set(_zcz_broken false) + + macro(_enable_or_disable _name _flag _documentation _broken) + if(_broken) + set(_found false) + else() + _my_find(_available_vector_units_list "${_flag}" _found) + endif() + set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) + mark_as_advanced(USE_${_name}) + if(USE_${_name}) + list(APPEND _enable_vector_unit_list "${_flag}") + else() + list(APPEND _disable_vector_unit_list "${_flag}") + endif() + endmacro() + + _enable_or_disable(AES "aes" "Use AES. This will enable the aes and pmull crypto extension." _aes_broken) + _enable_or_disable(BF16 "bf16" "Use BF16. This will enable the brain half-precision floating-point instructions." _bf16_broken) + _enable_or_disable(CRC "crc" "Use CRC. This will enable the Cyclic Redundancy Check (CRC) instructions." _crc_broken) + _enable_or_disable(CRYPTO "crypto" "Use CRYPTO. This will enable the cryptographic instructions." _crypto_broken) + _enable_or_disable(DOTPROD "dotprod" "Use DOTPROD. This will enable the Dot Product extension. This also enables Advanced SIMD instructions." _dotprod_broken) + _enable_or_disable(DSP "dsp" "Use DSP. This will enable the DSP instruction." _dsp_broken) + _enable_or_disable(FP "fp" "Use FP. This will enable the floating-point data processing instructions." _fp_broken) + _enable_or_disable(FP16 "fp16" "Use FP16. This will enable the half-precision floating-point data processing instructions." _fp16_broken) + _enable_or_disable(FP16FML "fp16fml" "Use FP16FML. This will enable the FP16 fmla extension." _fp16fml_broken) + _enable_or_disable(FP_DP "fp.dp" "Use FP.DP. This will enable the single- and double-precision floating-point instructions." _fp_dp_broken) + _enable_or_disable(FP_SP "fp.sp" "Use FP.SP. This will enable the single-precision floating-point instructions." _fp_sp_broken) + _enable_or_disable(I8MM "i8mm" "Use I8MM. 
This will enable the 8-bit Integer Matrix Multiply instructions." _i8mm_broken) + _enable_or_disable(IDIV "idiv" "Use IDIV. This will enable the ARM-state integer division instructions." _idiv_broken) + _enable_or_disable(LSE "lse" "Use LSE. This will enable the Large System Extension instructions." _lse_broken) + _enable_or_disable(MVE "mve" "Use MVE. This will enable the M-Profile Vector Extension (MVE) integer instructions." _mve_broken) + _enable_or_disable(MVE_FP "mve.fp" "Use MVE.FP. This will enable the M-Profile Vector Extension (MVE) integer and single precision floating-point instructions." _mve_fp_broken) + _enable_or_disable(NEON "neon" "Use NEON. This will enable the Advanced SIMD (Neon) v1." _neon_broken) + _enable_or_disable(NEON_FP16 "neon-fp16" "Use NEON-FP16. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions, with the half-precision floating-point conversion operations." _neon_fp16_broken) + _enable_or_disable(NEON_VFPV4 "neon-vfpv4" "Use NEON-VFPV4. This will enable the Advanced SIMD (Neon) v2 and the VFPv4 floating-point instructions." _neon_vfpv4_broken) + _enable_or_disable(RAS "ras" "Use RAS. This will enable the Reliability, Availability, and Serviceability extension." _ras_broken) + _enable_or_disable(RCPC "rcpc" "Use RCPC. This will enable the RcPc extension." _rcpc_broken) + _enable_or_disable(RDM "rdm" "Use RDM. This will enable the RDM extension." _rdm_broken) + _enable_or_disable(RDMA "rdma" "Use RDMA. This will enable the Round Double Multiply Accumulate instructions." _rdma_broken) + _enable_or_disable(SEC "sec" "Use SEC. This will enable the security extension." _sec_broken) + _enable_or_disable(SHA2 "sha2" "Use SHA2. This will enable the sha2 crypto extension." _sha2_broken) + _enable_or_disable(SHA3 "sha3" "Use SHA3. This will enable the sha512 and sha3 crypto extension." _sha3_broken) + _enable_or_disable(SIMD "simd" "Use SIMD. 
This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions." _simd_broken) + _enable_or_disable(SM4 "sm4" "Use SM4. This will enable the the sm3 and sm4 crypto extension." _sm4_broken) + _enable_or_disable(SVE "sve" "Use SVE. This will enable the Scalable Vector Extension instructions." _sve_broken) + _enable_or_disable(VFPV3 "vfpv3" "Use VPFV3. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers." _vfpv3_broken) + _enable_or_disable(VFPV3_D16 "vfpv3-d16" "Use VPFV3-16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_broken) + _enable_or_disable(VFPV3_D16_FP16 "vfpv3-d16-fp16" "Use VPFV3-D16-FP16. This will enable VFPv3 floating-point instructions, with 16 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_fp16_broken) + _enable_or_disable(VFPV3_FP16 "vfpv3-fp16" "Use VPFV3-FP16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_fp16_broken) + _enable_or_disable(VFPV4 "vfpv4" "Use VPFV4. This will enable the VFPv4 floating-point instructions, with 32 double-precision registers." _vfpv4_broken) + _enable_or_disable(VFPV4_D16 "vfpv4-d16" "Use VPFV4-D16. This will enable the VFPv4 floating-point instructions, with 16 double-precision registers." _vfpv4_dp16_broken) + _enable_or_disable(ZCM "zcm" "Use ZCM. This will enable the ZCM extension." _zcm_broken) + _enable_or_disable(ZCZ "zcz" "Use ZCZ. This will enable the ZCZ extension." 
_zcz_broken) + foreach(_march ${_march_flag_list}) + + AddCompilerFlag("-march=${_march}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) + if(_good) + set(_march_plus_extensions "${_march}") + foreach(_flag ${_enable_vector_unit_list}) + AddCompilerFlag("-march=${_march_plus_extensions}+${_flag}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) + if(_good) + set(_march_plus_extensions "${_march_plus_extensions}+${_flag}") + endif(_good) + endforeach(_flag) + AddCompilerFlag("-march=${_march_plus_extensions}" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + break() + endif(_good) + endforeach(_march) + + foreach(_mtune ${_mtune_flag_list}) + AddCompilerFlag("-mtune=${_mtune}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + if(_good) + break() + endif(_good) + endforeach(_mtune) + endif(NOT _good) + endif() endif() # Compile code with profiling instrumentation @@ -1960,10 +2184,11 @@ macro(OFA_HandlePpcOptions) endmacro(OFA_HandlePpcOptions) macro(OptimizeForArchitecture) + message(STATUS "Detecting target architecture optimization") if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "i686.*|i386.*|x86.*|amd64.*|x86_64.*|AMD64.*") set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. Setting the value to \"native\" bypasses all checks and uses \"-march=native\" or the compiler equivalent flag. 
Other supported values are: \"none\", \"generic\", \"core\", \"merom\" (65nm Core2), \"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandybridge\", \"ivybridge\", \"haswell\", \"broadwell\", \"skylake\", \"skylake-xeon\", \"kabylake\", \"cannonlake\", \"cascadelake\", \"cooperlake\", \"icelake\", \"icelake-xeon\", \"tigerlake\", \"alderlake\", \"sapphirerapids\", \"bonnell\", \"silvermont\", \"goldmont\", \"goldmont-plus\", \"tremont\", \"knl\" (Knights Landing), \"knm\" (Knights Mill), \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\", \"piledriver\", \"steamroller\", \"excavator\", \"amd14h\", \"amd16h\", \"zen\", \"zen2\", \"zen3\"." ) elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(arm.*|ARM.*|aarch64.*|AARCH64.*)") - set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. Setting the value to \"native\" bypasses all checks and uses \"-march=native\" or the compiler equivalent flag. 
Other supported values are: \"none\", \"generic\", \"a64fx\", \"apple-a6\", \"apple-a7\", \"apple-a8\", \"apple-a9\", \"apple-a10\", \"apple-a11\", \"apple-a12\", \"apple-a13\", \"apple-m1\", \"arm1020e\", \"arm1020t\", \"arm1022e\", \"arm1026ej-s\", \"arm10e\", \"arm10tdmi\", \"arm1136j-s\", \"arm1136jf-s\", \"arm1156t2-s\", \"arm1156t2f-s\", \"arm1176jz-s\", \"arm1176jzf-s\", \"arm710t\", \"arm720t\", \"arm740t\", \"arm7tdmi-s\", \"arm7tdmi\", \"arm810\", \"arm8\", \"arm920\", \"arm920t\", \"arm922t\", \"arm926ej-s\", \"arm940t\", \"arm946e-s\", \"arm966e-s\", \"arm968e-s\", \"arm9\", \"arm9e\", \"arm9tdmi\", \"brahma-b15\", \"brahma-b53\", \"carmel\", \"cortex-a12\", \"cortex-a15.cortex-a7\", \"cortex-a15\", \"cortex-a17.cortex-a7\", \"cortex-a17\", \"cortex-a32\", \"cortex-a34\", \"cortex-a35\", \"cortex-a53\", \"cortex-a55\", \"cortex-a57.cortext-a53\", \"cortex-a57\", \"cortex-a5\", \"cortex-a72.cortext-a53\", \"cortex-a72\", \"cortex-a73.cortext-a35\", \"cortex-a73.cortext-a53\", \"cortex-a73\", \"cortex-a75.cortext-a55\", \"cortex-a75\", \"cortex-a76.cortext-a55\", \"cortex-a76\", \"cortex-a76ae\", \"cortex-a77\", \"cortex-a7\", \"cortex-a8\", \"cortex-a9\", \"cortex-m0\", \"cortex-m0plus\", \"cortex-m1\", \"cortex-m23\", \"cortex-m33\", \"cortex-m35p\", \"cortex-m3\", \"cortex-m4\", \"cortex-m55\", \"cortex-m7\", \"cortex-r4\", \"cortex-r4f\", \"cortex-r52\", \"cortex-r5\", \"cortex-r7\", \"cortex-r8\", \"denver2\", \"denver\", \"exynos-m1\", \"fa526\", \"fa606te\", \"fa626\", \"fa626te\", \"fa726te\", \"falkor\", \"fmp626\", \"generic-armv7-a\", \"i80200\", \"i80321-400-b0\", \"i80321-400\", \"i80321-600-b0\", \"i80321-600\", \"ipx1200\", \"ipx425-266\", \"ipx425-400\", \"ipx425-533\", \"iwmmxt2\", \"iwmmxt\", \"krait\", \"kryo2\", \"kryo\", \"marvell-f\", \"marvell-pj4\", \"mpcore\", \"neoverse-e1\", \"neoverse-n1\", \"pxa210a\", \"pxa210b\", \"pxa210c\", \"pxa250a\", \"pxa250b\", \"pxa250c\", \"pxa27x\", \"pxa30x\", \"pxa31x\", \"pxa32x\", \"pxa930\", 
\"sa1110\", \"saphira\", \"scorpion\", \"strongarm1100\", \"strongarm110\", \"strongarm\", \"thunderx2\", \"thunderx2t99\", \"thunderx\", \"thunderxt81\", \"thunderxt83\", \"thunderxt88\", \"tsv110\", \"xgene1\", \"xscale\".") + set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. Setting the value to \"native\" bypasses all checks and uses \"-march=native\" or the compiler equivalent flag. Other supported values are: \"none\", \"generic\", \"a64fx\", \"apple-a6\", \"apple-a7\", \"apple-a8\", \"apple-a9\", \"apple-a10\", \"apple-a11\", \"apple-a12\", \"apple-a13\", \"apple-m1\", \"arm1020e\", \"arm1020t\", \"arm1022e\", \"arm1026ej-s\", \"arm10e\", \"arm10tdmi\", \"arm1136j-s\", \"arm1136jf-s\", \"arm1156t2-s\", \"arm1156t2f-s\", \"arm1176jz-s\", \"arm1176jzf-s\", \"arm710t\", \"arm720t\", \"arm740t\", \"arm7tdmi-s\", \"arm7tdmi\", \"arm810\", \"arm8\", \"arm920\", \"arm920t\", \"arm922t\", \"arm926ej-s\", \"arm940t\", \"arm946e-s\", \"arm966e-s\", \"arm968e-s\", \"arm9\", \"arm9e\", \"arm9tdmi\", \"brahma-b15\", \"brahma-b53\", \"carmel\", \"cortex-a7\", \"cortex-a8\", \"cortex-a9\", \"cortex-a12\", \"cortex-a15.cortex-a7\", \"cortex-a15\", \"cortex-a17.cortex-a7\", \"cortex-a17\", \"cortex-a32\", \"cortex-a34\", \"cortex-a35\", \"cortex-a53\", \"cortex-a55\", \"cortex-a57.cortext-a53\", \"cortex-a57\", \"cortex-a5\", \"cortex-a72.cortext-a53\", \"cortex-a72\", \"cortex-a73.cortext-a35\", \"cortex-a73.cortext-a53\", \"cortex-a73\", \"cortex-a75.cortext-a55\", \"cortex-a75\", \"cortex-a76.cortext-a55\", \"cortex-a76\", \"cortex-a76ae\", \"cortex-a77\", \"cortex-a78\", \"cortex-a78ae\", \"cortex-a76c\", \"cortex-a510\", \"cortex-a710\", \"cortex-m0\", \"cortex-m0plus\", \"cortex-m1\", \"cortex-m23\", \"cortex-m33\", \"cortex-m35p\", 
\"cortex-m3\", \"cortex-m4\", \"cortex-m55\", \"cortex-m7\", \"cortex-r4\", \"cortex-r4f\", \"cortex-r52\", \"cortex-r5\", \"cortex-r7\", \"cortex-r8\", \"cortex-x1\", \"cortex-x2\", \"denver2\", \"denver\", \"exynos-m1\", \"fa526\", \"fa606te\", \"fa626\", \"fa626te\", \"fa726te\", \"falkor\", \"fmp626\", \"generic-armv7-a\", \"i80200\", \"i80321-400-b0\", \"i80321-400\", \"i80321-600-b0\", \"i80321-600\", \"ipx1200\", \"ipx425-266\", \"ipx425-400\", \"ipx425-533\", \"iwmmxt2\", \"iwmmxt\", \"krait\", \"kryo2\", \"kryo\", \"marvell-f\", \"marvell-pj4\", \"mpcore\", \"neoverse-e1\", \"neoverse-n1\", \"neoverse-n2\", \"neoverse-v1\", \"pxa210a\", \"pxa210b\", \"pxa210c\", \"pxa250a\", \"pxa250b\", \"pxa250c\", \"pxa27x\", \"pxa30x\", \"pxa31x\", \"pxa32x\", \"pxa930\", \"sa1110\", \"saphira\", \"scorpion\", \"strongarm1100\", \"strongarm110\", \"strongarm\", \"thunderx2\", \"thunderx2t99\", \"thunderx\", \"thunderxt81\", \"thunderxt83\", \"thunderxt88\", \"tsv110\", \"xgene1\", \"xscale\".") elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(powerpc|ppc)64.*") set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. 
Other supported values are: \"none\", \"generic\", \"power8\", \"power9\", \"power10\".") else() @@ -1971,7 +2196,7 @@ macro(OptimizeForArchitecture) endif() set(_force) if(NOT _last_target_arch STREQUAL "${TARGET_ARCHITECTURE}") - message(STATUS "target changed from \"${_last_target_arch}\" to \"${TARGET_ARCHITECTURE}\"") + message(STATUS "Target architecture changed from \"${_last_target_arch}\" to \"${TARGET_ARCHITECTURE}\"") set(_force FORCE) endif() set(_last_target_arch "${TARGET_ARCHITECTURE}" CACHE STRING "" FORCE) @@ -1990,4 +2215,5 @@ macro(OptimizeForArchitecture) elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(powerpc|ppc)64.*") OFA_HandlePpcOptions() endif() + message(STATUS "Detecting target architecture optimization - done") endmacro(OptimizeForArchitecture) From 442a7eb5c6174299d39791de33cbbde923acbcd3 Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Thu, 13 Jan 2022 16:52:10 +0100 Subject: [PATCH 133/174] Updated OFA --- cmake/OptimizeForArchitecture.cmake | 738 ++++++++++++++++++---------- 1 file changed, 487 insertions(+), 251 deletions(-) diff --git a/cmake/OptimizeForArchitecture.cmake b/cmake/OptimizeForArchitecture.cmake index 6554933958..36dc0c5957 100644 --- a/cmake/OptimizeForArchitecture.cmake +++ b/cmake/OptimizeForArchitecture.cmake @@ -100,15 +100,21 @@ macro(OFA_AutodetectX86) string(REGEX REPLACE ".*stepping[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_stepping "${_cpuinfo}") string(REGEX REPLACE ".*flags[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - exec_program("/usr/sbin/sysctl -n machdep.cpu.vendor machdep.cpu.family machdep.cpu.model machdep.cpu.stepping machdep.cpu.features" OUTPUT_VARIABLE _sysctl_output_string) - string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) - list(GET _sysctl_output 0 _vendor_id) - list(GET _sysctl_output 1 _cpu_family) - list(GET _sysctl_output 2 _cpu_model) - list(GET _sysctl_output 3 _cpu_stepping) - list(GET _sysctl_output 4 
_cpu_flags) - string(TOLOWER "${_cpu_flags}" _cpu_flags) - string(REPLACE "." "_" _cpu_flags "${_cpu_flags}") + exec_program("/usr/sbin/sysctl -n machdep.cpu.vendor machdep.cpu.family machdep.cpu.model machdep.cpu.stepping machdep.cpu.features" + OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) + if(NOT _error) + string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) + list(GET _sysctl_output 0 _vendor_id) + list(GET _sysctl_output 1 _cpu_family) + list(GET _sysctl_output 2 _cpu_model) + list(GET _sysctl_output 3 _cpu_stepping) + list(GET _sysctl_output 4 _cpu_flags) + string(TOLOWER "${_cpu_flags}" _cpu_flags) + string(REPLACE "." "_" _cpu_flags "${_cpu_flags}") + endif() + if(_error) + message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") + endif() elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE) get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE) @@ -163,12 +169,15 @@ macro(OFA_AutodetectX86) # 3C | Haswell # # Latest updates taken from https://en.wikichip.org/wiki/intel/cpuid - if(_cpu_model EQUAL 133) # 85 + + # MIC architecture + if(_cpu_model EQUAL 133) set(TARGET_ARCHITECTURE "knm") # Knights Mill - elseif(_cpu_model EQUAL 87) # 57 + elseif(_cpu_model EQUAL 87) set(TARGET_ARCHITECTURE "knl") # Knights Landing + # Small cores elseif(_cpu_model EQUAL 134) set(TARGET_ARCHITECTURE "tremont") @@ -184,28 +193,29 @@ macro(OFA_AutodetectX86) elseif(_cpu_model EQUAL 28 OR _cpu_model EQUAL 38 OR _cpu_model EQUAL 39 OR _cpu_model EQUAL 53 OR _cpu_model EQUAL 54) set(TARGET_ARCHITECTURE "bonnell") - # elseif(_cpu_model EQUAL X) - # set(TARGET_ARCHITECTURE "sapphirerapids") + # Big cores + elseif(_cpu_model EQUAL 151 OR _cpu_model EQUAL 154) + 
set(TARGET_ARCHITECTURE "alderlake") - # elseif(_cpu_model EQUAL X) - # set(TARGET_ARCHITECTURE "alderlake") + elseif(_cpu_model EQUAL 143) + set(TARGET_ARCHITECTURE "sapphirerapids") + + elseif(_cpu_model EQUAL 142 OR _cpu_model EQUAL 158 OR _cpu_model EQUAL 165) + set(TARGET_ARCHITECTURE "kabylake") elseif(_cpu_model EQUAL 140) set(TARGET_ARCHITECTURE "tigerlake") + elseif(_cpu_model EQUAL 125 OR _cpu_model EQUAL 126) + set(TARGET_ARCHITECTURE "icelake") + elseif(_cpu_model EQUAL 106 OR _cpu_model EQUAL 108) set(TARGET_ARCHITECTURE "icelake-avx512") - elseif(_cpu_model EQUAL 125 OR _cpu_model EQUAL 126) - set(TARGET_ARCHITECTURE "icelake") - elseif(_cpu_model EQUAL 102) set(TARGET_ARCHITECTURE "cannonlake") - elseif(_cpu_model EQUAL 142 OR _cpu_model EQUAL 158 OR _cpu_model EQUAL 165) # 8E, 9E - set(TARGET_ARCHITECTURE "kabylake") - - elseif(_cpu_model EQUAL 85) # 55 + elseif(_cpu_model EQUAL 85) if(_cpu_stepping LESS 5) set(TARGET_ARCHITECTURE "skylake-avx512") elseif(_cpu_stepping LESS 8) @@ -214,10 +224,10 @@ macro(OFA_AutodetectX86) set(TARGET_ARCHITECTURE "cooperlake") endif() - elseif(_cpu_model EQUAL 78 OR _cpu_model EQUAL 94) # 4E, 5E + elseif(_cpu_model EQUAL 78 OR _cpu_model EQUAL 94) set(TARGET_ARCHITECTURE "skylake") - elseif(_cpu_model EQUAL 61 OR _cpu_model EQUAL 71 OR _cpu_model EQUAL 79 OR _cpu_model EQUAL 86) # 3D, 47, 4F, 56 + elseif(_cpu_model EQUAL 61 OR _cpu_model EQUAL 71 OR _cpu_model EQUAL 79 OR _cpu_model EQUAL 86) set(TARGET_ARCHITECTURE "broadwell") elseif(_cpu_model EQUAL 60 OR _cpu_model EQUAL 69 OR _cpu_model EQUAL 70 OR _cpu_model EQUAL 63) @@ -284,13 +294,13 @@ macro(OFA_AutodetectX86) # 25 19h | Zen 3 if(_cpu_family EQUAL 25) # 19h - set(TARGET_ARCHITECTURE "zen3") # planned + set(TARGET_ARCHITECTURE "zen3") elseif(_cpu_family EQUAL 24) # 18h set(TARGET_ARCHITECTURE "zen") elseif(_cpu_family EQUAL 23) # 17h - if(_cpu_model LESS 64) + if(_cpu_model LESS 49) set(TARGET_ARCHITECTURE "zen") else() set(TARGET_ARCHITECTURE "zen2") @@ 
-349,13 +359,19 @@ macro(OFA_AutodetectArm) string(REGEX REPLACE ".*CPU revision[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_revision "${_cpuinfo}") string(REGEX REPLACE ".*Features[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - exec_program("/usr/sbin/sysctl -n -n hw.cputype hw.cputype hw.cpusubtype hw.cpufamily hw.cpusubfamily" OUTPUT_VARIABLE _sysctl_output_string) - string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) - list(GET _sysctl_output 0 _cpu_implementer) - list(GET _sysctl_output 1 _cpu_architecture) - list(GET _sysctl_output 2 _cpu_variant) - list(GET _sysctl_output 3 _cpu_part) - list(GET _sysctl_output 4 _cpu_revision) + exec_program("/usr/sbin/sysctl -n -n hw.cputype hw.cputype hw.cpusubtype hw.cpufamily hw.cpusubfamily" + OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) + if(NOT _error) + string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) + list(GET _sysctl_output 0 _cpu_implementer) + list(GET _sysctl_output 1 _cpu_architecture) + list(GET _sysctl_output 2 _cpu_variant) + list(GET _sysctl_output 3 _cpu_part) + list(GET _sysctl_output 4 _cpu_revision) + endif() + if(_error) + message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") + endif() elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") endif(CMAKE_SYSTEM_NAME STREQUAL "Linux") @@ -456,8 +472,26 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "cortex-m23") elseif(_cpu_part STREQUAL "0xd21") set(TARGET_ARCHITECTURE "cortex-m33") + elseif(_cpu_part STREQUAL "0xd40") + set(TARGET_ARCHITECTURE "neoverse-v1") + elseif(_cpu_part STREQUAL "0xd41") + set(TARGET_ARCHITECTURE "cortex-a78") + elseif(_cpu_part STREQUAL "0xd42") + set(TARGET_ARCHITECTURE "cortex-a78ae") + elseif(_cpu_part STREQUAL "0xd44") + set(TARGET_ARCHITECTURE "cortex-x1") + elseif(_cpu_part STREQUAL "0xd46") + set(TARGET_ARCHITECTURE "cortex-a510") + elseif(_cpu_part 
STREQUAL "0xd47") + set(TARGET_ARCHITECTURE "cortex-a710") + elseif(_cpu_part STREQUAL "0xd48") + set(TARGET_ARCHITECTURE "cortex-x2") + elseif(_cpu_part STREQUAL "0xd49") + set(TARGET_ARCHITECTURE "neoverse-n2") elseif(_cpu_part STREQUAL "0xd4a") set(TARGET_ARCHITECTURE "neoverse-e1") + elseif(_cpu_part STREQUAL "0xd4b") + set(TARGET_ARCHITECTURE "cortex-a78c") endif() elseif(_cpu_implementer STREQUAL "0x42") # Broadcom @@ -499,6 +533,10 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "tsv110") endif() + elseif(_cpu_implementer STREQUAL "0x49") # Infineon + + elseif(_cpu_implementer STREQUAL "0x4d") # Motorola/Freescale + elseif(_cpu_implementer STREQUAL "0x4e") # Nvidia if(_cpu_part STREQUAL "0x000") set(TARGET_ARCHITECTURE "denver") @@ -552,6 +590,13 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "marvell-pj4") endif() + elseif(_cpu_implementer STREQUAL "0x61") # Apple + if(_cpu_part STREQUAL "0x022") + set(TARGET_ARCHITECTURE "icestorm") + elseif(_cpu_part STREQUAL "0x023") + set(TARGET_ARCHITECTURE "firestorm") + endif() + elseif(_cpu_implementer STREQUAL "0x66") # Faraday if(_cpu_part STREQUAL "0x526") set(TARGET_ARCHITECTURE "fa526") @@ -604,6 +649,15 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "ipx1200") endif() + elseif(_cpu_implementer STREQUAL "0x70") # Phytium + if(_cpu_part STREQUAL "0x662") + set(TARGET_ARCHITECTURE "ftc662") + elseif(_cpu_part STREQUAL "0x663") + set(TARGET_ARCHITECTURE "ftc663") + endif() + + elseif(_cpu_implementer STREQUAL "0xc0") # Ampere + # Taken from /Library/Developer/CommandLineTools/SDKs/MacOSX11.1.sdk/System/Library/Frameworks/Kernel.framework/Versions/A/Headers/mach/machine.h elseif(_cpu_implementer STREQUAL "16777228" OR _cpu_implementer STREQUAL "0x100000C") # Apple ARM64 if(_cpu_part STREQUAL "0x1e2d6381" OR _cpu_part STREQUAL "506291073") # Swift (A6) @@ -625,7 +679,7 @@ macro(OFA_AutodetectArm) elseif(_cpu_part STREQUAL "0x1b588bb3" OR _cpu_part STREQUAL "458787763") # Firestorm Icestorm (M1) 
set(TARGET_ARCHITECTURE "apple-m1") endif() - endif() + endif() endmacro(OFA_AutodetectArm) macro(OFA_AutodetectPpc) @@ -655,13 +709,15 @@ macro(OFA_AutodetectHostArchitecture) elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(powerpc|ppc)64.*") OFA_AutodetectPpc() else() - message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") + message(WARNING "The CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}' is not supported by OptimizeForArchitecture.cmake") endif() endmacro(OFA_AutodetectHostArchitecture) macro(OFA_HandleX86Options) set(_march_flag_list) set(_available_vector_units_list) + + # Define macros for Intel macro(_nehalem) list(APPEND _march_flag_list "nehalem") list(APPEND _march_flag_list "corei7") @@ -772,6 +828,7 @@ macro(OFA_HandleX86Options) _goldmont_plus() endmacro() + # Intel if(TARGET_ARCHITECTURE STREQUAL "core") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3") @@ -845,6 +902,8 @@ macro(OFA_HandleX86Options) list(APPEND _march_flag_list "atom") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") + + # AMD elseif(TARGET_ARCHITECTURE STREQUAL "k8") list(APPEND _march_flag_list "k8") list(APPEND _available_vector_units_list "sse" "sse2") @@ -921,34 +980,46 @@ macro(OFA_HandleX86Options) list(APPEND _march_flag_list "barcelona") list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") + + # Others elseif(TARGET_ARCHITECTURE STREQUAL "generic") list(APPEND _march_flag_list "generic") elseif(TARGET_ARCHITECTURE STREQUAL "native") list(APPEND _march_flag_list "native") elseif(TARGET_ARCHITECTURE STREQUAL "none") # add this clause to remove it from the else clause + else() message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". 
Please set TARGET_ARCHITECTURE to a supported value.") endif() if(TARGET_ARCHITECTURE STREQUAL "native") if(MSVC) + # MSVC (on Windows) message(FATAL_ERROR, "MSVC does not support \"native\" flag.") - elseif(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") - # ICC (on Linux) - AddCompilerFlag("-xHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(CMAKE_CXX_COMPILER MATCHES "/(icl\\.exe)$") - # ICC (on Windows) - AddCompilerFlag("/QxHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(CMAKE_CXX_COMPILER MATCHES "/(pgcc|pgc\\+\\+)$") - # PGI (on Linux) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") + if(WIN32) + # Intel (on Windows) + AddCompilerFlag("/QxHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + else() + # Intel (on Linux) + AddCompilerFlag("-xHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC" + OR CMAKE_CXX_COMPILER_ID MATCHES "PGI") + # NVidia HPC / PGI (on Linux/Windows AddCompilerFlag("-tp=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(CMAKE_CXX_COMPILER MATCHES "/(suncc|sunCC)$") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "SunPro") # Sun/Oracle Studio (on Linux/Sun OS) AddCompilerFlag("-native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") + # Cray (on Linux) + message(FATAL_ERROR, "Cray compiler does not support \"native\" flag.") else() + # Others: GNU, Clang and variants AddCompilerFlag("-march=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) endif() + elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") set(_disable_vector_unit_list) set(_enable_vector_unit_list) @@ -1064,29 +1135,57 @@ macro(OFA_HandleX86Options) string(REPLACE "." 
"_" _flag "__${_flag}__") add_definitions("-D${_flag}") endforeach(_flag) - elseif(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") # ICC (on Linux) - set(OFA_map_knl "-xMIC-AVX512") - set(OFA_map_knm "-xMIC-AVX512") - set(OFA_map_sapphirerapids "-xCORE-AVX512") - set(OFA_map_alderlake "-xCORE-AVX512") - set(OFA_map_tigerlake "-xCORE-AVX512") - set(OFA_map_icelake-avx512 "-xCORE-AVX512") - set(OFA_map_icelake "-xCORE-AVX512") - set(OFA_map_cannonlake "-xCORE-AVX512") - set(OFA_map_cooperlake "-xCORE-AVX512") - set(OFA_map_cascadelake "-xCORE-AVX512") - set(OFA_map_skylake-avx512 "-xCORE-AVX512") - set(OFA_map_skylake "-xCORE-AVX2") - set(OFA_map_broadwell "-xCORE-AVX2") - set(OFA_map_haswell "-xCORE-AVX2") - set(OFA_map_ivybridge "-xCORE-AVX-I") - set(OFA_map_sandybridge "-xAVX") - set(OFA_map_westmere "-xSSE4.2") - set(OFA_map_nehalem "-xSSE4.2") - set(OFA_map_penryn "-xSSSE3") - set(OFA_map_merom "-xSSSE3") - set(OFA_map_core2 "-xSSE3") - set(_ok FALSE) + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") # TEST ADVANCED KEYWORDS!!! 
+ if(WIN32) + # Intel (on Windows) + set(OFA_map_knl "-QxMIC-AVX512") + set(OFA_map_knm "-QxMIC-AVX512") + set(OFA_map_sapphirerapids "-QxCORE-AVX512") + set(OFA_map_alderlake "-QxCORE-AVX512") + set(OFA_map_tigerlake "-QxCORE-AVX512") + set(OFA_map_icelake-avx512 "-QxCORE-AVX512") + set(OFA_map_icelake "-QxCORE-AVX512") + set(OFA_map_cannonlake "-QxCORE-AVX512") + set(OFA_map_cooperlake "-QxCORE-AVX512") + set(OFA_map_cascadelake "-QxCORE-AVX512") + set(OFA_map_skylake-avx512 "-QxCORE-AVX512") + set(OFA_map_skylake "-QxCORE-AVX2") + set(OFA_map_broadwell "-QxCORE-AVX2") + set(OFA_map_haswell "-QxCORE-AVX2") + set(OFA_map_ivybridge "-QxCORE-AVX-I") + set(OFA_map_sandybridge "-QxAVX") + set(OFA_map_westmere "-QxSSE4.2") + set(OFA_map_nehalem "-QxSSE4.2") + set(OFA_map_penryn "-QxSSSE3") + set(OFA_map_merom "-QxSSSE3") + set(OFA_map_core2 "-QxSSE3") + set(_ok FALSE) + else() + # Intel (in Linux) + set(OFA_map_knl "-xMIC-AVX512") + set(OFA_map_knm "-xMIC-AVX512") + set(OFA_map_sapphirerapids "-xCORE-AVX512") + set(OFA_map_alderlake "-xCORE-AVX512") + set(OFA_map_tigerlake "-xCORE-AVX512") + set(OFA_map_icelake-avx512 "-xCORE-AVX512") + set(OFA_map_icelake "-xCORE-AVX512") + set(OFA_map_cannonlake "-xCORE-AVX512") + set(OFA_map_cooperlake "-xCORE-AVX512") + set(OFA_map_cascadelake "-xCORE-AVX512") + set(OFA_map_skylake-avx512 "-xCORE-AVX512") + set(OFA_map_skylake "-xCORE-AVX2") + set(OFA_map_broadwell "-xCORE-AVX2") + set(OFA_map_haswell "-xCORE-AVX2") + set(OFA_map_ivybridge "-xCORE-AVX-I") + set(OFA_map_sandybridge "-xAVX") + set(OFA_map_westmere "-xSSE4.2") + set(OFA_map_nehalem "-xSSE4.2") + set(OFA_map_penryn "-xSSSE3") + set(OFA_map_merom "-xSSSE3") + set(OFA_map_core2 "-xSSE3") + set(_ok FALSE) + endif() foreach(arch ${_march_flag_list}) if(DEFINED OFA_map_${arch}) AddCompilerFlag(${OFA_map_${arch}} CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _ok) @@ -1098,9 +1197,17 @@ macro(OFA_HandleX86Options) if(NOT _ok) # This is the Intel compiler, so SSE2 is a very 
reasonable baseline. message(STATUS "Did not recognize the requested architecture flag, falling back to SSE2") - AddCompilerFlag("-xSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + if(WIN32) + AddCompilerFlag("-QxSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + else() + AddCompilerFlag("-xSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() endif() - else() # not MSVC and not ICC => GCC, Clang, Open64 + + # TEST PGI/Cray/SunPro ... + + else() + # Others: GNU, Clang and variants foreach(_flag ${_march_flag_list}) AddCompilerFlag("-march=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) if(_good) @@ -1169,14 +1276,17 @@ macro(OFA_HandleX86Options) # Compile code with profiling instrumentation if(TARGET_PROFILER STREQUAL "gprof") AddCompilerFlag("-pg" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(TARGET_PROFILER STREQUAL "vtune" AND CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") - AddCompilerFlag("-g" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-debug inline-debug-info" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-D TBB_USE_THREADING_TOOLS" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-parallel-source-info=2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-gline-tables-only" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-fdebug-info-for-profiling" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-Xsprofile" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + elseif(TARGET_PROFILER STREQUAL "vtune") + if (CMAKE_CXX_COMPILER_ID MATCHES "Intel") + # Need to check if this also works on Windows + AddCompilerFlag("-g" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-debug inline-debug-info" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-D TBB_USE_THREADING_TOOLS" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-parallel-source-info=2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-gline-tables-only" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-fdebug-info-for-profiling" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + 
AddCompilerFlag("-Xsprofile" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() endif() endmacro(OFA_HandleX86Options) @@ -1185,7 +1295,8 @@ macro(OFA_HandleArmOptions) set(_mtune_flag_list) set(_available_vector_units_list) - if(TARGET_ARCHITECTURE STREQUAL "strongarm") # ARM + # ARM + if(TARGET_ARCHITECTURE STREQUAL "strongarm") list(APPEND _mtune_flag_list "strongarm") list(APPEND _march_flag_list "armv4") elseif(TARGET_ARCHITECTURE STREQUAL "arm8") @@ -1330,6 +1441,14 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "mp" "sec" "vfpv3-d16" "vfpv3" "vfpv3-d16-fp16" "vfpv3-fp16" "vfpv4-d16" "vfpv4" "simd" "neon-fp16" "neon-vfpv4") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78") + list(APPEND _mtune_flag_list "cortex-a78") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a5") list(APPEND _mtune_flag_list "cortex-a5") list(APPEND _march_flag_list "armv7-a") @@ -1469,30 +1588,51 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "fp16" "dotprod") - - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4") - list(APPEND _mtune_flag_list "cortex-r4") - list(APPEND _march_flag_list "armv7-r") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4f") - list(APPEND _mtune_flag_list "cortex-r4f") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r5") - list(APPEND _mtune_flag_list "cortex-r5") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "idiv" "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r7") - list(APPEND _mtune_flag_list "cortex-r7") - list(APPEND _march_flag_list 
"armv7-r") - list(APPEND _available_vector_units_list "idiv" "vfpv3-d16-fp16") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r8") - list(APPEND _mtune_flag_list "cortex-r8") - list(APPEND _march_flag_list "armv7-r") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r52") - list(APPEND _mtune_flag_list "cortex-r52") - list(APPEND _march_flag_list "armv8-r") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "crc" "simd" "idiv" "vfpv3-d16-fp16") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78") + list(APPEND _mtune_flag_list "cortex-a78") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78ae") + list(APPEND _mtune_flag_list "cortex-a78ae") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78c") + list(APPEND _mtune_flag_list "cortex-a78c") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a710") + list(APPEND _mtune_flag_list "cortex-a510") + list(APPEND _march_flag_list "armv9-a") + list(APPEND _march_flag_list "armv8.6-a") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND 
_available_vector_units_list "bf16" "fp16" "i8mm") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a710") + list(APPEND _mtune_flag_list "cortex-a710") + list(APPEND _march_flag_list "armv9-a") + list(APPEND _march_flag_list "armv8.6-a") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m0") list(APPEND _mtune_flag_list "cortex-m0") @@ -1535,22 +1675,85 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-m") list(APPEND _available_vector_units_list "mve.fp" "fp.dp") - elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n1") - list(APPEND _mtune_flag_list "neoverse-n1") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4") + list(APPEND _mtune_flag_list "cortex-r4") + list(APPEND _march_flag_list "armv7-r") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4f") + list(APPEND _mtune_flag_list "cortex-r4f") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_vector_units_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r5") + list(APPEND _mtune_flag_list "cortex-r5") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_vector_units_list "idiv" "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r7") + list(APPEND _mtune_flag_list "cortex-r7") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_vector_units_list "idiv" "vfpv3-d16-fp16") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r8") + list(APPEND _mtune_flag_list "cortex-r8") + list(APPEND _march_flag_list "armv7-r") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r52") + list(APPEND _mtune_flag_list "cortex-r52") + list(APPEND _march_flag_list "armv8-r") + list(APPEND _march_flag_list "armv7-r") + 
list(APPEND _available_vector_units_list "crc" "simd" "idiv" "vfpv3-d16-fp16") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-x1") + list(APPEND _mtune_flag_list "cortex-x1") list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-x2") + list(APPEND _march_flag_list "armv9-a") + list(APPEND _march_flag_list "armv8.6-a") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-e1") list(APPEND _mtune_flag_list "neoverse-e1") list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dorprod") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n1") + list(APPEND _mtune_flag_list "neoverse-n1") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n2") + list(APPEND _mtune_flag_list "neoverse-n2") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND 
_march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-v1") + list(APPEND _mtune_flag_list "neoverse-v1") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") - elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b15") # Broadcom + # Broadcom + elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b15") list(APPEND _mtune_flag_list "brahma-b15") elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b53") list(APPEND _mtune_flag_list "brahma-b53") @@ -1561,7 +1764,8 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderx") # Cavium + # Cavium + elseif(TARGET_ARCHITECTURE STREQUAL "thunderx") list(APPEND _mtune_flag_list "thunderx") list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") @@ -1588,14 +1792,16 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "crc" "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "strongarm110") # DEC + # DEC + elseif(TARGET_ARCHITECTURE STREQUAL "strongarm110") list(APPEND _mtune_flag_list "strongarm110") list(APPEND _march_flag_list "armv4") elseif(TARGET_ARCHITECTURE STREQUAL "strongarm1100") list(APPEND _mtune_flag_list "strongarm1100") list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "a64fx") # FUJITSU + # FUJITSU + elseif(TARGET_ARCHITECTURE STREQUAL "a64fx") list(APPEND _mtune_flag_list "a64fx") list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") @@ -1603,7 +1809,8 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND 
_available_vector_units_list "fp16" "sve") - elseif(TARGET_ARCHITECTURE STREQUAL "tsv110") # HiSilicon + # HiSilicon + elseif(TARGET_ARCHITECTURE STREQUAL "tsv110") list(APPEND _mtune_flag_list "tsv110") list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") @@ -1611,7 +1818,8 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "aes" "crypto" "fp16" "sha2") - elseif(TARGET_ARCHITECTURE STREQUAL "denver") # Nvidia + # Nvidia + elseif(TARGET_ARCHITECTURE STREQUAL "denver") list(APPEND _mtune_flag_list "denver") list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") @@ -1629,12 +1837,14 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - elseif(TARGET_ARCHITECTURE STREQUAL "xgene1") # APM + # APM + elseif(TARGET_ARCHITECTURE STREQUAL "xgene1") list(APPEND _mtune_flag_list "xgene1") list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "scorpion") # Qualcomm + # Qualcomm + elseif(TARGET_ARCHITECTURE STREQUAL "scorpion") list(APPEND _mtune_flag_list "scorpion") list(APPEND _march_flag_list "armv7-a") elseif(TARGET_ARCHITECTURE STREQUAL "krait") @@ -1665,13 +1875,15 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - elseif(TARGET_ARCHITECTURE STREQUAL "exynos-m1") # Samsung + # Samsung + elseif(TARGET_ARCHITECTURE STREQUAL "exynos-m1") list(APPEND _mtune_flag_list "exynos-m1") list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "marvell-f") # Marvell + # Marvell + elseif(TARGET_ARCHITECTURE STREQUAL "marvell-f") list(APPEND _mtune_flag_list "marvell-f") list(APPEND 
_march_flag_list "armv5te") elseif(TARGET_ARCHITECTURE STREQUAL "marvell-pj4") @@ -1679,7 +1891,8 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "mp" "sec" "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "i80200") # Intel + # Intel + elseif(TARGET_ARCHITECTURE STREQUAL "i80200") list(APPEND _mtune_flag_list "i80200") elseif(TARGET_ARCHITECTURE STREQUAL "pxa250a") list(APPEND _mtune_flag_list "pxa250a") @@ -1722,13 +1935,14 @@ macro(OFA_HandleArmOptions) elseif(TARGET_ARCHITECTURE STREQUAL "ipx1200") list(APPEND _mtune_flag_list "ipx1200") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a6") # Apple + # Apple + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a6") list(APPEND _mtune_flag_list "apple-a6") list(APPEND _march_flag_list "armv7-a") elseif(TARGET_ARCHITECTURE STREQUAL "apple-a7") list(APPEND _mtune_flag_list "apple-a7") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") + list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") elseif(TARGET_ARCHITECTURE STREQUAL "apple-a8") list(APPEND _mtune_flag_list "apple-a8") list(APPEND _march_flag_list "armv8-a") @@ -1747,14 +1961,14 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "lse" "neon" "ras" "rdm" "sha2" "zcm" "zcz") + list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "lse" "neon" "ras" "rdm" "sha2" "zcm" "zcz") elseif(TARGET_ARCHITECTURE STREQUAL "apple-a12") list(APPEND _mtune_flag_list "apple-a12") list(APPEND _march_flag_list "armv8.3-a") list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" 
"rcpc" "rdm" "sha2" "zcm" "zcz") + list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") elseif(TARGET_ARCHITECTURE STREQUAL "apple-a13") list(APPEND _mtune_flag_list "apple-a13") list(APPEND _march_flag_list "armv8.4-a") @@ -1762,7 +1976,7 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "dotprod" "fp" "fp16" "fp16fml" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "sha3" "sm4" "zcm" "zcz") + list(APPEND _available_vector_units_list "aes" "crc" "crypto" "dotprod" "fp" "fp16" "fp16fml" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "sha3" "sm4" "zcm" "zcz") elseif(TARGET_ARCHITECTURE STREQUAL "apple-m1") list(APPEND _mtune_flag_list "vortex") list(APPEND _march_flag_list "armv8.3-a") @@ -1770,13 +1984,15 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") - + + # Others + elseif(TARGET_ARCHITECTURE STREQUAL "generic") + list(APPEND _march_flag_list "generic") elseif(TARGET_ARCHITECTURE STREQUAL "native") list(APPEND _march_flag_list "native") - elseif(TARGET_ARCHITECTURE STREQUAL "none") # add this clause to remove it from the else clause - + else() message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.") endif() @@ -1786,135 +2002,155 @@ macro(OFA_HandleArmOptions) # we first try to only use the -mcpu flag. If that fails, e.g., of # the compiler does not yet support the specified target, we try to # set the -march and -mtune flags as fallback option. 
- + if(TARGET_ARCHITECTURE STREQUAL "native") - AddCompilerFlag("-mcpu=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + if(MSVC) + # MSVC (on Windows) + message(FATAL_ERROR, "MSVC does not support \"native\" flag.") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC" + OR CMAKE_CXX_COMPILER_ID MATCHES "PGI") + # NVidia HPC / PGI (on Linux/Windows + AddCompilerFlag("-tp=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") + # Cray (on Linux) + message(FATAL_ERROR, "Cray compiler does not support \"native\" flag.") + else() + # Others: GNU, Clang and variants + AddCompilerFlag("-mcpu=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") - foreach(_flag ${_mtune_flag_list}) - - AddCompilerFlag("-mcpu=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - if(_good) - break() - endif(_good) - endforeach(_flag) - - if(NOT _good) - set(_disable_vector_unit_list) - set(_enable_vector_unit_list) - - set(_aes_broken false) - set(_bf16_broken false) - set(_crc_broken false) - set(_crypto_broken false) - set(_dotprod_broken false) - set(_dsp_broken false) - set(_fp16_broken false) - set(_fp16fml_broken false) - set(_fp_broken false) - set(_fp_dp_broken false) - set(_fp_sp_broken false) - set(_i8mm_broken false) - set(_idiv_broken false) - set(_lse_broken false) - set(_mve_broken false) - set(_mve_fp_broken false) - set(_neon_broken false) - set(_neon_fp16_broken false) - set(_neon_vfpv4_broken false) - set(_ras_broken false) - set(_rcpc_broken false) - set(_rdm_broken false) - set(_rdma_broken false) - set(_sec_broken false) - set(_sha2_broken false) - set(_sha3_broken false) - set(_simd_broken false) - set(_sm4_broken false) - set(_sve_broken false) - set(_vfpv3_broken false) - set(_vfpv3_d16_broken false) - set(_vfpv3_d16_fp16_broken false) - set(_vfpv3_fp16_broken false) - set(_vfpv4_broken false) - set(_vfpv4_d16_broken false) - set(_zcm_broken false) - set(_zcz_broken false) - - 
macro(_enable_or_disable _name _flag _documentation _broken) - if(_broken) - set(_found false) - else() - _my_find(_available_vector_units_list "${_flag}" _found) - endif() - set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) - mark_as_advanced(USE_${_name}) - if(USE_${_name}) - list(APPEND _enable_vector_unit_list "${_flag}") - else() - list(APPEND _disable_vector_unit_list "${_flag}") - endif() - endmacro() - - _enable_or_disable(AES "aes" "Use AES. This will enable the aes and pmull crypto extension." _aes_broken) - _enable_or_disable(BF16 "bf16" "Use BF16. This will enable the brain half-precision floating-point instructions." _bf16_broken) - _enable_or_disable(CRC "crc" "Use CRC. This will enable the Cyclic Redundancy Check (CRC) instructions." _crc_broken) - _enable_or_disable(CRYPTO "crypto" "Use CRYPTO. This will enable the cryptographic instructions." _crypto_broken) - _enable_or_disable(DOTPROD "dotprod" "Use DOTPROD. This will enable the Dot Product extension. This also enables Advanced SIMD instructions." _dotprod_broken) - _enable_or_disable(DSP "dsp" "Use DSP. This will enable the DSP instruction." _dsp_broken) - _enable_or_disable(FP "fp" "Use FP. This will enable the floating-point data processing instructions." _fp_broken) - _enable_or_disable(FP16 "fp16" "Use FP16. This will enable the half-precision floating-point data processing instructions." _fp16_broken) - _enable_or_disable(FP16FML "fp16fml" "Use FP16FML. This will enable the FP16 fmla extension." _fp16fml_broken) - _enable_or_disable(FP_DP "fp.dp" "Use FP.DP. This will enable the single- and double-precision floating-point instructions." _fp_dp_broken) - _enable_or_disable(FP_SP "fp.sp" "Use FP.SP. This will enable the single-precision floating-point instructions." _fp_sp_broken) - _enable_or_disable(I8MM "i8mm" "Use I8MM. This will enable the 8-bit Integer Matrix Multiply instructions." _i8mm_broken) - _enable_or_disable(IDIV "idiv" "Use IDIV. 
This will enable the ARM-state integer division instructions." _idiv_broken) - _enable_or_disable(LSE "lse" "Use LSE. This will enable the Large System Extension instructions." _lse_broken) - _enable_or_disable(MVE "mve" "Use MVE. This will enable the M-Profile Vector Extension (MVE) integer instructions." _mve_broken) - _enable_or_disable(MVE_FP "mve.fp" "Use MVE.FP. This will enable the M-Profile Vector Extension (MVE) integer and single precision floating-point instructions." _mve_fp_broken) - _enable_or_disable(NEON "neon" "Use NEON. This will enable the Advanced SIMD (Neon) v1." _neon_broken) - _enable_or_disable(NEON_FP16 "neon-fp16" "Use NEON-FP16. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions, with the half-precision floating-point conversion operations." _neon_fp16_broken) - _enable_or_disable(NEON_VFPV4 "neon-vfpv4" "Use NEON-VFPV4. This will enable the Advanced SIMD (Neon) v2 and the VFPv4 floating-point instructions." _neon_vfpv4_broken) - _enable_or_disable(RAS "ras" "Use RAS. This will enable the Reliability, Availability, and Serviceability extension." _ras_broken) - _enable_or_disable(RCPC "rcpc" "Use RCPC. This will enable the RcPc extension." _rcpc_broken) - _enable_or_disable(RDM "rdm" "Use RDM. This will enable the RDM extension." _rdm_broken) - _enable_or_disable(RDMA "rdma" "Use RDMA. This will enable the Round Double Multiply Accumulate instructions." _rdma_broken) - _enable_or_disable(SEC "sec" "Use SEC. This will enable the security extension." _sec_broken) - _enable_or_disable(SHA2 "sha2" "Use SHA2. This will enable the sha2 crypto extension." _sha2_broken) - _enable_or_disable(SHA3 "sha3" "Use SHA3. This will enable the sha512 and sha3 crypto extension." _sha3_broken) - _enable_or_disable(SIMD "simd" "Use SIMD. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions." _simd_broken) - _enable_or_disable(SM4 "sm4" "Use SM4. 
This will enable the the sm3 and sm4 crypto extension." _sm4_broken) - _enable_or_disable(SVE "sve" "Use SVE. This will enable the Scalable Vector Extension instructions." _sve_broken) - _enable_or_disable(VFPV3 "vfpv3" "Use VPFV3. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers." _vfpv3_broken) - _enable_or_disable(VFPV3_D16 "vfpv3-d16" "Use VPFV3-16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_broken) - _enable_or_disable(VFPV3_D16_FP16 "vfpv3-d16-fp16" "Use VPFV3-D16-FP16. This will enable VFPv3 floating-point instructions, with 16 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_fp16_broken) - _enable_or_disable(VFPV3_FP16 "vfpv3-fp16" "Use VPFV3-FP16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_fp16_broken) - _enable_or_disable(VFPV4 "vfpv4" "Use VPFV4. This will enable the VFPv4 floating-point instructions, with 32 double-precision registers." _vfpv4_broken) - _enable_or_disable(VFPV4_D16 "vfpv4-d16" "Use VPFV4-D16. This will enable the VFPv4 floating-point instructions, with 16 double-precision registers." _vfpv4_dp16_broken) - _enable_or_disable(ZCM "zcm" "Use ZCM. This will enable the ZCM extension." _zcm_broken) - _enable_or_disable(ZCZ "zcz" "Use ZCZ. This will enable the ZCZ extension." 
_zcz_broken) - foreach(_march ${_march_flag_list}) - - AddCompilerFlag("-march=${_march}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) - if(_good) - set(_march_plus_extensions "${_march}") - foreach(_flag ${_enable_vector_unit_list}) - AddCompilerFlag("-march=${_march_plus_extensions}+${_flag}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) - if(_good) - set(_march_plus_extensions "${_march_plus_extensions}+${_flag}") - endif(_good) - endforeach(_flag) - AddCompilerFlag("-march=${_march_plus_extensions}" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - break() - endif(_good) - endforeach(_march) - foreach(_mtune ${_mtune_flag_list}) - AddCompilerFlag("-mtune=${_mtune}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - if(_good) + if(MSVC) + # MSVC on ARM64 needs to be done + + else() + # Others: GNU, Clang and variants + foreach(_flag ${_mtune_flag_list}) + AddCompilerFlag("-mcpu=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + if(_good) break() - endif(_good) - endforeach(_mtune) - endif(NOT _good) + endif(_good) + endforeach(_flag) + + if(NOT _good) + set(_disable_vector_unit_list) + set(_enable_vector_unit_list) + + set(_aes_broken false) + set(_bf16_broken false) + set(_crc_broken false) + set(_crypto_broken false) + set(_dotprod_broken false) + set(_dsp_broken false) + set(_fp16_broken false) + set(_fp16fml_broken false) + set(_fp_broken false) + set(_fp_dp_broken false) + set(_fp_sp_broken false) + set(_i8mm_broken false) + set(_idiv_broken false) + set(_lse_broken false) + set(_mve_broken false) + set(_mve_fp_broken false) + set(_neon_broken false) + set(_neon_fp16_broken false) + set(_neon_vfpv4_broken false) + set(_ras_broken false) + set(_rcpc_broken false) + set(_rdm_broken false) + set(_rdma_broken false) + set(_sec_broken false) + set(_sha2_broken false) + set(_sha3_broken false) + set(_simd_broken false) + set(_sm4_broken false) + set(_sve_broken false) + set(_vfpv3_broken false) + set(_vfpv3_d16_broken false) + set(_vfpv3_d16_fp16_broken false) + 
set(_vfpv3_fp16_broken false) + set(_vfpv4_broken false) + set(_vfpv4_d16_broken false) + set(_zcm_broken false) + set(_zcz_broken false) + + macro(_enable_or_disable _name _flag _documentation _broken) + if(_broken) + set(_found false) + else() + _my_find(_available_vector_units_list "${_flag}" _found) + endif() + set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) + mark_as_advanced(USE_${_name}) + if(USE_${_name}) + list(APPEND _enable_vector_unit_list "${_flag}") + else() + list(APPEND _disable_vector_unit_list "${_flag}") + endif() + endmacro() + + _enable_or_disable(AES "aes" "Use AES. This will enable the aes and pmull crypto extension." _aes_broken) + _enable_or_disable(BF16 "bf16" "Use BF16. This will enable the brain half-precision floating-point instructions." _bf16_broken) + _enable_or_disable(CRC "crc" "Use CRC. This will enable the Cyclic Redundancy Check (CRC) instructions." _crc_broken) + _enable_or_disable(CRYPTO "crypto" "Use CRYPTO. This will enable the cryptographic instructions." _crypto_broken) + _enable_or_disable(DOTPROD "dotprod" "Use DOTPROD. This will enable the Dot Product extension. This also enables Advanced SIMD instructions." _dotprod_broken) + _enable_or_disable(DSP "dsp" "Use DSP. This will enable the DSP instruction." _dsp_broken) + _enable_or_disable(FP "fp" "Use FP. This will enable the floating-point data processing instructions." _fp_broken) + _enable_or_disable(FP16 "fp16" "Use FP16. This will enable the half-precision floating-point data processing instructions." _fp16_broken) + _enable_or_disable(FP16FML "fp16fml" "Use FP16FML. This will enable the FP16 fmla extension." _fp16fml_broken) + _enable_or_disable(FP_DP "fp.dp" "Use FP.DP. This will enable the single- and double-precision floating-point instructions." _fp_dp_broken) + _enable_or_disable(FP_SP "fp.sp" "Use FP.SP. This will enable the single-precision floating-point instructions." _fp_sp_broken) + _enable_or_disable(I8MM "i8mm" "Use I8MM. 
This will enable the 8-bit Integer Matrix Multiply instructions." _i8mm_broken) + _enable_or_disable(IDIV "idiv" "Use IDIV. This will enable the ARM-state integer division instructions." _idiv_broken) + _enable_or_disable(LSE "lse" "Use LSE. This will enable the Large System Extension instructions." _lse_broken) + _enable_or_disable(MVE "mve" "Use MVE. This will enable the M-Profile Vector Extension (MVE) integer instructions." _mve_broken) + _enable_or_disable(MVE_FP "mve.fp" "Use MVE.FP. This will enable the M-Profile Vector Extension (MVE) integer and single precision floating-point instructions." _mve_fp_broken) + _enable_or_disable(NEON "neon" "Use NEON. This will enable the Advanced SIMD (Neon) v1." _neon_broken) + _enable_or_disable(NEON_FP16 "neon-fp16" "Use NEON-FP16. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions, with the half-precision floating-point conversion operations." _neon_fp16_broken) + _enable_or_disable(NEON_VFPV4 "neon-vfpv4" "Use NEON-VFPV4. This will enable the Advanced SIMD (Neon) v2 and the VFPv4 floating-point instructions." _neon_vfpv4_broken) + _enable_or_disable(RAS "ras" "Use RAS. This will enable the Reliability, Availability, and Serviceability extension." _ras_broken) + _enable_or_disable(RCPC "rcpc" "Use RCPC. This will enable the RcPc extension." _rcpc_broken) + _enable_or_disable(RDM "rdm" "Use RDM. This will enable the RDM extension." _rdm_broken) + _enable_or_disable(RDMA "rdma" "Use RDMA. This will enable the Round Double Multiply Accumulate instructions." _rdma_broken) + _enable_or_disable(SEC "sec" "Use SEC. This will enable the security extension." _sec_broken) + _enable_or_disable(SHA2 "sha2" "Use SHA2. This will enable the sha2 crypto extension." _sha2_broken) + _enable_or_disable(SHA3 "sha3" "Use SHA3. This will enable the sha512 and sha3 crypto extension." _sha3_broken) + _enable_or_disable(SIMD "simd" "Use SIMD. 
This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions." _simd_broken)
+      _enable_or_disable(SM4 "sm4" "Use SM4. This will enable the sm3 and sm4 crypto extension." _sm4_broken)
+      _enable_or_disable(SVE "sve" "Use SVE. This will enable the Scalable Vector Extension instructions." _sve_broken)
+      # VFP variants: the "-d16" extensions provide 16 double-precision
+      # registers, the others 32; only the "-fp16" extensions add the
+      # half-precision conversion operations.
+      _enable_or_disable(VFPV3 "vfpv3" "Use VFPV3. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers." _vfpv3_broken)
+      _enable_or_disable(VFPV3_D16 "vfpv3-d16" "Use VFPV3-D16. This will enable the VFPv3 floating-point instructions, with 16 double-precision registers." _vfpv3_d16_broken)
+      _enable_or_disable(VFPV3_D16_FP16 "vfpv3-d16-fp16" "Use VFPV3-D16-FP16. This will enable VFPv3 floating-point instructions, with 16 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_fp16_broken)
+      _enable_or_disable(VFPV3_FP16 "vfpv3-fp16" "Use VFPV3-FP16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_fp16_broken)
+      _enable_or_disable(VFPV4 "vfpv4" "Use VFPV4. This will enable the VFPv4 floating-point instructions, with 32 double-precision registers." _vfpv4_broken)
+      # The flag variable initialised for this extension is
+      # _vfpv4_d16_broken (not "_vfpv4_dp16_broken", which is never set).
+      _enable_or_disable(VFPV4_D16 "vfpv4-d16" "Use VFPV4-D16. This will enable the VFPv4 floating-point instructions, with 16 double-precision registers." _vfpv4_d16_broken)
+      _enable_or_disable(ZCM "zcm" "Use ZCM. This will enable the ZCM extension." _zcm_broken)
+      _enable_or_disable(ZCZ "zcz" "Use ZCZ. This will enable the ZCZ extension."
_zcz_broken) + foreach(_march ${_march_flag_list}) + + AddCompilerFlag("-march=${_march}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) + if(_good) + set(_march_plus_extensions "${_march}") + foreach(_flag ${_enable_vector_unit_list}) + AddCompilerFlag("-march=${_march_plus_extensions}+${_flag}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) + if(_good) + set(_march_plus_extensions "${_march_plus_extensions}+${_flag}") + endif(_good) + endforeach(_flag) + AddCompilerFlag("-march=${_march_plus_extensions}" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + break() + endif(_good) + endforeach(_march) + + foreach(_mtune ${_mtune_flag_list}) + AddCompilerFlag("-mtune=${_mtune}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + if(_good) + break() + endif(_good) + endforeach(_mtune) + endif(NOT _good) + endif() endif() # Compile code with profiling instrumentation @@ -1963,7 +2199,7 @@ macro(OptimizeForArchitecture) if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "i686.*|i386.*|x86.*|amd64.*|x86_64.*|AMD64.*") set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. Setting the value to \"native\" bypasses all checks and uses \"-march=native\" or the compiler equivalent flag. 
Other supported values are: \"none\", \"generic\", \"core\", \"merom\" (65nm Core2), \"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandybridge\", \"ivybridge\", \"haswell\", \"broadwell\", \"skylake\", \"skylake-xeon\", \"kabylake\", \"cannonlake\", \"cascadelake\", \"cooperlake\", \"icelake\", \"icelake-xeon\", \"tigerlake\", \"alderlake\", \"sapphirerapids\", \"bonnell\", \"silvermont\", \"goldmont\", \"goldmont-plus\", \"tremont\", \"knl\" (Knights Landing), \"knm\" (Knights Mill), \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\", \"piledriver\", \"steamroller\", \"excavator\", \"amd14h\", \"amd16h\", \"zen\", \"zen2\", \"zen3\"." ) elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(arm.*|ARM.*|aarch64.*|AARCH64.*)") - set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. Setting the value to \"native\" bypasses all checks and uses \"-march=native\" or the compiler equivalent flag. 
Other supported values are: \"none\", \"generic\", \"a64fx\", \"apple-a6\", \"apple-a7\", \"apple-a8\", \"apple-a9\", \"apple-a10\", \"apple-a11\", \"apple-a12\", \"apple-a13\", \"apple-m1\", \"arm1020e\", \"arm1020t\", \"arm1022e\", \"arm1026ej-s\", \"arm10e\", \"arm10tdmi\", \"arm1136j-s\", \"arm1136jf-s\", \"arm1156t2-s\", \"arm1156t2f-s\", \"arm1176jz-s\", \"arm1176jzf-s\", \"arm710t\", \"arm720t\", \"arm740t\", \"arm7tdmi-s\", \"arm7tdmi\", \"arm810\", \"arm8\", \"arm920\", \"arm920t\", \"arm922t\", \"arm926ej-s\", \"arm940t\", \"arm946e-s\", \"arm966e-s\", \"arm968e-s\", \"arm9\", \"arm9e\", \"arm9tdmi\", \"brahma-b15\", \"brahma-b53\", \"carmel\", \"cortex-a12\", \"cortex-a15.cortex-a7\", \"cortex-a15\", \"cortex-a17.cortex-a7\", \"cortex-a17\", \"cortex-a32\", \"cortex-a34\", \"cortex-a35\", \"cortex-a53\", \"cortex-a55\", \"cortex-a57.cortext-a53\", \"cortex-a57\", \"cortex-a5\", \"cortex-a72.cortext-a53\", \"cortex-a72\", \"cortex-a73.cortext-a35\", \"cortex-a73.cortext-a53\", \"cortex-a73\", \"cortex-a75.cortext-a55\", \"cortex-a75\", \"cortex-a76.cortext-a55\", \"cortex-a76\", \"cortex-a76ae\", \"cortex-a77\", \"cortex-a7\", \"cortex-a8\", \"cortex-a9\", \"cortex-m0\", \"cortex-m0plus\", \"cortex-m1\", \"cortex-m23\", \"cortex-m33\", \"cortex-m35p\", \"cortex-m3\", \"cortex-m4\", \"cortex-m55\", \"cortex-m7\", \"cortex-r4\", \"cortex-r4f\", \"cortex-r52\", \"cortex-r5\", \"cortex-r7\", \"cortex-r8\", \"denver2\", \"denver\", \"exynos-m1\", \"fa526\", \"fa606te\", \"fa626\", \"fa626te\", \"fa726te\", \"falkor\", \"fmp626\", \"generic-armv7-a\", \"i80200\", \"i80321-400-b0\", \"i80321-400\", \"i80321-600-b0\", \"i80321-600\", \"ipx1200\", \"ipx425-266\", \"ipx425-400\", \"ipx425-533\", \"iwmmxt2\", \"iwmmxt\", \"krait\", \"kryo2\", \"kryo\", \"marvell-f\", \"marvell-pj4\", \"mpcore\", \"neoverse-e1\", \"neoverse-n1\", \"pxa210a\", \"pxa210b\", \"pxa210c\", \"pxa250a\", \"pxa250b\", \"pxa250c\", \"pxa27x\", \"pxa30x\", \"pxa31x\", \"pxa32x\", \"pxa930\", 
\"sa1110\", \"saphira\", \"scorpion\", \"strongarm1100\", \"strongarm110\", \"strongarm\", \"thunderx2\", \"thunderx2t99\", \"thunderx\", \"thunderxt81\", \"thunderxt83\", \"thunderxt88\", \"tsv110\", \"xgene1\", \"xscale\".") + set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. Setting the value to \"native\" bypasses all checks and uses \"-march=native\" or the compiler equivalent flag. Other supported values are: \"none\", \"generic\", \"a64fx\", \"apple-a6\", \"apple-a7\", \"apple-a8\", \"apple-a9\", \"apple-a10\", \"apple-a11\", \"apple-a12\", \"apple-a13\", \"apple-m1\", \"arm1020e\", \"arm1020t\", \"arm1022e\", \"arm1026ej-s\", \"arm10e\", \"arm10tdmi\", \"arm1136j-s\", \"arm1136jf-s\", \"arm1156t2-s\", \"arm1156t2f-s\", \"arm1176jz-s\", \"arm1176jzf-s\", \"arm710t\", \"arm720t\", \"arm740t\", \"arm7tdmi-s\", \"arm7tdmi\", \"arm810\", \"arm8\", \"arm920\", \"arm920t\", \"arm922t\", \"arm926ej-s\", \"arm940t\", \"arm946e-s\", \"arm966e-s\", \"arm968e-s\", \"arm9\", \"arm9e\", \"arm9tdmi\", \"brahma-b15\", \"brahma-b53\", \"carmel\", \"cortex-a7\", \"cortex-a8\", \"cortex-a9\", \"cortex-a12\", \"cortex-a15.cortex-a7\", \"cortex-a15\", \"cortex-a17.cortex-a7\", \"cortex-a17\", \"cortex-a32\", \"cortex-a34\", \"cortex-a35\", \"cortex-a53\", \"cortex-a55\", \"cortex-a57.cortext-a53\", \"cortex-a57\", \"cortex-a5\", \"cortex-a72.cortext-a53\", \"cortex-a72\", \"cortex-a73.cortext-a35\", \"cortex-a73.cortext-a53\", \"cortex-a73\", \"cortex-a75.cortext-a55\", \"cortex-a75\", \"cortex-a76.cortext-a55\", \"cortex-a76\", \"cortex-a76ae\", \"cortex-a77\", \"cortex-a78\", \"cortex-a78ae\", \"cortex-a76c\", \"cortex-a510\", \"cortex-a710\", \"cortex-m0\", \"cortex-m0plus\", \"cortex-m1\", \"cortex-m23\", \"cortex-m33\", \"cortex-m35p\", 
\"cortex-m3\", \"cortex-m4\", \"cortex-m55\", \"cortex-m7\", \"cortex-r4\", \"cortex-r4f\", \"cortex-r52\", \"cortex-r5\", \"cortex-r7\", \"cortex-r8\", \"cortex-x1\", \"cortex-x2\", \"denver2\", \"denver\", \"exynos-m1\", \"fa526\", \"fa606te\", \"fa626\", \"fa626te\", \"fa726te\", \"falkor\", \"fmp626\", \"generic-armv7-a\", \"i80200\", \"i80321-400-b0\", \"i80321-400\", \"i80321-600-b0\", \"i80321-600\", \"ipx1200\", \"ipx425-266\", \"ipx425-400\", \"ipx425-533\", \"iwmmxt2\", \"iwmmxt\", \"krait\", \"kryo2\", \"kryo\", \"marvell-f\", \"marvell-pj4\", \"mpcore\", \"neoverse-e1\", \"neoverse-n1\", \"neoverse-n2\", \"neoverse-v1\", \"pxa210a\", \"pxa210b\", \"pxa210c\", \"pxa250a\", \"pxa250b\", \"pxa250c\", \"pxa27x\", \"pxa30x\", \"pxa31x\", \"pxa32x\", \"pxa930\", \"sa1110\", \"saphira\", \"scorpion\", \"strongarm1100\", \"strongarm110\", \"strongarm\", \"thunderx2\", \"thunderx2t99\", \"thunderx\", \"thunderxt81\", \"thunderxt83\", \"thunderxt88\", \"tsv110\", \"xgene1\", \"xscale\".") elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(powerpc|ppc)64.*") set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. 
Other supported values are: \"none\", \"generic\", \"power8\", \"power9\", \"power10\".") else() @@ -1971,7 +2207,7 @@ macro(OptimizeForArchitecture) endif() set(_force) if(NOT _last_target_arch STREQUAL "${TARGET_ARCHITECTURE}") - message(STATUS "target changed from \"${_last_target_arch}\" to \"${TARGET_ARCHITECTURE}\"") + message(STATUS "Target architecture changed from \"${_last_target_arch}\" to \"${TARGET_ARCHITECTURE}\"") set(_force FORCE) endif() set(_last_target_arch "${TARGET_ARCHITECTURE}" CACHE STRING "" FORCE) From 8e64aaeb7c061edebf6e6e46df670eab4f3c5e41 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 13 Jan 2022 19:14:15 +0100 Subject: [PATCH 134/174] Updated OFA --- cmake/OptimizeForArchitecture.cmake | 2 -- 1 file changed, 2 deletions(-) diff --git a/cmake/OptimizeForArchitecture.cmake b/cmake/OptimizeForArchitecture.cmake index 0bc7f984ad..daed1d1b11 100644 --- a/cmake/OptimizeForArchitecture.cmake +++ b/cmake/OptimizeForArchitecture.cmake @@ -2197,7 +2197,6 @@ macro(OFA_HandlePpcOptions) endmacro(OFA_HandlePpcOptions) macro(OptimizeForArchitecture) - message(STATUS "Detecting target architecture optimization") if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "i686.*|i386.*|x86.*|amd64.*|x86_64.*|AMD64.*") set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. Setting the value to \"native\" bypasses all checks and uses \"-march=native\" or the compiler equivalent flag. 
Other supported values are: \"none\", \"generic\", \"core\", \"merom\" (65nm Core2), \"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandybridge\", \"ivybridge\", \"haswell\", \"broadwell\", \"skylake\", \"skylake-xeon\", \"kabylake\", \"cannonlake\", \"cascadelake\", \"cooperlake\", \"icelake\", \"icelake-xeon\", \"tigerlake\", \"alderlake\", \"sapphirerapids\", \"bonnell\", \"silvermont\", \"goldmont\", \"goldmont-plus\", \"tremont\", \"knl\" (Knights Landing), \"knm\" (Knights Mill), \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\", \"piledriver\", \"steamroller\", \"excavator\", \"amd14h\", \"amd16h\", \"zen\", \"zen2\", \"zen3\"." ) elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(arm.*|ARM.*|aarch64.*|AARCH64.*)") @@ -2228,5 +2227,4 @@ macro(OptimizeForArchitecture) elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(powerpc|ppc)64.*") OFA_HandlePpcOptions() endif() - message(STATUS "Detecting target architecture optimization - done") endmacro(OptimizeForArchitecture) From 6c9d4d5d24b51b29c53e12086cb30d27e5e717e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Thu, 13 Jan 2022 21:28:28 +0100 Subject: [PATCH 135/174] Updated OFA --- cmake/OptimizeForArchitecture.cmake | 71 +++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 5 deletions(-) diff --git a/cmake/OptimizeForArchitecture.cmake b/cmake/OptimizeForArchitecture.cmake index daed1d1b11..a5883e1e83 100644 --- a/cmake/OptimizeForArchitecture.cmake +++ b/cmake/OptimizeForArchitecture.cmake @@ -92,6 +92,7 @@ macro(OFA_AutodetectX86) set(_cpu_family) set(_cpu_model) set(_cpu_stepping) + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") file(READ "/proc/cpuinfo" _cpuinfo) string(REGEX REPLACE ".*vendor_id[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _vendor_id "${_cpuinfo}") @@ -111,6 +112,57 @@ macro(OFA_AutodetectX86) list(GET _sysctl_output 4 _cpu_flags) string(TOLOWER "${_cpu_flags}" _cpu_flags) string(REPLACE "." 
"_" _cpu_flags "${_cpu_flags}") + else() + # Apple Silicon (ARM64) running in Rosetta 2 mode + exec_program("/usr/sbin/sysctl -n hw.cputype machdep.cpu.family hw.cpufamily machdep.cpu.features" + OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) + if(NOT _error) + string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) + list(GET _sysctl_output 0 _cpu_implementer) + list(GET _sysctl_output 1 _cpu_family) + list(GET _sysctl_output 2 _cpu_model) + list(GET _sysctl_output 3 _cpu_flags) + string(TOLOWER "${_cpu_flags}" _cpu_flags) + string(REPLACE "." "_" _cpu_flags "${_cpu_flags}") + + # Fake vendor + if(_cpu_implementer STREQUAL "0x7" OR _cpu_implementer STREQUAL "7") + set(_vendor_id "GenuineIntel") + else() + set(_vendor_id "Unknown") + endif() + + # Fake stepping + set(_cpu_stepping "Unknown") + + # Fake model + # Taken from /Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/System/Library/Frameworks/Kernel.framework/Versions/A/Headers/mach/machine.h + if( _cpu_model STREQUAL "0x78ea4fbc" OR _cpu_model STREQUAL "2028621756") # Penryn + set(_cpu_model "23") + elseif(_cpu_model STREQUAL "0x6b5a4cd2" OR _cpu_model STREQUAL "1801080018") # Nehalem + set(_cpu_model "26") + elseif(_cpu_model STREQUAL "0x573b5eec" OR _cpu_model STREQUAL "1463508716") # Westmere + set(_cpu_model "37") + elseif(_cpu_model STREQUAL "0x5490b78c" OR _cpu_model STREQUAL "1418770316") # Sandybridge + set(_cpu_model "42") + elseif(_cpu_model STREQUAL "0x1f65e835" OR _cpu_model STREQUAL "526772277") # Ivybridge + set(_cpu_model "58") + elseif(_cpu_model STREQUAL "0x10b282dc" OR _cpu_model STREQUAL "280134364") # Haswell + set(_cpu_model "60") + elseif(_cpu_model STREQUAL "0x582ed09c" OR _cpu_model STREQUAL "1479463068") # Broadwell + set(_cpu_model "61") + elseif(_cpu_model STREQUAL "0x37fc219f" OR _cpu_model STREQUAL "939270559") # Skylake + set(_cpu_model "78") + elseif(_cpu_model STREQUAL "0x0f817246" OR _cpu_model STREQUAL "260141638") # Kabylake + set(_cpu_model "142") + 
elseif(_cpu_model STREQUAL "0x38435547" OR _cpu_model STREQUAL "943936839") # Icelake + set(_cpu_model "125") + elseif(_cpu_model STREQUAL "0x1cf8a03e" OR _cpu_model STREQUAL "486055998") # Cometlake + set(_cpu_model "142") + else() + set(_cpu_model "Unknown") + endif() + endif() endif() if(_error) message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") @@ -193,7 +245,10 @@ macro(OFA_AutodetectX86) elseif(_cpu_model EQUAL 28 OR _cpu_model EQUAL 38 OR _cpu_model EQUAL 39 OR _cpu_model EQUAL 53 OR _cpu_model EQUAL 54) set(TARGET_ARCHITECTURE "bonnell") - # Big cores + # Big cores + elseif(_cpu_model EQUAL 167) + set(TARGET_ARCHITECTURE "rocketlake") + elseif(_cpu_model EQUAL 151 OR _cpu_model EQUAL 154) set(TARGET_ARCHITECTURE "alderlake") @@ -359,8 +414,9 @@ macro(OFA_AutodetectArm) string(REGEX REPLACE ".*CPU revision[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_revision "${_cpuinfo}") string(REGEX REPLACE ".*Features[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - exec_program("/usr/sbin/sysctl -n -n hw.cputype hw.cputype hw.cpusubtype hw.cpufamily hw.cpusubfamily" + exec_program("/usr/sbin/sysctl -n hw.cputype hw.cputype hw.cpusubtype hw.cpufamily hw.cpusubfamily" OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) + message(${_sysctl_output_string}) if(NOT _error) string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) list(GET _sysctl_output 0 _cpu_implementer) @@ -658,9 +714,9 @@ macro(OFA_AutodetectArm) elseif(_cpu_implementer STREQUAL "0xc0") # Ampere - # Taken from /Library/Developer/CommandLineTools/SDKs/MacOSX11.1.sdk/System/Library/Frameworks/Kernel.framework/Versions/A/Headers/mach/machine.h + # Taken from /Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/System/Library/Frameworks/Kernel.framework/Versions/A/Headers/mach/machine.h elseif(_cpu_implementer STREQUAL "16777228" OR _cpu_implementer STREQUAL 
"0x100000C") # Apple ARM64 - if(_cpu_part STREQUAL "0x1e2d6381" OR _cpu_part STREQUAL "506291073") # Swift (A6) + if( _cpu_part STREQUAL "0x1e2d6381" OR _cpu_part STREQUAL "506291073") # Swift (A6) set(TARGET_ARCHITECTURE "apple-a6") elseif(_cpu_part STREQUAL "0x37a09642" OR _cpu_part STREQUAL "933271106") # Cyclone (A7) set(TARGET_ARCHITECTURE "apple-a7") @@ -676,8 +732,9 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "apple-a12") elseif(_cpu_part STREQUAL "0x462504d2" OR _cpu_part STREQUAL "1176831186") # Lightning Thunder (A13) set(TARGET_ARCHITECTURE "apple-a13") - elseif(_cpu_part STREQUAL "0x1b588bb3" OR _cpu_part STREQUAL "458787763") # Firestorm Icestorm (M1) + elseif(_cpu_part STREQUAL "0x1b588bb3" OR _cpu_part STREQUAL "458787763") # Firestorm Icestorm (A14 / M1 / M1 Pro / M1 Max) set(TARGET_ARCHITECTURE "apple-m1") + elseif(_cpu_part STREQUAL "0xda33d83d" OR _cpu_part STREQUAL "3660830781") # Blizzard Avalanche (A15) endif() endif() endmacro(OFA_AutodetectArm) @@ -798,6 +855,10 @@ macro(OFA_HandleX86Options) list(APPEND _march_flag_list "sapphirerapids") _icelake_avx512() endmacro() + macro(_rocketlake) + list(APPEND _march_flag_list "rocketlake") + _icelake_avx512() + endmacro() macro(_knightslanding) list(APPEND _march_flag_list "knl") _broadwell() From b8c6e2af642c125b7d2c6250271b491259118e6e Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 13 Jan 2022 23:47:57 +0100 Subject: [PATCH 136/174] Updated OFA --- cmake/OptimizeForArchitecture.cmake | 160 ++++++++++++++++------------ 1 file changed, 90 insertions(+), 70 deletions(-) diff --git a/cmake/OptimizeForArchitecture.cmake b/cmake/OptimizeForArchitecture.cmake index a5883e1e83..4c01179fe7 100644 --- a/cmake/OptimizeForArchitecture.cmake +++ b/cmake/OptimizeForArchitecture.cmake @@ -92,7 +92,7 @@ macro(OFA_AutodetectX86) set(_cpu_family) set(_cpu_model) set(_cpu_stepping) - + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") file(READ "/proc/cpuinfo" _cpuinfo) string(REGEX REPLACE ".*vendor_id[ 
\t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _vendor_id "${_cpuinfo}") @@ -114,7 +114,7 @@ macro(OFA_AutodetectX86) string(REPLACE "." "_" _cpu_flags "${_cpu_flags}") else() # Apple Silicon (ARM64) running in Rosetta 2 mode - exec_program("/usr/sbin/sysctl -n hw.cputype machdep.cpu.family hw.cpufamily machdep.cpu.features" + exec_program("/usr/sbin/sysctl -n hw.cputype machdep.cpu.family hw.cpufamily machdep.cpu.features" OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) if(NOT _error) string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) @@ -134,7 +134,7 @@ macro(OFA_AutodetectX86) # Fake stepping set(_cpu_stepping "Unknown") - + # Fake model # Taken from /Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/System/Library/Frameworks/Kernel.framework/Versions/A/Headers/mach/machine.h if( _cpu_model STREQUAL "0x78ea4fbc" OR _cpu_model STREQUAL "2028621756") # Penryn @@ -248,19 +248,19 @@ macro(OFA_AutodetectX86) # Big cores elseif(_cpu_model EQUAL 167) set(TARGET_ARCHITECTURE "rocketlake") - + elseif(_cpu_model EQUAL 151 OR _cpu_model EQUAL 154) set(TARGET_ARCHITECTURE "alderlake") elseif(_cpu_model EQUAL 143) set(TARGET_ARCHITECTURE "sapphirerapids") - + elseif(_cpu_model EQUAL 142 OR _cpu_model EQUAL 158 OR _cpu_model EQUAL 165) set(TARGET_ARCHITECTURE "kabylake") - + elseif(_cpu_model EQUAL 140) set(TARGET_ARCHITECTURE "tigerlake") - + elseif(_cpu_model EQUAL 125 OR _cpu_model EQUAL 126) set(TARGET_ARCHITECTURE "icelake") @@ -779,23 +779,24 @@ macro(OFA_HandleX86Options) list(APPEND _march_flag_list "nehalem") list(APPEND _march_flag_list "corei7") list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2") + list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "popcnt") endmacro() macro(_westmere) list(APPEND _march_flag_list "westmere") _nehalem() + list(APPEND _available_vector_units_list "aes" "pclmul") endmacro() macro(_sandybridge) list(APPEND 
_march_flag_list "sandybridge") list(APPEND _march_flag_list "corei7-avx") _westmere() - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "avx") + list(APPEND _available_vector_units_list "avx") endmacro() macro(_ivybridge) list(APPEND _march_flag_list "ivybridge") list(APPEND _march_flag_list "core-avx-i") _sandybridge() - list(APPEND _available_vector_units_list "rdrnd" "f16c") + list(APPEND _available_vector_units_list "rdrnd" "f16c" "fsgsbase") endmacro() macro(_haswell) list(APPEND _march_flag_list "haswell") @@ -806,16 +807,17 @@ macro(OFA_HandleX86Options) macro(_broadwell) list(APPEND _march_flag_list "broadwell") _haswell() - list(APPEND _available_vector_units_list "rdseed") + list(APPEND _available_vector_units_list "rdseed" "adcx" "prefetchw") endmacro() macro(_skylake) list(APPEND _march_flag_list "skylake") _broadwell() + list(APPEND _available_vector_units_list "clflushopt" "xsavec" "xsaves") endmacro() macro(_skylake_avx512) list(APPEND _march_flag_list "skylake-avx512") _skylake() - list(APPEND _available_vector_units_list "avx512f" "avx512cd" "avx512dq" "avx512bw" "avx512vl") + list(APPEND _available_vector_units_list "avx512bw" "avx512cd" "avx512dq" "avx512f" "avx512vl" "clwb" "pku") endmacro() macro(_cascadelake) list(APPEND _march_flag_list "cascadelake") @@ -824,40 +826,43 @@ macro(OFA_HandleX86Options) endmacro() macro(_cooperlake) list(APPEND _march_flag_list "cooperlake") - _skylake_avx512() - list(APPEND _available_vector_units_list "avx512bf16" "avx512vnni") + _cascadelake() + list(APPEND _available_vector_units_list "avx512bf16") endmacro() macro(_cannonlake) list(APPEND _march_flag_list "cannonlake") - _skylake_avx512() - list(APPEND _available_vector_units_list "avx512ifma" "avx512vbmi") + _skylake() + list(APPEND _available_vector_units_list "avx512bw" "avx512cd" "avx512dq" "avx512f" "avx512vl" "clwb" "pku" "avx512ifma" "avx512vbmi" "sha" "umip") endmacro() macro(_icelake) list(APPEND 
_march_flag_list "icelake-client") - _skylake_avx512() - list(APPEND _available_vector_units_list "avx512bitalg" "avx512ifma" "avx512vbmi2" "avx512vbmi" "avx512vnni" "avx512vpopcntdq" "rdpid") + _cannonlake() + list(APPEND _available_vector_units_list "avx512bitalg" "avx512vbmi2" "avx512vnni" "avx512vpopcntdq" "clwb" "gfni" "rdpid" "vaes" "vpclmulqdq") endmacro() macro(_icelake_avx512) list(APPEND _march_flag_list "icelake-server") - _skylake_avx512() - list(APPEND _available_vector_units_list "avx512bitalg" "avx512ifma" "avx512vbmi2" "avx512vbmi" "avx512vnni" "avx512vpopcntdq" "rdpid") + _icelake() + list(APPEND _available_vector_units_list "pconfig" "wbnoinvd") endmacro() macro(_tigerlake) list(APPEND _march_flag_list "tigerlake") _icelake() - list(APPEND _available_vector_units_list "avx512vp2intersect") + list(APPEND _available_vector_units_list "avx512vp2intersect" "keylocker" "movdir64b" "movdiri" "pconfig" "wbnoinvd") endmacro() macro(_alderlake) list(APPEND _march_flag_list "alderlake") - _tigerlake() + _broadwell() + list(APPEND _available_vector_units_list "avx-vnni" "cldemote" "clwb" "gfni-sse" "hreset" "kl" "lzcnt" "movdir64b" "movdiri" "pconfig" "pku" "ptwrite" "rdpid" "serialize" "sgx" "umip" "vaes" "vpclmulqdq" "waitpkg" "widekl" "xsave" "xsavec" "xsaveopt" "xsaves") endmacro() macro(_sapphirerapids) list(APPEND _march_flag_list "sapphirerapids") - _icelake_avx512() + _skylake_avx512() + list(APPEND _available_vector_units_list "amx-bf16" "amx-int8" "amx-tile" "avx-vnni" "avx512bf16" "avx512vnni" "avx512vp2intersect" "cldemote" "enqcmd" "movdir64b" "movdiri" "ptwrite" "serialize" "tsxldtrk" "uintr" "waitpkg") endmacro() macro(_rocketlake) list(APPEND _march_flag_list "rocketlake") - _icelake_avx512() + _skylake_avx512() + list(APPEND _available_vector_units_list "avx512bitalg" "avx512ifma" "avx512vbmi" "avx512vbmi2" "avx512vnni" "avx512vpopcntdq" "gfni" "rdpid" "sha" "umip" "vaes" "vpclmulqdq") endmacro() macro(_knightslanding) list(APPEND 
_march_flag_list "knl") @@ -912,6 +917,8 @@ macro(OFA_HandleX86Options) _knightsmill() elseif(TARGET_ARCHITECTURE STREQUAL "knl") _knightslanding() + elseif(TARGET_ARCHITECTURE STREQUAL "rocketlake") + _rocketlake() elseif(TARGET_ARCHITECTURE STREQUAL "sapphirerapids") _sapphirerapids() elseif(TARGET_ARCHITECTURE STREQUAL "alderlake") @@ -1049,7 +1056,7 @@ macro(OFA_HandleX86Options) list(APPEND _march_flag_list "native") elseif(TARGET_ARCHITECTURE STREQUAL "none") # add this clause to remove it from the else clause - + else() message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.") endif() @@ -1058,7 +1065,8 @@ macro(OFA_HandleX86Options) if(MSVC) # MSVC (on Windows) message(FATAL_ERROR, "MSVC does not support \"native\" flag.") - elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel" + OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") if(WIN32) # Intel (on Windows) AddCompilerFlag("/QxHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) @@ -1080,7 +1088,7 @@ macro(OFA_HandleX86Options) # Others: GNU, Clang and variants AddCompilerFlag("-march=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) endif() - + elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") set(_disable_vector_unit_list) set(_enable_vector_unit_list) @@ -1196,26 +1204,30 @@ macro(OFA_HandleX86Options) string(REPLACE "." "_" _flag "__${_flag}__") add_definitions("-D${_flag}") endforeach(_flag) - - elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") # TEST ADVANCED KEYWORDS!!! 
+ + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel" + OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") if(WIN32) # Intel (on Windows) - set(OFA_map_knl "-QxMIC-AVX512") - set(OFA_map_knm "-QxMIC-AVX512") - set(OFA_map_sapphirerapids "-QxCORE-AVX512") - set(OFA_map_alderlake "-QxCORE-AVX512") - set(OFA_map_tigerlake "-QxCORE-AVX512") - set(OFA_map_icelake-avx512 "-QxCORE-AVX512") - set(OFA_map_icelake "-QxCORE-AVX512") - set(OFA_map_cannonlake "-QxCORE-AVX512") - set(OFA_map_cooperlake "-QxCORE-AVX512") - set(OFA_map_cascadelake "-QxCORE-AVX512") - set(OFA_map_skylake-avx512 "-QxCORE-AVX512") - set(OFA_map_skylake "-QxCORE-AVX2") - set(OFA_map_broadwell "-QxCORE-AVX2") - set(OFA_map_haswell "-QxCORE-AVX2") - set(OFA_map_ivybridge "-QxCORE-AVX-I") - set(OFA_map_sandybridge "-QxAVX") + set(OFA_map_knl "-QxKNL;-QxMIC-AVX512") + set(OFA_map_knm "-QxKNM;-QxMIC-AVX512") + set(OFA_map_rocketlake "-QxROCKETLAKE;-QxCORE-AVX512") + set(OFA_map_sapphirerapids "-QxSAPPHIRERAPIDS;-QxCORE-AVX512") + set(OFA_map_alderlake "-QxALDERLAKE;-QxCORE-AVX512") + set(OFA_map_tigerlake "-QxTIGERLAKE;-QxCORE-AVX512") + set(OFA_map_icelake-server "-QxICELAKE-SERVER;-QxCORE-AVX512") + set(OFA_map_icelake-avx512 "-QxICELAKE-SERVER;-QxCORE-AVX512") + set(OFA_map_icelake-client "-QxICELAKE-CLIENT;-QxCORE-AVX512") + set(OFA_map_icelake "-QxICELAKE-CLIENT;-QxCORE-AVX512") + set(OFA_map_cannonlake "-QxCANNONLAKE;-QxCORE-AVX512") + set(OFA_map_cooperlake "-QxCOOPERLAKE;-QxCORE-AVX512") + set(OFA_map_cascadelake "-QxCASCADELAKE;-QxCORE-AVX512") + set(OFA_map_skylake-avx512 "-QxSKYLAKE-AVX512;-QxCORE-AVX512") + set(OFA_map_skylake "-QxSKYLAKE;-QxCORE-AVX2") + set(OFA_map_broadwell "-QxBROADWELL;-QxCORE-AVX2") + set(OFA_map_haswell "-QxHASWELL;-QxCORE-AVX2") + set(OFA_map_ivybridge "-QxIVYBRIDGE;-QxCORE-AVX-I") + set(OFA_map_sandybridge "-QxSANDYBRIDGE;-QxAVX") set(OFA_map_westmere "-QxSSE4.2") set(OFA_map_nehalem "-QxSSE4.2") set(OFA_map_penryn "-QxSSSE3") @@ -1224,22 +1236,25 @@ macro(OFA_HandleX86Options) 
set(_ok FALSE) else() # Intel (in Linux) - set(OFA_map_knl "-xMIC-AVX512") - set(OFA_map_knm "-xMIC-AVX512") - set(OFA_map_sapphirerapids "-xCORE-AVX512") - set(OFA_map_alderlake "-xCORE-AVX512") - set(OFA_map_tigerlake "-xCORE-AVX512") - set(OFA_map_icelake-avx512 "-xCORE-AVX512") - set(OFA_map_icelake "-xCORE-AVX512") - set(OFA_map_cannonlake "-xCORE-AVX512") - set(OFA_map_cooperlake "-xCORE-AVX512") - set(OFA_map_cascadelake "-xCORE-AVX512") - set(OFA_map_skylake-avx512 "-xCORE-AVX512") - set(OFA_map_skylake "-xCORE-AVX2") - set(OFA_map_broadwell "-xCORE-AVX2") - set(OFA_map_haswell "-xCORE-AVX2") - set(OFA_map_ivybridge "-xCORE-AVX-I") - set(OFA_map_sandybridge "-xAVX") + set(OFA_map_knl "-xKNL;-xMIC-AVX512") + set(OFA_map_knm "-xKNM;-xMIC-AVX512") + set(OFA_map_rocketlake "-xROCKETLAKE;-xCORE-AVX512") + set(OFA_map_sapphirerapids "-xSAPPHIRERAPIDS;-xCORE-AVX512") + set(OFA_map_alderlake "-xALDERLAKE;-xCORE-AVX512") + set(OFA_map_tigerlake "-xTIGERLAKE;-xCORE-AVX512") + set(OFA_map_icelake-server "-xICELAKE-SERVER;-xCORE-AVX512") + set(OFA_map_icelake-avx512 "-xICELAKE-SERVER;-xCORE-AVX512") + set(OFA_map_icelake-client "-xICELAKE-CLIENT;-xCORE-AVX512") + set(OFA_map_icelake "-xICELAKE-CLIENT;-xCORE-AVX512") + set(OFA_map_cannonlake "-xCANNONLAKE;-xCORE-AVX512") + set(OFA_map_cooperlake "-xCOOPERLAKE;-xCORE-AVX512") + set(OFA_map_cascadelake "-xCASCADELAKE;-xCORE-AVX512") + set(OFA_map_skylake-avx512 "-xSKYLAKE-AVX512;-xCORE-AVX512") + set(OFA_map_skylake "-xSKYLAKE;-xCORE-AVX2") + set(OFA_map_broadwell "-xBROADWELL;-xCORE-AVX2") + set(OFA_map_haswell "-xHASWELL;-xCORE-AVX2") + set(OFA_map_ivybridge "-xIVYBRIDGE;-xCORE-AVX-I") + set(OFA_map_sandybridge "-xSANDYBRIDGE;-xAVX") set(OFA_map_westmere "-xSSE4.2") set(OFA_map_nehalem "-xSSE4.2") set(OFA_map_penryn "-xSSSE3") @@ -1249,7 +1264,12 @@ macro(OFA_HandleX86Options) endif() foreach(arch ${_march_flag_list}) if(DEFINED OFA_map_${arch}) - AddCompilerFlag(${OFA_map_${arch}} CXX_FLAGS OFA_ARCHITECTURE_FLAGS 
CXX_RESULT _ok) + foreach(flag ${OFA_map_${arch}}) + AddCompilerFlag(${flag} CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _ok) + if(_ok) + break() + endif() + endforeach() if(_ok) break() endif() @@ -1264,9 +1284,9 @@ macro(OFA_HandleX86Options) AddCompilerFlag("-xSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) endif() endif() - + # TEST PGI/Cray/SunPro ... - + else() # Others: GNU, Clang and variants foreach(_flag ${_march_flag_list}) @@ -2063,7 +2083,7 @@ macro(OFA_HandleArmOptions) # we first try to only use the -mcpu flag. If that fails, e.g., of # the compiler does not yet support the specified target, we try to # set the -march and -mtune flags as fallback option. - + if(TARGET_ARCHITECTURE STREQUAL "native") if(MSVC) # MSVC (on Windows) @@ -2079,12 +2099,12 @@ macro(OFA_HandleArmOptions) # Others: GNU, Clang and variants AddCompilerFlag("-mcpu=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) endif() - + elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") if(MSVC) # MSVC on ARM64 needs to be done - + else() # Others: GNU, Clang and variants foreach(_flag ${_mtune_flag_list}) @@ -2093,11 +2113,11 @@ macro(OFA_HandleArmOptions) break() endif(_good) endforeach(_flag) - + if(NOT _good) set(_disable_vector_unit_list) set(_enable_vector_unit_list) - + set(_aes_broken false) set(_bf16_broken false) set(_crc_broken false) @@ -2135,7 +2155,7 @@ macro(OFA_HandleArmOptions) set(_vfpv4_d16_broken false) set(_zcm_broken false) set(_zcz_broken false) - + macro(_enable_or_disable _name _flag _documentation _broken) if(_broken) From bddc0a4d786324c64ea58156834389b8042783ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Fri, 14 Jan 2022 19:19:17 +0100 Subject: [PATCH 137/174] Updated OFA --- cmake/OptimizeForArchitecture.cmake | 196 +++++++++++++++++++++------- 1 file changed, 152 insertions(+), 44 deletions(-) diff --git a/cmake/OptimizeForArchitecture.cmake b/cmake/OptimizeForArchitecture.cmake index 4c01179fe7..ab97f06d4a 100644 --- 
a/cmake/OptimizeForArchitecture.cmake +++ b/cmake/OptimizeForArchitecture.cmake @@ -70,8 +70,8 @@ # # Changelog: # - Update of CPUIDs for latest Intel and AMD processors -# - Support for PPC64 (Clang, GCC, IBM XLC) -# - Support for ARM (Clang, GCC, ARM Clang) +# - Added support for PPC64 (Clang, GCC, IBM XLC) +# - Added Support for ARM (Clang, GCC, ARM Clang, Cray, Fujitsu) #============================================================================= get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) @@ -87,20 +87,38 @@ macro(_my_find _list _value _ret) endif() endmacro(_my_find) +#============================================================================= +# Autodetection of CPUs +# +# This is a two-step process: +# +# 1. Get the CPUID from the system by reading /proc/cpuconfig (on +# Linux), the system registry (on Windows), or executing an +# OS-specific command (macOS, BSD, AIX, SunOS, ...) +# +# 2. Determine the specific CPU from the CPUID +#============================================================================= + macro(OFA_AutodetectX86) set(_vendor_id) set(_cpu_family) set(_cpu_model) set(_cpu_stepping) + # Get CPUID from system if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + + # Linux file(READ "/proc/cpuinfo" _cpuinfo) string(REGEX REPLACE ".*vendor_id[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _vendor_id "${_cpuinfo}") string(REGEX REPLACE ".*cpu family[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_family "${_cpuinfo}") string(REGEX REPLACE ".*model[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_model "${_cpuinfo}") string(REGEX REPLACE ".*stepping[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_stepping "${_cpuinfo}") string(REGEX REPLACE ".*flags[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") + elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + + # macOS exec_program("/usr/sbin/sysctl -n machdep.cpu.vendor machdep.cpu.family machdep.cpu.model machdep.cpu.stepping machdep.cpu.features" OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) 
if(NOT _error) @@ -114,6 +132,12 @@ macro(OFA_AutodetectX86) string(REPLACE "." "_" _cpu_flags "${_cpu_flags}") else() # Apple Silicon (ARM64) running in Rosetta 2 mode + # + # The regular detection mechanism for macOS-x64_86 does not work + # because the emulated CPU does not provide the required + # information via the sysctl command. We therefore generate fake + # vendor, model, and stepping information based on the + # macOS-specific CPU codes. exec_program("/usr/sbin/sysctl -n hw.cputype machdep.cpu.family hw.cpufamily machdep.cpu.features" OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) if(NOT _error) @@ -167,14 +191,24 @@ macro(OFA_AutodetectX86) if(_error) message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") endif() + elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") + + # Windows get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE) get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE) mark_as_advanced(_vendor_id _cpu_id) string(REGEX REPLACE ".* Family ([0-9]+) .*" "\\1" _cpu_family "${_cpu_id}") string(REGEX REPLACE ".* Model ([0-9]+) .*" "\\1" _cpu_model "${_cpu_id}") string(REGEX REPLACE ".* Stepping ([0-9]+) .*" "\\1" _cpu_mstepping "${_cpu_id}") - endif(CMAKE_SYSTEM_NAME STREQUAL "Linux") + + # TODO: BSD, Android, QNX, ... 
+ + else() + message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") + endif() + + # Determine CPU from CPUID if(_vendor_id STREQUAL "GenuineIntel") if(_cpu_family EQUAL 6) # taken from the Intel ORM @@ -396,6 +430,9 @@ macro(OFA_AutodetectX86) elseif(_cpu_family EQUAL 5) # 05h (K6) endif() + + else() + message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") endif() endmacro(OFA_AutodetectX86) @@ -405,7 +442,11 @@ macro(OFA_AutodetectArm) set(_cpu_variant) set(_cpu_part) set(_cpu_revision) + + # Get CPUID from system if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + + # Linux file(READ "/proc/cpuinfo" _cpuinfo) string(REGEX REPLACE ".*CPU implementer[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_implementer "${_cpuinfo}") string(REGEX REPLACE ".*CPU architecture[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_architecture "${_cpuinfo}") @@ -416,7 +457,6 @@ macro(OFA_AutodetectArm) elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") exec_program("/usr/sbin/sysctl -n hw.cputype hw.cputype hw.cpusubtype hw.cpufamily hw.cpusubfamily" OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) - message(${_sysctl_output_string}) if(NOT _error) string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) list(GET _sysctl_output 0 _cpu_implementer) @@ -426,14 +466,21 @@ macro(OFA_AutodetectArm) list(GET _sysctl_output 4 _cpu_revision) endif() if(_error) - message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") + message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") endif() - elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") - endif(CMAKE_SYSTEM_NAME STREQUAL "Linux") + # TODO: Windows, FreeBSD, ... 
+ + else() + message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") + endif() + + # Determine CPU from CPUID # Taken from https://github.com/karelzak/util-linux/blob/master/sys-utils/lscpu-arm.c # and https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html - if(_cpu_implementer STREQUAL "0x41") # ARM + + # ARM + if(_cpu_implementer STREQUAL "0x41") if(_cpu_part STREQUAL "0x810") set(TARGET_ARCHITECTURE "arm810") elseif(_cpu_part STREQUAL "0x920") @@ -550,7 +597,8 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "cortex-a78c") endif() - elseif(_cpu_implementer STREQUAL "0x42") # Broadcom + # Broadcom + elseif(_cpu_implementer STREQUAL "0x42") if(_cpu_part STREQUAL "0x0f") set(TARGET_ARCHITECTURE "brahma-b15") elseif(_cpu_part STREQUAL "0x100") @@ -559,7 +607,8 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "thunderx2") endif() - elseif(_cpu_implementer STREQUAL "0x43") # Cavium + # Cavium + elseif(_cpu_implementer STREQUAL "0x43") if(_cpu_part STREQUAL "0x0a0") set(TARGET_ARCHITECTURE "thunderx") elseif(_cpu_part STREQUAL "0x0a1") @@ -572,28 +621,34 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "thunderx2t99") endif() - elseif(_cpu_implementer STREQUAL "0x44") # DEC + # DEC + elseif(_cpu_implementer STREQUAL "0x44") if(_cpu_part STREQUAL "0xa10") set(TARGET_ARCHITECTURE "strongarm110") elseif(_cpu_part STREQUAL "0xa11") set(TARGET_ARCHITECTURE "strongarm1100") endif() - elseif(_cpu_implementer STREQUAL "0x46") # FUJITSU + # FUJITSU + elseif(_cpu_implementer STREQUAL "0x46") if(_cpu_part STREQUAL "0x001") set(TARGET_ARCHITECTURE "a64fx") endif() - elseif(_cpu_implementer STREQUAL "0x48") # HiSilicon + # HiSilicon + elseif(_cpu_implementer STREQUAL "0x48") if(_cpu_part STREQUAL "0xd01") set(TARGET_ARCHITECTURE "tsv110") endif() - elseif(_cpu_implementer STREQUAL "0x49") # Infineon + # Infineon + elseif(_cpu_implementer STREQUAL "0x49") - elseif(_cpu_implementer STREQUAL "0x4d") # 
Motorola/Freescale + # Motorola/Freescale + elseif(_cpu_implementer STREQUAL "0x4d") - elseif(_cpu_implementer STREQUAL "0x4e") # Nvidia + # Nvidia + elseif(_cpu_implementer STREQUAL "0x4e") if(_cpu_part STREQUAL "0x000") set(TARGET_ARCHITECTURE "denver") elseif(_cpu_part STREQUAL "0x003") @@ -602,12 +657,14 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "carmel") endif() - elseif(_cpu_implementer STREQUAL "0x50") # APM + # APM + elseif(_cpu_implementer STREQUAL "0x50") if(_cpu_part STREQUAL "0x000") set(TARGET_ARCHITECTURE "xgene1") endif() - elseif(_cpu_implementer STREQUAL "0x51") # Qualcomm + # Qualcomm + elseif(_cpu_implementer STREQUAL "0x51") if(_cpu_part STREQUAL "0x00f") set(TARGET_ARCHITECTURE "scorpion") elseif(_cpu_part STREQUAL "0x02d") @@ -632,12 +689,14 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "saphira") endif() - elseif(_cpu_implementer STREQUAL "0x53") # Samsung + # Samsung + elseif(_cpu_implementer STREQUAL "0x53") if(_cpu_part STREQUAL "0x001") set(TARGET_ARCHITECTURE "exynos-m1") endif() - elseif(_cpu_implementer STREQUAL "0x56") # Marvell + # Marvell + elseif(_cpu_implementer STREQUAL "0x56") if(_cpu_part STREQUAL "0x131") set(TARGET_ARCHITECTURE "marvell-f") elseif(_cpu_part STREQUAL "0x581") @@ -646,21 +705,24 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "marvell-pj4") endif() - elseif(_cpu_implementer STREQUAL "0x61") # Apple + # Apple + elseif(_cpu_implementer STREQUAL "0x61") if(_cpu_part STREQUAL "0x022") set(TARGET_ARCHITECTURE "icestorm") elseif(_cpu_part STREQUAL "0x023") set(TARGET_ARCHITECTURE "firestorm") endif() - elseif(_cpu_implementer STREQUAL "0x66") # Faraday + # Faraday + elseif(_cpu_implementer STREQUAL "0x66") if(_cpu_part STREQUAL "0x526") set(TARGET_ARCHITECTURE "fa526") elseif(_cpu_part STREQUAL "0x626") set(TARGET_ARCHITECTURE "fa626") endif() - elseif(_cpu_implementer STREQUAL "0x69") # Intel + # Intel + elseif(_cpu_implementer STREQUAL "0x69") if(_cpu_part STREQUAL "0x200") 
set(TARGET_ARCHITECTURE "i80200") elseif(_cpu_part STREQUAL "0x210") @@ -705,16 +767,18 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "ipx1200") endif() - elseif(_cpu_implementer STREQUAL "0x70") # Phytium + # Phytium + elseif(_cpu_implementer STREQUAL "0x70") if(_cpu_part STREQUAL "0x662") set(TARGET_ARCHITECTURE "ftc662") elseif(_cpu_part STREQUAL "0x663") set(TARGET_ARCHITECTURE "ftc663") endif() - elseif(_cpu_implementer STREQUAL "0xc0") # Ampere + # Ampere + elseif(_cpu_implementer STREQUAL "0xc0") - # Taken from /Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/System/Library/Frameworks/Kernel.framework/Versions/A/Headers/mach/machine.h + # Taken from /Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/System/Library/Frameworks/Kernel.framework/Versions/A/Headers/mach/machine.h elseif(_cpu_implementer STREQUAL "16777228" OR _cpu_implementer STREQUAL "0x100000C") # Apple ARM64 if( _cpu_part STREQUAL "0x1e2d6381" OR _cpu_part STREQUAL "506291073") # Swift (A6) set(TARGET_ARCHITECTURE "apple-a6") @@ -736,6 +800,9 @@ macro(OFA_AutodetectArm) set(TARGET_ARCHITECTURE "apple-m1") elseif(_cpu_part STREQUAL "0xda33d83d" OR _cpu_part STREQUAL "3660830781") # Blizzard Avalanche (A15) endif() + + else() + message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") endif() endmacro(OFA_AutodetectArm) @@ -743,16 +810,38 @@ macro(OFA_AutodetectPpc) set(_cpu) if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + + # Linux file(READ "/proc/cpuinfo" _cpuinfo) string(REGEX REPLACE ".*cpu[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu "${_cpuinfo}") - if(_cpu STREQUAL "POWER8" OR _cpu STREQUAL "POWER8NVL") + if(_cpu STREQUAL "POWER3") + set(TARGET_ARCHITECTURE "power3") + elseif(_cpu STREQUAL "POWER4") + set(TARGET_ARCHITECTURE "power4") + elseif(_cpu STREQUAL "POWER5") + set(TARGET_ARCHITECTURE "power5") + elseif(_cpu STREQUAL "POWER5+") + set(TARGET_ARCHITECTURE "power5+") + elseif(_cpu STREQUAL "POWER6") + set(TARGET_ARCHITECTURE "power6") 
+ elseif(_cpu STREQUAL "POWER6X") + set(TARGET_ARCHITECTURE "power6x") + elseif(_cpu STREQUAL "POWER7") + set(TARGET_ARCHITECTURE "power7") + elseif(_cpu STREQUAL "POWER8" OR _cpu STREQUAL "POWER8NVL") set(TARGET_ARCHITECTURE "power8") elseif(_cpu STREQUAL "POWER9" OR _cpu STREQUAL "POWER9NVL") set(TARGET_ARCHITECTURE "power9") elseif(_cpu STREQUAL "POWER10" OR _cpu STREQUAL "POWER10NVL") set(TARGET_ARCHITECTURE "power10") + else() + message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") endif() - elseif(CMAKE_SYSTEM_NAME STREQUAL "AIX") + + # TODO: AIX, FreeBSD, ... + + else() + message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") endif() endmacro(OFA_AutodetectPpc) @@ -971,7 +1060,7 @@ macro(OFA_HandleX86Options) list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") - # AMD + # AMD elseif(TARGET_ARCHITECTURE STREQUAL "k8") list(APPEND _march_flag_list "k8") list(APPEND _available_vector_units_list "sse" "sse2") @@ -1049,7 +1138,7 @@ macro(OFA_HandleX86Options) list(APPEND _march_flag_list "core2") list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") - # Others + # Others elseif(TARGET_ARCHITECTURE STREQUAL "generic") list(APPEND _march_flag_list "generic") elseif(TARGET_ARCHITECTURE STREQUAL "native") @@ -1061,6 +1150,7 @@ macro(OFA_HandleX86Options) message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". 
Please set TARGET_ARCHITECTURE to a supported value.") endif() + # Special treatment for "native" if(TARGET_ARCHITECTURE STREQUAL "native") if(MSVC) # MSVC (on Windows) @@ -1089,7 +1179,9 @@ macro(OFA_HandleX86Options) AddCompilerFlag("-march=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) endif() + # Apply architecture flags elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") + set(_disable_vector_unit_list) set(_enable_vector_unit_list) if(DEFINED OFA_SSE_INTRINSICS_BROKEN AND OFA_SSE_INTRINSICS_BROKEN) @@ -1285,7 +1377,7 @@ macro(OFA_HandleX86Options) endif() endif() - # TEST PGI/Cray/SunPro ... + # TODO PGI/Cray/SunPro ... else() # Others: GNU, Clang and variants @@ -1833,7 +1925,7 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") - # Broadcom + # Broadcom elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b15") list(APPEND _mtune_flag_list "brahma-b15") elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b53") @@ -1845,7 +1937,7 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "crypto") - # Cavium + # Cavium elseif(TARGET_ARCHITECTURE STREQUAL "thunderx") list(APPEND _mtune_flag_list "thunderx") list(APPEND _march_flag_list "armv8-a") @@ -1873,7 +1965,7 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "crc" "crypto") - # DEC + # DEC elseif(TARGET_ARCHITECTURE STREQUAL "strongarm110") list(APPEND _mtune_flag_list "strongarm110") list(APPEND _march_flag_list "armv4") @@ -1881,7 +1973,7 @@ macro(OFA_HandleArmOptions) list(APPEND _mtune_flag_list "strongarm1100") list(APPEND _march_flag_list "armv4") - # FUJITSU + # FUJITSU elseif(TARGET_ARCHITECTURE STREQUAL "a64fx") list(APPEND _mtune_flag_list "a64fx") list(APPEND _march_flag_list "armv8.2-a") @@ -1890,7 +1982,7 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "fp16" 
"sve") - # HiSilicon + # HiSilicon elseif(TARGET_ARCHITECTURE STREQUAL "tsv110") list(APPEND _mtune_flag_list "tsv110") list(APPEND _march_flag_list "armv8.2-a") @@ -1899,7 +1991,7 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "aes" "crypto" "fp16" "sha2") - # Nvidia + # Nvidia elseif(TARGET_ARCHITECTURE STREQUAL "denver") list(APPEND _mtune_flag_list "denver") list(APPEND _march_flag_list "armv8-a") @@ -1918,13 +2010,13 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - # APM + # APM elseif(TARGET_ARCHITECTURE STREQUAL "xgene1") list(APPEND _mtune_flag_list "xgene1") list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") - # Qualcomm + # Qualcomm elseif(TARGET_ARCHITECTURE STREQUAL "scorpion") list(APPEND _mtune_flag_list "scorpion") list(APPEND _march_flag_list "armv7-a") @@ -1956,14 +2048,14 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - # Samsung + # Samsung elseif(TARGET_ARCHITECTURE STREQUAL "exynos-m1") list(APPEND _mtune_flag_list "exynos-m1") list(APPEND _march_flag_list "armv8-a") list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "crypto" "simd") - # Marvell + # Marvell elseif(TARGET_ARCHITECTURE STREQUAL "marvell-f") list(APPEND _mtune_flag_list "marvell-f") list(APPEND _march_flag_list "armv5te") @@ -1972,7 +2064,7 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv7-a") list(APPEND _available_vector_units_list "mp" "sec" "fp") - # Intel + # Intel elseif(TARGET_ARCHITECTURE STREQUAL "i80200") list(APPEND _mtune_flag_list "i80200") elseif(TARGET_ARCHITECTURE STREQUAL "pxa250a") @@ -2016,7 +2108,7 @@ macro(OFA_HandleArmOptions) elseif(TARGET_ARCHITECTURE STREQUAL "ipx1200") list(APPEND _mtune_flag_list "ipx1200") - # 
Apple + # Apple elseif(TARGET_ARCHITECTURE STREQUAL "apple-a6") list(APPEND _mtune_flag_list "apple-a6") list(APPEND _march_flag_list "armv7-a") @@ -2066,7 +2158,7 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv8-a") list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") - # Others + # Others elseif(TARGET_ARCHITECTURE STREQUAL "generic") list(APPEND _march_flag_list "generic") elseif(TARGET_ARCHITECTURE STREQUAL "native") @@ -2084,6 +2176,7 @@ macro(OFA_HandleArmOptions) # the compiler does not yet support the specified target, we try to # set the -march and -mtune flags as fallback option. + # Special treatment for "native" if(TARGET_ARCHITECTURE STREQUAL "native") if(MSVC) # MSVC (on Windows) @@ -2100,6 +2193,7 @@ macro(OFA_HandleArmOptions) AddCompilerFlag("-mcpu=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) endif() + # Apply architecture flags elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") if(MSVC) @@ -2243,6 +2337,20 @@ endmacro(OFA_HandleArmOptions) macro(OFA_HandlePpcOptions) set(_march_flag_list) + if(TARGET_ARCHITECTURE STREQUAL "power3") + list(APPEND _march_flag_list "power3") + elseif(TARGET_ARCHITECTURE STREQUAL "power4") + list(APPEND _march_flag_list "power4") + elseif(TARGET_ARCHITECTURE STREQUAL "power5") + list(APPEND _march_flag_list "power5") + elseif(TARGET_ARCHITECTURE STREQUAL "power5+") + list(APPEND _march_flag_list "power5+") + elseif(TARGET_ARCHITECTURE STREQUAL "power6") + list(APPEND _march_flag_list "power6") + elseif(TARGET_ARCHITECTURE STREQUAL "power6x") + list(APPEND _march_flag_list "power6x") + elseif(TARGET_ARCHITECTURE STREQUAL "power7") + list(APPEND _march_flag_list "power7") if(TARGET_ARCHITECTURE STREQUAL "power8") list(APPEND _march_flag_list "power8") list(APPEND _march_flag_list "pwr8") From 1bf57e502c9fd5b543d7fa5df2d71e7a115d92fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Fri, 14 Jan 2022 21:48:22 +0100 
Subject: [PATCH 138/174] Updated OFA --- cmake/OptimizeForArchitecture.cmake | 429 +++++++++++++++++++--------- 1 file changed, 289 insertions(+), 140 deletions(-) diff --git a/cmake/OptimizeForArchitecture.cmake b/cmake/OptimizeForArchitecture.cmake index ab97f06d4a..2bbb5d2b75 100644 --- a/cmake/OptimizeForArchitecture.cmake +++ b/cmake/OptimizeForArchitecture.cmake @@ -853,12 +853,26 @@ macro(OFA_AutodetectHostArchitecture) elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(arm.*|ARM.*|aarch64.*|AARCH64.*)") OFA_AutodetectArm() elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(powerpc|ppc)64.*") - OFA_AutodetectPpc() +# OFA_AutodetectPpc() else() message(WARNING "The CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}' is not supported by OptimizeForArchitecture.cmake") endif() endmacro(OFA_AutodetectHostArchitecture) +#============================================================================= +# Handling of CPUs +# +# This is a two-step process: +# +# 1. Generate a list of compiler flags for the specific CPU +# +# 2. Special compiler-specific treatment of "native" flag +# +# 3. Disabling of "broken" features based on OFA_xxx_INTRINSICS_BROKEN options +# +# 4. 
Set compiler-specific flags +#============================================================================= + macro(OFA_HandleX86Options) set(_march_flag_list) set(_available_vector_units_list) @@ -983,6 +997,8 @@ macro(OFA_HandleX86Options) _goldmont_plus() endmacro() + # TODO: Define similar macros for AMD + # Intel if(TARGET_ARCHITECTURE STREQUAL "core") list(APPEND _march_flag_list "core2") @@ -1181,9 +1197,11 @@ macro(OFA_HandleX86Options) # Apply architecture flags elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") - + + # Disable "broken" features based on OFA_xxx_INTRINSICS_BROKEN options set(_disable_vector_unit_list) set(_enable_vector_unit_list) + if(DEFINED OFA_SSE_INTRINSICS_BROKEN AND OFA_SSE_INTRINSICS_BROKEN) message(STATUS "SSE disabled because of old/broken toolchain") set(_sse_broken true) @@ -1229,6 +1247,7 @@ macro(OFA_HandleX86Options) endif() endif() + # Enable/disable macro macro(_enable_or_disable _name _flag _documentation _broken) if(_broken) set(_found false) @@ -1243,6 +1262,8 @@ macro(OFA_HandleX86Options) list(APPEND _disable_vector_unit_list "${_flag}") endif() endmacro() + + # Enable/disable features _enable_or_disable(AVX "avx" "Use AVX. This will all floating-point vector sizes relative to SSE." _avx_broken) _enable_or_disable(AVX2 "avx2" "Use AVX2. This will double all of the vector sizes relative to SSE." _avx2_broken) _enable_or_disable(AVX512BF16 "avx512bf16" "Use AVX512BF16." _avx512_broken) @@ -1272,12 +1293,17 @@ macro(OFA_HandleX86Options) _enable_or_disable(SSSE3 "ssse3" "Use SSSE3. If SSSE3 instructions are not enabled they will be emulated." _sse_broken) _enable_or_disable(XOP "xop" "Use XOP." 
_xop_broken) + # Add compiler flags if(MSVC AND MSVC_VERSION GREATER 1700) - # MSVC on 32 bit can select /arch:SSE2 (since 2010 also /arch:AVX) - # MSVC on 64 bit cannot select anything (should have changed with MSVC 2010) - _my_find(_enable_vector_unit_list "avx2" _found) + _my_find(_enable_vector_unit_list "avx512f" _found) if(_found) - AddCompilerFlag("/arch:AVX2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + AddCompilerFlag("/arch:AVX512" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() + if(NOT _found) + _my_find(_enable_vector_unit_list "avx2" _found) + if(_found) + AddCompilerFlag("/arch:AVX2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() endif() if(NOT _found) _my_find(_enable_vector_unit_list "avx" _found) @@ -1291,6 +1317,12 @@ macro(OFA_HandleX86Options) AddCompilerFlag("/arch:SSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) endif() endif() + if(NOT _found) + _my_find(_enable_vector_unit_list "sse" _found) + if(_found) + AddCompilerFlag("/arch:SSE" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + endif() foreach(_flag ${_enable_vector_unit_list}) string(TOUPPER "${_flag}" _flag) string(REPLACE "." 
"_" _flag "__${_flag}__") @@ -1381,12 +1413,16 @@ macro(OFA_HandleX86Options) else() # Others: GNU, Clang and variants + + # Set -march flag foreach(_flag ${_march_flag_list}) AddCompilerFlag("-march=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) if(_good) break() endif(_good) endforeach(_flag) + + # Set -mfeature flag for enabled features foreach(_flag ${_enable_vector_unit_list}) AddCompilerFlag("-m${_flag}" CXX_RESULT _result) if(_result) @@ -1440,6 +1476,8 @@ macro(OFA_HandleX86Options) endif() endif() endforeach(_flag) + + # Set -mno-feature flag for disabled features foreach(_flag ${_disable_vector_unit_list}) AddCompilerFlag("-mno-${_flag}" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) endforeach(_flag) @@ -2170,12 +2208,6 @@ macro(OFA_HandleArmOptions) message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.") endif() - # Following the recommendation from - # https://community.arm.com/developer/tools-software/tools/b/tools-software-ides-blog/posts/compiler-flags-across-architectures-march-mtune-and-mcpu - # we first try to only use the -mcpu flag. If that fails, e.g., of - # the compiler does not yet support the specified target, we try to - # set the -march and -mtune flags as fallback option. 
- # Special treatment for "native" if(TARGET_ARCHITECTURE STREQUAL "native") if(MSVC) @@ -2192,15 +2224,146 @@ macro(OFA_HandleArmOptions) # Others: GNU, Clang and variants AddCompilerFlag("-mcpu=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) endif() - + # Apply architecture flags elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") - if(MSVC) - # MSVC on ARM64 needs to be done - + # Disable "broken" features based on OFA_xxx_INTRINSICS_BROKEN options + set(_disable_vector_unit_list) + set(_enable_vector_unit_list) + + # TODO: Add OFA_xxx_INTRINSICS_BROKEN rules + set(_aes_broken false) + set(_bf16_broken false) + set(_crc_broken false) + set(_crypto_broken false) + set(_dotprod_broken false) + set(_dsp_broken false) + set(_fp16_broken false) + set(_fp16fml_broken false) + set(_fp_broken false) + set(_fp_dp_broken false) + set(_fp_sp_broken false) + set(_i8mm_broken false) + set(_idiv_broken false) + set(_lse_broken false) + set(_mve_broken false) + set(_mve_fp_broken false) + set(_neon_broken false) + set(_neon_fp16_broken false) + set(_neon_vfpv4_broken false) + set(_ras_broken false) + set(_rcpc_broken false) + set(_rdm_broken false) + set(_rdma_broken false) + set(_sec_broken false) + set(_sha2_broken false) + set(_sha3_broken false) + set(_simd_broken false) + set(_sm4_broken false) + set(_sve_broken false) + set(_vfpv3_broken false) + set(_vfpv3_d16_broken false) + set(_vfpv3_d16_fp16_broken false) + set(_vfpv3_fp16_broken false) + set(_vfpv4_broken false) + set(_vfpv4_d16_broken false) + set(_zcm_broken false) + set(_zcz_broken false) + + # Enable/disable macro + macro(_enable_or_disable _name _flag _documentation _broken) + if(_broken) + set(_found false) + else() + _my_find(_available_vector_units_list "${_flag}" _found) + endif() + set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) + mark_as_advanced(USE_${_name}) + if(USE_${_name}) + list(APPEND _enable_vector_unit_list "${_flag}") + else() + list(APPEND _disable_vector_unit_list "${_flag}") + 
endif() + endmacro() + + # Enable/disable features + _enable_or_disable(AES "aes" "Use AES. This will enable the aes and pmull crypto extension." _aes_broken) + _enable_or_disable(BF16 "bf16" "Use BF16. This will enable the brain half-precision floating-point instructions." _bf16_broken) + _enable_or_disable(CRC "crc" "Use CRC. This will enable the Cyclic Redundancy Check (CRC) instructions." _crc_broken) + _enable_or_disable(CRYPTO "crypto" "Use CRYPTO. This will enable the cryptographic instructions." _crypto_broken) + _enable_or_disable(DOTPROD "dotprod" "Use DOTPROD. This will enable the Dot Product extension. This also enables Advanced SIMD instructions." _dotprod_broken) + _enable_or_disable(DSP "dsp" "Use DSP. This will enable the DSP instruction." _dsp_broken) + _enable_or_disable(FP "fp" "Use FP. This will enable the floating-point data processing instructions." _fp_broken) + _enable_or_disable(FP16 "fp16" "Use FP16. This will enable the half-precision floating-point data processing instructions." _fp16_broken) + _enable_or_disable(FP16FML "fp16fml" "Use FP16FML. This will enable the FP16 fmla extension." _fp16fml_broken) + _enable_or_disable(FP_DP "fp.dp" "Use FP.DP. This will enable the single- and double-precision floating-point instructions." _fp_dp_broken) + _enable_or_disable(FP_SP "fp.sp" "Use FP.SP. This will enable the single-precision floating-point instructions." _fp_sp_broken) + _enable_or_disable(I8MM "i8mm" "Use I8MM. This will enable the 8-bit Integer Matrix Multiply instructions." _i8mm_broken) + _enable_or_disable(IDIV "idiv" "Use IDIV. This will enable the ARM-state integer division instructions." _idiv_broken) + _enable_or_disable(LSE "lse" "Use LSE. This will enable the Large System Extension instructions." _lse_broken) + _enable_or_disable(MVE "mve" "Use MVE. This will enable the M-Profile Vector Extension (MVE) integer instructions." _mve_broken) + _enable_or_disable(MVE_FP "mve.fp" "Use MVE.FP. 
This will enable the M-Profile Vector Extension (MVE) integer and single precision floating-point instructions." _mve_fp_broken) + _enable_or_disable(NEON "neon" "Use NEON. This will enable the Advanced SIMD (Neon) v1." _neon_broken) + _enable_or_disable(NEON_FP16 "neon-fp16" "Use NEON-FP16. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions, with the half-precision floating-point conversion operations." _neon_fp16_broken) + _enable_or_disable(NEON_VFPV4 "neon-vfpv4" "Use NEON-VFPV4. This will enable the Advanced SIMD (Neon) v2 and the VFPv4 floating-point instructions." _neon_vfpv4_broken) + _enable_or_disable(RAS "ras" "Use RAS. This will enable the Reliability, Availability, and Serviceability extension." _ras_broken) + _enable_or_disable(RCPC "rcpc" "Use RCPC. This will enable the RcPc extension." _rcpc_broken) + _enable_or_disable(RDM "rdm" "Use RDM. This will enable the RDM extension." _rdm_broken) + _enable_or_disable(RDMA "rdma" "Use RDMA. This will enable the Round Double Multiply Accumulate instructions." _rdma_broken) + _enable_or_disable(SEC "sec" "Use SEC. This will enable the security extension." _sec_broken) + _enable_or_disable(SHA2 "sha2" "Use SHA2. This will enable the sha2 crypto extension." _sha2_broken) + _enable_or_disable(SHA3 "sha3" "Use SHA3. This will enable the sha512 and sha3 crypto extension." _sha3_broken) + _enable_or_disable(SIMD "simd" "Use SIMD. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions." _simd_broken) + _enable_or_disable(SM4 "sm4" "Use SM4. This will enable the the sm3 and sm4 crypto extension." _sm4_broken) + _enable_or_disable(SVE "sve" "Use SVE. This will enable the Scalable Vector Extension instructions." _sve_broken) + _enable_or_disable(VFPV3 "vfpv3" "Use VPFV3. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers." _vfpv3_broken) + _enable_or_disable(VFPV3_D16 "vfpv3-d16" "Use VPFV3-16. 
This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_broken) + _enable_or_disable(VFPV3_D16_FP16 "vfpv3-d16-fp16" "Use VPFV3-D16-FP16. This will enable VFPv3 floating-point instructions, with 16 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_fp16_broken) + _enable_or_disable(VFPV3_FP16 "vfpv3-fp16" "Use VPFV3-FP16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_fp16_broken) + _enable_or_disable(VFPV4 "vfpv4" "Use VPFV4. This will enable the VFPv4 floating-point instructions, with 32 double-precision registers." _vfpv4_broken) + _enable_or_disable(VFPV4_D16 "vfpv4-d16" "Use VPFV4-D16. This will enable the VFPv4 floating-point instructions, with 16 double-precision registers." _vfpv4_dp16_broken) + _enable_or_disable(ZCM "zcm" "Use ZCM. This will enable the ZCM extension." _zcm_broken) + _enable_or_disable(ZCZ "zcz" "Use ZCZ. This will enable the ZCZ extension." _zcz_broken) + + # Add compiler flags + if(MSVC AND MSVC_VERSION GREATER 1900) + _my_find(_enable_vector_unit_list "vfpv4" _found) + if(_found) + AddCompilerFlag("/arch:VFPv4" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() + if(NOT _found) + _my_find(_enable_vector_unit_list "simd" _found) + if(_found) + AddCompilerFlag("/arch:ARMv7VE" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() + endif() + foreach(_flag ${_enable_vector_unit_list}) + string(TOUPPER "${_flag}" _flag) + string(REPLACE "." 
"_" _flag "__${_flag}__") + add_definitions("-D${_flag}") + endforeach(_flag) + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") + + # TODO: Add Cray flags + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Fujitsu") + + # TODO: Add Fujitsu flags + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC") + + # TODO: Add NVHPC flags + else() # Others: GNU, Clang and variants + + # Following the recommendation from + # https://community.arm.com/developer/tools-software/tools/b/tools-software-ides-blog/posts/compiler-flags-across-architectures-march-mtune-and-mcpu + # we first try to only use the -mcpu flag. If that fails, e.g., if + # the compiler does not yet support the specified target, we try to + # set the -march and -mtune flags as fallback option. foreach(_flag ${_mtune_flag_list}) AddCompilerFlag("-mcpu=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) if(_good) @@ -2208,103 +2371,10 @@ macro(OFA_HandleArmOptions) endif(_good) endforeach(_flag) + # Fallback: set -march, -mtune flags if(NOT _good) - set(_disable_vector_unit_list) - set(_enable_vector_unit_list) - - set(_aes_broken false) - set(_bf16_broken false) - set(_crc_broken false) - set(_crypto_broken false) - set(_dotprod_broken false) - set(_dsp_broken false) - set(_fp16_broken false) - set(_fp16fml_broken false) - set(_fp_broken false) - set(_fp_dp_broken false) - set(_fp_sp_broken false) - set(_i8mm_broken false) - set(_idiv_broken false) - set(_lse_broken false) - set(_mve_broken false) - set(_mve_fp_broken false) - set(_neon_broken false) - set(_neon_fp16_broken false) - set(_neon_vfpv4_broken false) - set(_ras_broken false) - set(_rcpc_broken false) - set(_rdm_broken false) - set(_rdma_broken false) - set(_sec_broken false) - set(_sha2_broken false) - set(_sha3_broken false) - set(_simd_broken false) - set(_sm4_broken false) - set(_sve_broken false) - set(_vfpv3_broken false) - set(_vfpv3_d16_broken false) - set(_vfpv3_d16_fp16_broken false) - set(_vfpv3_fp16_broken false) - set(_vfpv4_broken false) - 
set(_vfpv4_d16_broken false) - set(_zcm_broken false) - set(_zcz_broken false) - - macro(_enable_or_disable _name _flag _documentation _broken) - - if(_broken) - set(_found false) - else() - _my_find(_available_vector_units_list "${_flag}" _found) - endif() - set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) - mark_as_advanced(USE_${_name}) - if(USE_${_name}) - list(APPEND _enable_vector_unit_list "${_flag}") - else() - list(APPEND _disable_vector_unit_list "${_flag}") - endif() - endmacro() - - _enable_or_disable(AES "aes" "Use AES. This will enable the aes and pmull crypto extension." _aes_broken) - _enable_or_disable(BF16 "bf16" "Use BF16. This will enable the brain half-precision floating-point instructions." _bf16_broken) - _enable_or_disable(CRC "crc" "Use CRC. This will enable the Cyclic Redundancy Check (CRC) instructions." _crc_broken) - _enable_or_disable(CRYPTO "crypto" "Use CRYPTO. This will enable the cryptographic instructions." _crypto_broken) - _enable_or_disable(DOTPROD "dotprod" "Use DOTPROD. This will enable the Dot Product extension. This also enables Advanced SIMD instructions." _dotprod_broken) - _enable_or_disable(DSP "dsp" "Use DSP. This will enable the DSP instruction." _dsp_broken) - _enable_or_disable(FP "fp" "Use FP. This will enable the floating-point data processing instructions." _fp_broken) - _enable_or_disable(FP16 "fp16" "Use FP16. This will enable the half-precision floating-point data processing instructions." _fp16_broken) - _enable_or_disable(FP16FML "fp16fml" "Use FP16FML. This will enable the FP16 fmla extension." _fp16fml_broken) - _enable_or_disable(FP_DP "fp.dp" "Use FP.DP. This will enable the single- and double-precision floating-point instructions." _fp_dp_broken) - _enable_or_disable(FP_SP "fp.sp" "Use FP.SP. This will enable the single-precision floating-point instructions." _fp_sp_broken) - _enable_or_disable(I8MM "i8mm" "Use I8MM. This will enable the 8-bit Integer Matrix Multiply instructions." 
_i8mm_broken) - _enable_or_disable(IDIV "idiv" "Use IDIV. This will enable the ARM-state integer division instructions." _idiv_broken) - _enable_or_disable(LSE "lse" "Use LSE. This will enable the Large System Extension instructions." _lse_broken) - _enable_or_disable(MVE "mve" "Use MVE. This will enable the M-Profile Vector Extension (MVE) integer instructions." _mve_broken) - _enable_or_disable(MVE_FP "mve.fp" "Use MVE.FP. This will enable the M-Profile Vector Extension (MVE) integer and single precision floating-point instructions." _mve_fp_broken) - _enable_or_disable(NEON "neon" "Use NEON. This will enable the Advanced SIMD (Neon) v1." _neon_broken) - _enable_or_disable(NEON_FP16 "neon-fp16" "Use NEON-FP16. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions, with the half-precision floating-point conversion operations." _neon_fp16_broken) - _enable_or_disable(NEON_VFPV4 "neon-vfpv4" "Use NEON-VFPV4. This will enable the Advanced SIMD (Neon) v2 and the VFPv4 floating-point instructions." _neon_vfpv4_broken) - _enable_or_disable(RAS "ras" "Use RAS. This will enable the Reliability, Availability, and Serviceability extension." _ras_broken) - _enable_or_disable(RCPC "rcpc" "Use RCPC. This will enable the RcPc extension." _rcpc_broken) - _enable_or_disable(RDM "rdm" "Use RDM. This will enable the RDM extension." _rdm_broken) - _enable_or_disable(RDMA "rdma" "Use RDMA. This will enable the Round Double Multiply Accumulate instructions." _rdma_broken) - _enable_or_disable(SEC "sec" "Use SEC. This will enable the security extension." _sec_broken) - _enable_or_disable(SHA2 "sha2" "Use SHA2. This will enable the sha2 crypto extension." _sha2_broken) - _enable_or_disable(SHA3 "sha3" "Use SHA3. This will enable the sha512 and sha3 crypto extension." _sha3_broken) - _enable_or_disable(SIMD "simd" "Use SIMD. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions." 
_simd_broken) - _enable_or_disable(SM4 "sm4" "Use SM4. This will enable the the sm3 and sm4 crypto extension." _sm4_broken) - _enable_or_disable(SVE "sve" "Use SVE. This will enable the Scalable Vector Extension instructions." _sve_broken) - _enable_or_disable(VFPV3 "vfpv3" "Use VPFV3. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers." _vfpv3_broken) - _enable_or_disable(VFPV3_D16 "vfpv3-d16" "Use VPFV3-16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_broken) - _enable_or_disable(VFPV3_D16_FP16 "vfpv3-d16-fp16" "Use VPFV3-D16-FP16. This will enable VFPv3 floating-point instructions, with 16 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_fp16_broken) - _enable_or_disable(VFPV3_FP16 "vfpv3-fp16" "Use VPFV3-FP16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_fp16_broken) - _enable_or_disable(VFPV4 "vfpv4" "Use VPFV4. This will enable the VFPv4 floating-point instructions, with 32 double-precision registers." _vfpv4_broken) - _enable_or_disable(VFPV4_D16 "vfpv4-d16" "Use VPFV4-D16. This will enable the VFPv4 floating-point instructions, with 16 double-precision registers." _vfpv4_dp16_broken) - _enable_or_disable(ZCM "zcm" "Use ZCM. This will enable the ZCM extension." _zcm_broken) - _enable_or_disable(ZCZ "zcz" "Use ZCZ. This will enable the ZCZ extension." 
_zcz_broken) - foreach(_march ${_march_flag_list}) - + # Set -march flag + foreach(_march ${_march_flag_list}) AddCompilerFlag("-march=${_march}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) if(_good) set(_march_plus_extensions "${_march}") @@ -2319,6 +2389,7 @@ macro(OFA_HandleArmOptions) endif(_good) endforeach(_march) + # Set -mtune flag foreach(_mtune ${_mtune_flag_list}) AddCompilerFlag("-mtune=${_mtune}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) if(_good) @@ -2326,6 +2397,11 @@ macro(OFA_HandleArmOptions) endif(_good) endforeach(_mtune) endif(NOT _good) + + # Note that ARM does not support -mfeature and -mno-feature to + # enable and disable specific features. Hence, there are no + # loops over the _enable_vector_unit_list and + # _disable_vector_unit_list lists here(!) endif() endif() @@ -2337,51 +2413,124 @@ endmacro(OFA_HandleArmOptions) macro(OFA_HandlePpcOptions) set(_march_flag_list) - if(TARGET_ARCHITECTURE STREQUAL "power3") + set(_available_vector_units_list) + + # Define macros for PowerPC64 + macro(_power3) list(APPEND _march_flag_list "power3") - elseif(TARGET_ARCHITECTURE STREQUAL "power4") + endmacro() + macro(_power4) list(APPEND _march_flag_list "power4") - elseif(TARGET_ARCHITECTURE STREQUAL "power5") + _power3() + endmacro() + macro(_power5) list(APPEND _march_flag_list "power5") - elseif(TARGET_ARCHITECTURE STREQUAL "power5+") + _power4() + endmacro() + macro(_power5plus) list(APPEND _march_flag_list "power5+") - elseif(TARGET_ARCHITECTURE STREQUAL "power6") + _power5() + endmacro() + macro(_power6) list(APPEND _march_flag_list "power6") - elseif(TARGET_ARCHITECTURE STREQUAL "power6x") + _power5() + endmacro() + macro(_power6x) list(APPEND _march_flag_list "power6x") - elseif(TARGET_ARCHITECTURE STREQUAL "power7") + _power6() + endmacro() + macro(_power7) list(APPEND _march_flag_list "power7") - if(TARGET_ARCHITECTURE STREQUAL "power8") + _power6() + endmacro() + macro(_power8) list(APPEND _march_flag_list "power8") list(APPEND 
_march_flag_list "pwr8") - elseif(TARGET_ARCHITECTURE STREQUAL "power9") + _power7() + endmacro() + macro(_power9) list(APPEND _march_flag_list "power9") - list(APPEND _march_flag_list "power8") list(APPEND _march_flag_list "pwr9") - list(APPEND _march_flag_list "pwr8") - elseif(TARGET_ARCHITECTURE STREQUAL "power10") + _power8() + endmacro() + macro(_power10) list(APPEND _march_flag_list "power10") - list(APPEND _march_flag_list "power9") - list(APPEND _march_flag_list "power8") list(APPEND _march_flag_list "pwr10") - list(APPEND _march_flag_list "pwr9") - list(APPEND _march_flag_list "pwr8") + _power9() + endmacro() + + # PowerPC64 + if(TARGET_ARCHITECTURE STREQUAL "power3") + _power3() + elseif(TARGET_ARCHITECTURE STREQUAL "power4") + _power4() + elseif(TARGET_ARCHITECTURE STREQUAL "power5") + _power5() + elseif(TARGET_ARCHITECTURE STREQUAL "power5+") + _power5plus() + elseif(TARGET_ARCHITECTURE STREQUAL "power6") + _power6() + elseif(TARGET_ARCHITECTURE STREQUAL "power6x") + _power6x() + elseif(TARGET_ARCHITECTURE STREQUAL "power7") + _power7() + elseif(TARGET_ARCHITECTURE STREQUAL "power8") + _power8() + elseif(TARGET_ARCHITECTURE STREQUAL "power9") + _power9() + elseif(TARGET_ARCHITECTURE STREQUAL "power10") + _power10() + + # Others + elseif(TARGET_ARCHITECTURE STREQUAL "generic") + list(APPEND _march_flag_list "generic") + elseif(TARGET_ARCHITECTURE STREQUAL "native") + list(APPEND _march_flag_list "native") + elseif(TARGET_ARCHITECTURE STREQUAL "none") + # add this clause to remove it from the else clause + + else() + message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". 
Please set TARGET_ARCHITECTURE to a supported value.") endif() - foreach(_flag ${_march_flag_list}) - if(CMAKE_CXX_COMPILER MATCHES "/(pgcc|pgc\\+\\+)$") - # PGI (on Linux) - AddCompilerFlag("-tp=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + # Special treatment for "native" + if(TARGET_ARCHITECTURE STREQUAL "native") + + # Apply architecture flags + elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") + + # Disable "broken" features based on OFA_xxx_INTRINSICS_BROKEN options + set(_disable_vector_unit_list) + set(_enable_vector_unit_list) + + # Enable/disable macro + macro(_enable_or_disable _name _flag _documentation _broken) + if(_broken) + set(_found false) + else() + _my_find(_available_vector_units_list "${_flag}" _found) + endif() + set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) + mark_as_advanced(USE_${_name}) + if(USE_${_name}) + list(APPEND _enable_vector_unit_list "${_flag}") + else() + list(APPEND _disable_vector_unit_list "${_flag}") + endif() + endmacro() + + # Enable/disable features + + # Add compiler flags + if(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC") + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "XL") + else() - AddCompilerFlag("-mcpu=${_mcpu}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + # Others: GNU, Clang and variants + + endif() - if(_good) - break() - endif(_good) - endforeach(_flag) - - if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") - AddCompilerFlag("-target powerpcle-unknown-linux-gnu" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) endif() endmacro(OFA_HandlePpcOptions) From f81c595454fef22c3c879f40972b06c1807985bc Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Mon, 17 Jan 2022 20:30:36 +0100 Subject: [PATCH 139/174] Update OFA --- cmake/OptimizeForArchitecture.cmake | 2475 +---------------- cmake/{ => ofa}/AddCompilerFlag.cmake | 9 +- cmake/ofa/AutodetectArm.cmake | 390 +++ cmake/ofa/AutodetectPpc.cmake | 55 + cmake/ofa/AutodetectX86.cmake | 356 +++ cmake/{ => ofa}/CheckCCompilerFlag.cmake | 0 cmake/{ => 
ofa}/CheckCXXCompilerFlag.cmake | 0 cmake/{ => ofa}/CheckMicCCompilerFlag.cmake | 0 cmake/{ => ofa}/CheckMicCXXCompilerFlag.cmake | 0 cmake/ofa/HandleArmOptions.cmake | 926 ++++++ cmake/ofa/HandlePpcOptions.cmake | 139 + cmake/ofa/HandleX86Options.cmake | 803 ++++++ cmake/ofa/IntrinsicsX86.txt | 92 + 13 files changed, 2790 insertions(+), 2455 deletions(-) rename cmake/{ => ofa}/AddCompilerFlag.cmake (96%) create mode 100644 cmake/ofa/AutodetectArm.cmake create mode 100644 cmake/ofa/AutodetectPpc.cmake create mode 100644 cmake/ofa/AutodetectX86.cmake rename cmake/{ => ofa}/CheckCCompilerFlag.cmake (100%) rename cmake/{ => ofa}/CheckCXXCompilerFlag.cmake (100%) rename cmake/{ => ofa}/CheckMicCCompilerFlag.cmake (100%) rename cmake/{ => ofa}/CheckMicCXXCompilerFlag.cmake (100%) create mode 100644 cmake/ofa/HandleArmOptions.cmake create mode 100644 cmake/ofa/HandlePpcOptions.cmake create mode 100644 cmake/ofa/HandleX86Options.cmake create mode 100644 cmake/ofa/IntrinsicsX86.txt diff --git a/cmake/OptimizeForArchitecture.cmake b/cmake/OptimizeForArchitecture.cmake index 2bbb5d2b75..a6bb8e9460 100644 --- a/cmake/OptimizeForArchitecture.cmake +++ b/cmake/OptimizeForArchitecture.cmake @@ -17,22 +17,9 @@ # TARGET_ARCHITECTURE= specifies the target architecture (default=auto) # TARGET_PROFILER= specifies the target profiler (default=none) # -# If any of these flags are defined and set, the OptimizeForArchitecture -# macro will consequently disable the relevant features via compiler flags. -# -# For x86_64/x68: -# OFA_AVX512_INTRINSICS_BROKEN -# OFA_AVX2_INTRINSICS_BROKEN -# OFA_AVX_INTRINSICS_BROKEN -# OFA_FMA4_INTRINSICS_BROKEN -# OFA_SSE_INTRINSICS_BROKEN -# OFA_XOP_INTRINSICS_BROKEN -# -# For ARM: -# no options defined yet -# -# For PPC64: -# no options defined yet +# If any of the _broken flags are defined and set to true, +# the OptimizeForArchitecture macro will consequently disable the +# relevant features via compiler flags. 
# # Output: # OFA_ARCHITECTURE_FLAGS compiler flags optimized for the target architecture @@ -74,9 +61,13 @@ # - Added Support for ARM (Clang, GCC, ARM Clang, Cray, Fujitsu) #============================================================================= -get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) -include("${_currentDir}/AddCompilerFlag.cmake") -include(CheckIncludeFileCXX) +include(ofa/AutodetectX86) +include(ofa/AutodetectArm) +include(ofa/AutodetectPpc) + +include(ofa/HandleX86Options) +include(ofa/HandleArmOptions) +include(ofa/HandlePpcOptions) macro(_my_find _list _value _ret) list(FIND ${_list} "${_value}" _found) @@ -88,2462 +79,44 @@ macro(_my_find _list _value _ret) endmacro(_my_find) #============================================================================= -# Autodetection of CPUs -# -# This is a two-step process: -# -# 1. Get the CPUID from the system by reading /proc/cpuconfig (on -# Linux), the system registry (on Windows), or executing an -# OS-specific command (macOS, BSD, AIX, SunOS, ...) -# -# 2. 
Determine the specific CPU from the CPUID +# Autodetection of CPU #============================================================================= -macro(OFA_AutodetectX86) - set(_vendor_id) - set(_cpu_family) - set(_cpu_model) - set(_cpu_stepping) - - # Get CPUID from system - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - - # Linux - file(READ "/proc/cpuinfo" _cpuinfo) - string(REGEX REPLACE ".*vendor_id[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _vendor_id "${_cpuinfo}") - string(REGEX REPLACE ".*cpu family[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_family "${_cpuinfo}") - string(REGEX REPLACE ".*model[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_model "${_cpuinfo}") - string(REGEX REPLACE ".*stepping[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_stepping "${_cpuinfo}") - string(REGEX REPLACE ".*flags[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") - - elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - - # macOS - exec_program("/usr/sbin/sysctl -n machdep.cpu.vendor machdep.cpu.family machdep.cpu.model machdep.cpu.stepping machdep.cpu.features" - OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) - if(NOT _error) - string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) - list(GET _sysctl_output 0 _vendor_id) - list(GET _sysctl_output 1 _cpu_family) - list(GET _sysctl_output 2 _cpu_model) - list(GET _sysctl_output 3 _cpu_stepping) - list(GET _sysctl_output 4 _cpu_flags) - string(TOLOWER "${_cpu_flags}" _cpu_flags) - string(REPLACE "." "_" _cpu_flags "${_cpu_flags}") - else() - # Apple Silicon (ARM64) running in Rosetta 2 mode - # - # The regular detection mechanism for macOS-x64_86 does not work - # because the emulated CPU does not provide the required - # information via the sysctl command. We therefore generate fake - # vendor, model, and stepping information based on the - # macOS-specific CPU codes. 
- exec_program("/usr/sbin/sysctl -n hw.cputype machdep.cpu.family hw.cpufamily machdep.cpu.features" - OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) - if(NOT _error) - string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) - list(GET _sysctl_output 0 _cpu_implementer) - list(GET _sysctl_output 1 _cpu_family) - list(GET _sysctl_output 2 _cpu_model) - list(GET _sysctl_output 3 _cpu_flags) - string(TOLOWER "${_cpu_flags}" _cpu_flags) - string(REPLACE "." "_" _cpu_flags "${_cpu_flags}") - - # Fake vendor - if(_cpu_implementer STREQUAL "0x7" OR _cpu_implementer STREQUAL "7") - set(_vendor_id "GenuineIntel") - else() - set(_vendor_id "Unknown") - endif() - - # Fake stepping - set(_cpu_stepping "Unknown") - - # Fake model - # Taken from /Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/System/Library/Frameworks/Kernel.framework/Versions/A/Headers/mach/machine.h - if( _cpu_model STREQUAL "0x78ea4fbc" OR _cpu_model STREQUAL "2028621756") # Penryn - set(_cpu_model "23") - elseif(_cpu_model STREQUAL "0x6b5a4cd2" OR _cpu_model STREQUAL "1801080018") # Nehalem - set(_cpu_model "26") - elseif(_cpu_model STREQUAL "0x573b5eec" OR _cpu_model STREQUAL "1463508716") # Westmere - set(_cpu_model "37") - elseif(_cpu_model STREQUAL "0x5490b78c" OR _cpu_model STREQUAL "1418770316") # Sandybridge - set(_cpu_model "42") - elseif(_cpu_model STREQUAL "0x1f65e835" OR _cpu_model STREQUAL "526772277") # Ivybridge - set(_cpu_model "58") - elseif(_cpu_model STREQUAL "0x10b282dc" OR _cpu_model STREQUAL "280134364") # Haswell - set(_cpu_model "60") - elseif(_cpu_model STREQUAL "0x582ed09c" OR _cpu_model STREQUAL "1479463068") # Broadwell - set(_cpu_model "61") - elseif(_cpu_model STREQUAL "0x37fc219f" OR _cpu_model STREQUAL "939270559") # Skylake - set(_cpu_model "78") - elseif(_cpu_model STREQUAL "0x0f817246" OR _cpu_model STREQUAL "260141638") # Kabylake - set(_cpu_model "142") - elseif(_cpu_model STREQUAL "0x38435547" OR _cpu_model STREQUAL "943936839") # Icelake - 
set(_cpu_model "125") - elseif(_cpu_model STREQUAL "0x1cf8a03e" OR _cpu_model STREQUAL "486055998") # Cometlake - set(_cpu_model "142") - else() - set(_cpu_model "Unknown") - endif() - endif() - endif() - if(_error) - message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") - endif() - - elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") - - # Windows - get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE) - get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE) - mark_as_advanced(_vendor_id _cpu_id) - string(REGEX REPLACE ".* Family ([0-9]+) .*" "\\1" _cpu_family "${_cpu_id}") - string(REGEX REPLACE ".* Model ([0-9]+) .*" "\\1" _cpu_model "${_cpu_id}") - string(REGEX REPLACE ".* Stepping ([0-9]+) .*" "\\1" _cpu_mstepping "${_cpu_id}") - - # TODO: BSD, Android, QNX, ... - - else() - message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") - endif() - - # Determine CPU from CPUID - if(_vendor_id STREQUAL "GenuineIntel") - if(_cpu_family EQUAL 6) - # taken from the Intel ORM - # http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html - # CPUID Signature Values of Of Recent Intel Microarchitectures - # 4E 5E | Skylake microarchitecture - # 3D 47 56 | Broadwell microarchitecture - # 3C 45 46 3F | Haswell microarchitecture - # 3A 3E | Ivy Bridge microarchitecture - # 2A 2D | Sandy Bridge microarchitecture - # 25 2C 2F | Intel microarchitecture Westmere - # 1A 1E 1F 2E | Intel microarchitecture Nehalem - # 17 1D | Enhanced Intel Core microarchitecture - # 0F | Intel Core microarchitecture - # - # Intel SDM Vol. 3C 35-1 / December 2016: - # 57 | Xeon Phi 3200, 5200, 7200 [Knights Landing] - # 85 | Future Xeon Phi - # 8E 9E | 7th gen. 
Core [Kaby Lake] - # 55 | Future Xeon [Skylake w/ AVX512] - # 4E 5E | 6th gen. Core / E3 v5 [Skylake w/o AVX512] - # 56 | Xeon D-1500 [Broadwell] - # 4F | Xeon E5 v4, E7 v4, i7-69xx [Broadwell] - # 47 | 5th gen. Core / Xeon E3 v4 [Broadwell] - # 3D | M-5xxx / 5th gen. [Broadwell] - # 3F | Xeon E5 v3, E7 v3, i7-59xx [Haswell-E] - # 3C 45 46 | 4th gen. Core, Xeon E3 v3 [Haswell] - # 3E | Xeon E5 v2, E7 v2, i7-49xx [Ivy Bridge-E] - # 3A | 3rd gen. Core, Xeon E3 v2 [Ivy Bridge] - # 2D | Xeon E5, i7-39xx [Sandy Bridge] - # 2F | Xeon E7 - # 2A | Xeon E3, 2nd gen. Core [Sandy Bridge] - # 2E | Xeon 7500, 6500 series - # 25 2C | Xeon 3600, 5600 series, Core i7, i5 and i3 - # - # Values from the Intel SDE: - # 5C | Goldmont - # 5A | Silvermont - # 57 | Knights Landing - # 66 | Cannonlake - # 55 | Skylake Server - # 4E | Skylake Client - # 3C | Broadwell (likely a bug in the SDE) - # 3C | Haswell - # - # Latest updates taken from https://en.wikichip.org/wiki/intel/cpuid - - # MIC architecture - if(_cpu_model EQUAL 133) - set(TARGET_ARCHITECTURE "knm") # Knights Mill - - elseif(_cpu_model EQUAL 87) - set(TARGET_ARCHITECTURE "knl") # Knights Landing - - # Small cores - elseif(_cpu_model EQUAL 134) - set(TARGET_ARCHITECTURE "tremont") - - elseif(_cpu_model EQUAL 122) - set(TARGET_ARCHITECTURE "goldmont-plus") - - elseif(_cpu_model EQUAL 92 OR _cpu_model EQUAL 95) - set(TARGET_ARCHITECTURE "goldmont") - - elseif(_cpu_model EQUAL 90 OR _cpu_model EQUAL 93 OR _cpu_model EQUAL 74 OR _cpu_model EQUAL 76 OR _cpu_model EQUAL 77 OR _cpu_model EQUAL 55) - set(TARGET_ARCHITECTURE "silvermont") - - elseif(_cpu_model EQUAL 28 OR _cpu_model EQUAL 38 OR _cpu_model EQUAL 39 OR _cpu_model EQUAL 53 OR _cpu_model EQUAL 54) - set(TARGET_ARCHITECTURE "bonnell") - - # Big cores - elseif(_cpu_model EQUAL 167) - set(TARGET_ARCHITECTURE "rocketlake") - - elseif(_cpu_model EQUAL 151 OR _cpu_model EQUAL 154) - set(TARGET_ARCHITECTURE "alderlake") - - elseif(_cpu_model EQUAL 143) - set(TARGET_ARCHITECTURE 
"sapphirerapids") - - elseif(_cpu_model EQUAL 142 OR _cpu_model EQUAL 158 OR _cpu_model EQUAL 165) - set(TARGET_ARCHITECTURE "kabylake") - - elseif(_cpu_model EQUAL 140) - set(TARGET_ARCHITECTURE "tigerlake") - - elseif(_cpu_model EQUAL 125 OR _cpu_model EQUAL 126) - set(TARGET_ARCHITECTURE "icelake") - - elseif(_cpu_model EQUAL 106 OR _cpu_model EQUAL 108) - set(TARGET_ARCHITECTURE "icelake-avx512") - - elseif(_cpu_model EQUAL 102) - set(TARGET_ARCHITECTURE "cannonlake") - - elseif(_cpu_model EQUAL 85) - if(_cpu_stepping LESS 5) - set(TARGET_ARCHITECTURE "skylake-avx512") - elseif(_cpu_stepping LESS 8) - set(TARGET_ARCHITECTURE "cascadelake") - else() - set(TARGET_ARCHITECTURE "cooperlake") - endif() - - elseif(_cpu_model EQUAL 78 OR _cpu_model EQUAL 94) - set(TARGET_ARCHITECTURE "skylake") - - elseif(_cpu_model EQUAL 61 OR _cpu_model EQUAL 71 OR _cpu_model EQUAL 79 OR _cpu_model EQUAL 86) - set(TARGET_ARCHITECTURE "broadwell") - - elseif(_cpu_model EQUAL 60 OR _cpu_model EQUAL 69 OR _cpu_model EQUAL 70 OR _cpu_model EQUAL 63) - set(TARGET_ARCHITECTURE "haswell") - - elseif(_cpu_model EQUAL 58 OR _cpu_model EQUAL 62) - set(TARGET_ARCHITECTURE "ivybridge") - - elseif(_cpu_model EQUAL 42 OR _cpu_model EQUAL 45) - set(TARGET_ARCHITECTURE "sandybridge") - - elseif(_cpu_model EQUAL 37 OR _cpu_model EQUAL 44 OR _cpu_model EQUAL 47) - set(TARGET_ARCHITECTURE "westmere") - - elseif(_cpu_model EQUAL 26 OR _cpu_model EQUAL 30 OR _cpu_model EQUAL 31 OR _cpu_model EQUAL 46) - set(TARGET_ARCHITECTURE "nehalem") - - elseif(_cpu_model EQUAL 23 OR _cpu_model EQUAL 29) - set(TARGET_ARCHITECTURE "penryn") - - elseif(_cpu_model EQUAL 15 OR _cpu_model EQUAL 22) - set(TARGET_ARCHITECTURE "merom") - - elseif(_cpu_model EQUAL 28) - set(TARGET_ARCHITECTURE "atom") - - elseif(_cpu_model EQUAL 14) - set(TARGET_ARCHITECTURE "core") - - elseif(_cpu_model LESS 14) - message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. 
Auto-detection of optimization flags failed and will use the generic CPU settings with SSE2.") - set(TARGET_ARCHITECTURE "generic") - else() - message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. Auto-detection of optimization flags failed and will use the 65nm Core 2 CPU settings.") - set(TARGET_ARCHITECTURE "merom") - endif() - - elseif(_cpu_family EQUAL 7) # Itanium (not supported) - message(WARNING "Your CPU (Itanium: family ${_cpu_family}, model ${_cpu_model}) is not supported by OptimizeForArchitecture.cmake.") - - elseif(_cpu_family EQUAL 15) # NetBurst - list(APPEND _available_vector_units_list "sse" "sse2") - if(_cpu_model GREATER 2) # Not sure whether this must be 3 or even 4 instead - list(APPEND _available_vector_units_list "sse" "sse2" "sse3") - endif() - - endif() - - elseif(_vendor_id STREQUAL "AuthenticAMD") - # taken from the list of AMD CPU microarchitectures - # https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures - # CPUID Signature Values of Of Recent AMD Microarchitectures - # 05 05h | K6 - # 06 06h | K7 - # 15 0Fh | K8 / Hammer - # 16 10h | K10 - # 17 11h | K8 & K10 "hybrid" - # 18 12h | K10 (Llano) / K12 (ARM based AMD cpu) - # 20 14h | Bobcat - # 21 15h | Bulldozer / Piledriver / Steamroller / Excavator - # 22 16h | Jaguar / Puma - # 23 17h | Zen / Zen+ / Zen 2 - # 24 18h | Hygon Dhyana - # 25 19h | Zen 3 - - if(_cpu_family EQUAL 25) # 19h - set(TARGET_ARCHITECTURE "zen3") - - elseif(_cpu_family EQUAL 24) # 18h - set(TARGET_ARCHITECTURE "zen") - - elseif(_cpu_family EQUAL 23) # 17h - if(_cpu_model LESS 49) - set(TARGET_ARCHITECTURE "zen") - else() - set(TARGET_ARCHITECTURE "zen2") - endif() - - elseif(_cpu_family EQUAL 22) # 16h - set(TARGET_ARCHITECTURE "amd16h") - - elseif(_cpu_family EQUAL 21) # 15h - if(_cpu_model LESS 16) - set(TARGET_ARCHITECTURE "bulldozer") - elseif(_cpu_model LESS 32) - set(TARGET_ARCHITECTURE "piledriver") - elseif(_cpu_model LESS 64) - set(TARGET_ARCHITECTURE 
"steamroller") - else() - set(TARGET_ARCHITECTURE "excavator") - endif() - - elseif(_cpu_family EQUAL 20) # 14h - set(TARGET_ARCHITECTURE "amd14h") - - elseif(_cpu_family EQUAL 18) # 12h (K10 / K12) - - elseif(_cpu_family EQUAL 17) # 12h (K8 & K10 hybrid) - - elseif(_cpu_family EQUAL 16) # 10h (K10) - set(TARGET_ARCHITECTURE "barcelona") - - elseif(_cpu_family EQUAL 15) # 0Fh (K8 / Hammer) - if(_cpu_model LESS 39) - set(TARGET_ARCHITECTURE "k8") - else() - set(TARGET_ARCHITECTURE "k8-sse3") - endif() - - elseif(_cpu_family EQUAL 6) # 06h (K7) - elseif(_cpu_family EQUAL 5) # 05h (K6) - - endif() - - else() - message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") - endif() -endmacro(OFA_AutodetectX86) - -macro(OFA_AutodetectArm) - set(_cpu_implementer) - set(_cpu_architecture) - set(_cpu_variant) - set(_cpu_part) - set(_cpu_revision) - - # Get CPUID from system - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - - # Linux - file(READ "/proc/cpuinfo" _cpuinfo) - string(REGEX REPLACE ".*CPU implementer[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_implementer "${_cpuinfo}") - string(REGEX REPLACE ".*CPU architecture[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_architecture "${_cpuinfo}") - string(REGEX REPLACE ".*CPU variant[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_variant "${_cpuinfo}") - string(REGEX REPLACE ".*CPU part[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_part "${_cpuinfo}") - string(REGEX REPLACE ".*CPU revision[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_revision "${_cpuinfo}") - string(REGEX REPLACE ".*Features[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") - elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - exec_program("/usr/sbin/sysctl -n hw.cputype hw.cputype hw.cpusubtype hw.cpufamily hw.cpusubfamily" - OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) - if(NOT _error) - string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) - list(GET _sysctl_output 0 _cpu_implementer) - list(GET _sysctl_output 
1 _cpu_architecture) - list(GET _sysctl_output 2 _cpu_variant) - list(GET _sysctl_output 3 _cpu_part) - list(GET _sysctl_output 4 _cpu_revision) - endif() - if(_error) - message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") - endif() - - # TODO: Windows, FreeBSD, ... - - else() - message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") - endif() - - # Determine CPU from CPUID - # Taken from https://github.com/karelzak/util-linux/blob/master/sys-utils/lscpu-arm.c - # and https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html - - # ARM - if(_cpu_implementer STREQUAL "0x41") - if(_cpu_part STREQUAL "0x810") - set(TARGET_ARCHITECTURE "arm810") - elseif(_cpu_part STREQUAL "0x920") - set(TARGET_ARCHITECTURE "arm920t") - elseif(_cpu_part STREQUAL "0x922") - set(TARGET_ARCHITECTURE "arm922t") - elseif(_cpu_part STREQUAL "0x926") - set(TARGET_ARCHITECTURE "arm926ej-s") - elseif(_cpu_part STREQUAL "0x940") - set(TARGET_ARCHITECTURE "arm940t") - elseif(_cpu_part STREQUAL "0x946") - set(TARGET_ARCHITECTURE "arm946e-s") - elseif(_cpu_part STREQUAL "0x966") - set(TARGET_ARCHITECTURE "arm966e-s") - elseif(_cpu_part STREQUAL "0xa20") - set(TARGET_ARCHITECTURE "arm1020e") - elseif(_cpu_part STREQUAL "0xa22") - set(TARGET_ARCHITECTURE "arm1022e") - elseif(_cpu_part STREQUAL "0xa26") - set(TARGET_ARCHITECTURE "arm1026ej-s") - elseif(_cpu_part STREQUAL "0xb02") - set(TARGET_ARCHITECTURE "mpcore") - elseif(_cpu_part STREQUAL "0xb36") - set(TARGET_ARCHITECTURE "arm1136jf-s") - elseif(_cpu_part STREQUAL "0xb56") - set(TARGET_ARCHITECTURE "arm1156t2f-s") - elseif(_cpu_part STREQUAL "0xb76") - set(TARGET_ARCHITECTURE "arm1176jzf-s") - elseif(_cpu_part STREQUAL "0xc05") - set(TARGET_ARCHITECTURE "cortex-a5") - elseif(_cpu_part STREQUAL "0xc07") - set(TARGET_ARCHITECTURE "cortex-a7") - elseif(_cpu_part STREQUAL "0xc08") - set(TARGET_ARCHITECTURE "cortex-a8") - elseif(_cpu_part 
STREQUAL "0xc09") - set(TARGET_ARCHITECTURE "cortex-a9") - elseif(_cpu_part STREQUAL "0xc0d") - set(TARGET_ARCHITECTURE "cortex-a12") - elseif(_cpu_part STREQUAL "0xc0f") - set(TARGET_ARCHITECTURE "cortex-a15") - elseif(_cpu_part STREQUAL "0xc0e") - set(TARGET_ARCHITECTURE "cortex-a17") - elseif(_cpu_part STREQUAL "0xc14") - set(TARGET_ARCHITECTURE "cortex-r4f") - elseif(_cpu_part STREQUAL "0xc15") - set(TARGET_ARCHITECTURE "cortex-r5") - elseif(_cpu_part STREQUAL "0xc17") - set(TARGET_ARCHITECTURE "cortex-r7") - elseif(_cpu_part STREQUAL "0xc18") - set(TARGET_ARCHITECTURE "cortex-r8") - elseif(_cpu_part STREQUAL "0xc20") - set(TARGET_ARCHITECTURE "cortex-m0") - elseif(_cpu_part STREQUAL "0xc21") - set(TARGET_ARCHITECTURE "cortex-m1") - elseif(_cpu_part STREQUAL "0xc23") - set(TARGET_ARCHITECTURE "cortex-m3") - elseif(_cpu_part STREQUAL "0xc24") - set(TARGET_ARCHITECTURE "cortex-m4") - elseif(_cpu_part STREQUAL "0xc27") - set(TARGET_ARCHITECTURE "cortex-m7") - elseif(_cpu_part STREQUAL "0xc60") - set(TARGET_ARCHITECTURE "cortex-m0plus") - elseif(_cpu_part STREQUAL "0xd01") - set(TARGET_ARCHITECTURE "cortex-a32") - elseif(_cpu_part STREQUAL "0xd02") - set(TARGET_ARCHITECTURE "cortex-a34") - elseif(_cpu_part STREQUAL "0xd03") - set(TARGET_ARCHITECTURE "cortex-a53") - elseif(_cpu_part STREQUAL "0xd04") - set(TARGET_ARCHITECTURE "cortex-a35") - elseif(_cpu_part STREQUAL "0xd05") - set(TARGET_ARCHITECTURE "cortex-a55") - elseif(_cpu_part STREQUAL "0xd07") - set(TARGET_ARCHITECTURE "cortex-a57") - elseif(_cpu_part STREQUAL "0xd08") - set(TARGET_ARCHITECTURE "cortex-a72") - elseif(_cpu_part STREQUAL "0xd09") - set(TARGET_ARCHITECTURE "cortex-a73") - elseif(_cpu_part STREQUAL "0xd0a") - set(TARGET_ARCHITECTURE "cortex-a75") - elseif(_cpu_part STREQUAL "0xd0b") - set(TARGET_ARCHITECTURE "cortex-a76") - elseif(_cpu_part STREQUAL "0xd0c") - set(TARGET_ARCHITECTURE "neoverse-n1") - elseif(_cpu_part STREQUAL "0xd0d") - set(TARGET_ARCHITECTURE "cortex-a77") - elseif(_cpu_part 
STREQUAL "0xd0e") - set(TARGET_ARCHITECTURE "cortex-a76ae") - elseif(_cpu_part STREQUAL "0xd13") - set(TARGET_ARCHITECTURE "cortex-r52") - elseif(_cpu_part STREQUAL "0xd20") - set(TARGET_ARCHITECTURE "cortex-m23") - elseif(_cpu_part STREQUAL "0xd21") - set(TARGET_ARCHITECTURE "cortex-m33") - elseif(_cpu_part STREQUAL "0xd40") - set(TARGET_ARCHITECTURE "neoverse-v1") - elseif(_cpu_part STREQUAL "0xd41") - set(TARGET_ARCHITECTURE "cortex-a78") - elseif(_cpu_part STREQUAL "0xd42") - set(TARGET_ARCHITECTURE "cortex-a78ae") - elseif(_cpu_part STREQUAL "0xd44") - set(TARGET_ARCHITECTURE "cortex-x1") - elseif(_cpu_part STREQUAL "0xd46") - set(TARGET_ARCHITECTURE "cortex-a510") - elseif(_cpu_part STREQUAL "0xd47") - set(TARGET_ARCHITECTURE "cortex-a710") - elseif(_cpu_part STREQUAL "0xd48") - set(TARGET_ARCHITECTURE "cortex-x2") - elseif(_cpu_part STREQUAL "0xd49") - set(TARGET_ARCHITECTURE "neoverse-n2") - elseif(_cpu_part STREQUAL "0xd4a") - set(TARGET_ARCHITECTURE "neoverse-e1") - elseif(_cpu_part STREQUAL "0xd4b") - set(TARGET_ARCHITECTURE "cortex-a78c") - endif() - - # Broadcom - elseif(_cpu_implementer STREQUAL "0x42") - if(_cpu_part STREQUAL "0x0f") - set(TARGET_ARCHITECTURE "brahma-b15") - elseif(_cpu_part STREQUAL "0x100") - set(TARGET_ARCHITECTURE "brahma-b53") - elseif(_cpu_part STREQUAL "0x516") - set(TARGET_ARCHITECTURE "thunderx2") - endif() - - # Cavium - elseif(_cpu_implementer STREQUAL "0x43") - if(_cpu_part STREQUAL "0x0a0") - set(TARGET_ARCHITECTURE "thunderx") - elseif(_cpu_part STREQUAL "0x0a1") - set(TARGET_ARCHITECTURE "thunderxt88") - elseif(_cpu_part STREQUAL "0x0a2") - set(TARGET_ARCHITECTURE "thunderxt81") - elseif(_cpu_part STREQUAL "0x0a3") - set(TARGET_ARCHITECTURE "thunderxt83") - elseif(_cpu_part STREQUAL "0x0af") - set(TARGET_ARCHITECTURE "thunderx2t99") - endif() - - # DEC - elseif(_cpu_implementer STREQUAL "0x44") - if(_cpu_part STREQUAL "0xa10") - set(TARGET_ARCHITECTURE "strongarm110") - elseif(_cpu_part STREQUAL "0xa11") - 
set(TARGET_ARCHITECTURE "strongarm1100") - endif() - - # FUJITSU - elseif(_cpu_implementer STREQUAL "0x46") - if(_cpu_part STREQUAL "0x001") - set(TARGET_ARCHITECTURE "a64fx") - endif() - - # HiSilicon - elseif(_cpu_implementer STREQUAL "0x48") - if(_cpu_part STREQUAL "0xd01") - set(TARGET_ARCHITECTURE "tsv110") - endif() - - # Infineon - elseif(_cpu_implementer STREQUAL "0x49") - - # Motorola/Freescale - elseif(_cpu_implementer STREQUAL "0x4d") - - # Nvidia - elseif(_cpu_implementer STREQUAL "0x4e") - if(_cpu_part STREQUAL "0x000") - set(TARGET_ARCHITECTURE "denver") - elseif(_cpu_part STREQUAL "0x003") - set(TARGET_ARCHITECTURE "denver2") - elseif(_cpu_part STREQUAL "0x004") - set(TARGET_ARCHITECTURE "carmel") - endif() - - # APM - elseif(_cpu_implementer STREQUAL "0x50") - if(_cpu_part STREQUAL "0x000") - set(TARGET_ARCHITECTURE "xgene1") - endif() - - # Qualcomm - elseif(_cpu_implementer STREQUAL "0x51") - if(_cpu_part STREQUAL "0x00f") - set(TARGET_ARCHITECTURE "scorpion") - elseif(_cpu_part STREQUAL "0x02d") - set(TARGET_ARCHITECTURE "scorpion") - elseif(_cpu_part STREQUAL "0x04d") - set(TARGET_ARCHITECTURE "krait") - elseif(_cpu_part STREQUAL "0x06f") - set(TARGET_ARCHITECTURE "krait") - elseif(_cpu_part STREQUAL "0x201") - set(TARGET_ARCHITECTURE "kryo") - elseif(_cpu_part STREQUAL "0x205") - set(TARGET_ARCHITECTURE "kryo") - elseif(_cpu_part STREQUAL "0x211") - set(TARGET_ARCHITECTURE "kryo") - elseif(_cpu_part STREQUAL "0x800") - set(TARGET_ARCHITECTURE "falkor") - elseif(_cpu_part STREQUAL "0x801") - set(TARGET_ARCHITECTURE "kryo2") - elseif(_cpu_part STREQUAL "0xc00") - set(TARGET_ARCHITECTURE "falkor") - elseif(_cpu_part STREQUAL "0xc01") - set(TARGET_ARCHITECTURE "saphira") - endif() - - # Samsung - elseif(_cpu_implementer STREQUAL "0x53") - if(_cpu_part STREQUAL "0x001") - set(TARGET_ARCHITECTURE "exynos-m1") - endif() - - # Marvell - elseif(_cpu_implementer STREQUAL "0x56") - if(_cpu_part STREQUAL "0x131") - set(TARGET_ARCHITECTURE "marvell-f") - 
elseif(_cpu_part STREQUAL "0x581") - set(TARGET_ARCHITECTURE "marvell-pj4") - elseif(_cpu_part STREQUAL "0x584") - set(TARGET_ARCHITECTURE "marvell-pj4") - endif() - - # Apple - elseif(_cpu_implementer STREQUAL "0x61") - if(_cpu_part STREQUAL "0x022") - set(TARGET_ARCHITECTURE "icestorm") - elseif(_cpu_part STREQUAL "0x023") - set(TARGET_ARCHITECTURE "firestorm") - endif() - - # Faraday - elseif(_cpu_implementer STREQUAL "0x66") - if(_cpu_part STREQUAL "0x526") - set(TARGET_ARCHITECTURE "fa526") - elseif(_cpu_part STREQUAL "0x626") - set(TARGET_ARCHITECTURE "fa626") - endif() - - # Intel - elseif(_cpu_implementer STREQUAL "0x69") - if(_cpu_part STREQUAL "0x200") - set(TARGET_ARCHITECTURE "i80200") - elseif(_cpu_part STREQUAL "0x210") - set(TARGET_ARCHITECTURE "pxa250a") - elseif(_cpu_part STREQUAL "0x212") - set(TARGET_ARCHITECTURE "pxa210a") - elseif(_cpu_part STREQUAL "0x242") - set(TARGET_ARCHITECTURE "i80321-400") - elseif(_cpu_part STREQUAL "0x243") - set(TARGET_ARCHITECTURE "i80321-600") - elseif(_cpu_part STREQUAL "0x290") - set(TARGET_ARCHITECTURE "pxa250b") - elseif(_cpu_part STREQUAL "0x292") - set(TARGET_ARCHITECTURE "pxa210b") - elseif(_cpu_part STREQUAL "0x2c2") - set(TARGET_ARCHITECTURE "i80321-400-b0") - elseif(_cpu_part STREQUAL "0x2c3") - set(TARGET_ARCHITECTURE "i80321-600-b0") - elseif(_cpu_part STREQUAL "0x2d0") - set(TARGET_ARCHITECTURE "pxa250c") - elseif(_cpu_part STREQUAL "0x2d2") - set(TARGET_ARCHITECTURE "pxa210c") - elseif(_cpu_part STREQUAL "0x411") - set(TARGET_ARCHITECTURE "pxa27x") - elseif(_cpu_part STREQUAL "0x41c") - set(TARGET_ARCHITECTURE "ipx425-533") - elseif(_cpu_part STREQUAL "0x41d") - set(TARGET_ARCHITECTURE "ipx425-400") - elseif(_cpu_part STREQUAL "0x41f") - set(TARGET_ARCHITECTURE "ipx425-266") - elseif(_cpu_part STREQUAL "0x682") - set(TARGET_ARCHITECTURE "pxa32x") - elseif(_cpu_part STREQUAL "0x683") - set(TARGET_ARCHITECTURE "pxa930") - elseif(_cpu_part STREQUAL "0x688") - set(TARGET_ARCHITECTURE "pxa30x") - 
elseif(_cpu_part STREQUAL "0x689") - set(TARGET_ARCHITECTURE "pxa31x") - elseif(_cpu_part STREQUAL "0xb11") - set(TARGET_ARCHITECTURE "sa1110") - elseif(_cpu_part STREQUAL "0xc12") - set(TARGET_ARCHITECTURE "ipx1200") - endif() - - # Phytium - elseif(_cpu_implementer STREQUAL "0x70") - if(_cpu_part STREQUAL "0x662") - set(TARGET_ARCHITECTURE "ftc662") - elseif(_cpu_part STREQUAL "0x663") - set(TARGET_ARCHITECTURE "ftc663") - endif() - - # Ampere - elseif(_cpu_implementer STREQUAL "0xc0") - - # Taken from /Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/System/Library/Frameworks/Kernel.framework/Versions/A/Headers/mach/machine.h - elseif(_cpu_implementer STREQUAL "16777228" OR _cpu_implementer STREQUAL "0x100000C") # Apple ARM64 - if( _cpu_part STREQUAL "0x1e2d6381" OR _cpu_part STREQUAL "506291073") # Swift (A6) - set(TARGET_ARCHITECTURE "apple-a6") - elseif(_cpu_part STREQUAL "0x37a09642" OR _cpu_part STREQUAL "933271106") # Cyclone (A7) - set(TARGET_ARCHITECTURE "apple-a7") - elseif(_cpu_part STREQUAL "0x2c91a47e" OR _cpu_part STREQUAL "747742334") # Typhoon (A8) - set(TARGET_ARCHITECTURE "apple-a8") - elseif(_cpu_part STREQUAL "0x92fb37c8" OR _cpu_part STREQUAL "2465937352") # Twister (A9) - set(TARGET_ARCHITECTURE "apple-a9") - elseif(_cpu_part STREQUAL "0x67ceee93" OR _cpu_part STREQUAL "1741614739") # Hurrican (A10) - set(TARGET_ARCHITECTURE "apple-a10") - elseif(_cpu_part STREQUAL "0xe81e7ef6" OR _cpu_part STREQUAL "3894312694") # Monsoon Mistral (A11) - set(TARGET_ARCHITECTURE "apple-a11") - elseif(_cpu_part STREQUAL "0x07d34b9f" OR _cpu_part STREQUAL "131287967") # Vortex Tempest (A12) - set(TARGET_ARCHITECTURE "apple-a12") - elseif(_cpu_part STREQUAL "0x462504d2" OR _cpu_part STREQUAL "1176831186") # Lightning Thunder (A13) - set(TARGET_ARCHITECTURE "apple-a13") - elseif(_cpu_part STREQUAL "0x1b588bb3" OR _cpu_part STREQUAL "458787763") # Firestorm Icestorm (A14 / M1 / M1 Pro / M1 Max) - set(TARGET_ARCHITECTURE "apple-m1") - elseif(_cpu_part STREQUAL 
"0xda33d83d" OR _cpu_part STREQUAL "3660830781") # Blizzard Avalanche (A15) - endif() - - else() - message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") - endif() -endmacro(OFA_AutodetectArm) - -macro(OFA_AutodetectPpc) - set(_cpu) - - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - - # Linux - file(READ "/proc/cpuinfo" _cpuinfo) - string(REGEX REPLACE ".*cpu[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu "${_cpuinfo}") - if(_cpu STREQUAL "POWER3") - set(TARGET_ARCHITECTURE "power3") - elseif(_cpu STREQUAL "POWER4") - set(TARGET_ARCHITECTURE "power4") - elseif(_cpu STREQUAL "POWER5") - set(TARGET_ARCHITECTURE "power5") - elseif(_cpu STREQUAL "POWER5+") - set(TARGET_ARCHITECTURE "power5+") - elseif(_cpu STREQUAL "POWER6") - set(TARGET_ARCHITECTURE "power6") - elseif(_cpu STREQUAL "POWER6X") - set(TARGET_ARCHITECTURE "power6x") - elseif(_cpu STREQUAL "POWER7") - set(TARGET_ARCHITECTURE "power7") - elseif(_cpu STREQUAL "POWER8" OR _cpu STREQUAL "POWER8NVL") - set(TARGET_ARCHITECTURE "power8") - elseif(_cpu STREQUAL "POWER9" OR _cpu STREQUAL "POWER9NVL") - set(TARGET_ARCHITECTURE "power9") - elseif(_cpu STREQUAL "POWER10" OR _cpu STREQUAL "POWER10NVL") - set(TARGET_ARCHITECTURE "power10") - else() - message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") - endif() - - # TODO: AIX, FreeBSD, ... 
- - else() - message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") - endif() -endmacro(OFA_AutodetectPpc) - macro(OFA_AutodetectHostArchitecture) set(TARGET_ARCHITECTURE "generic") set(OFA_ARCHITECTURE_FLAGS) + if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "i686.*|i386.*|x86.*|amd64.*|x86_64.*|AMD64.*") OFA_AutodetectX86() elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(arm.*|ARM.*|aarch64.*|AARCH64.*)") OFA_AutodetectArm() elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(powerpc|ppc)64.*") -# OFA_AutodetectPpc() + OFA_AutodetectPpc() else() - message(WARNING "The CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}' is not supported by OptimizeForArchitecture.cmake") + message(WARNING "The CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}' is not supported by OptimizeForArchitecture") endif() endmacro(OFA_AutodetectHostArchitecture) #============================================================================= -# Handling of CPUs -# -# This is a two-step process: -# -# 1. Generate a list of compiler flags for the specific CPU -# -# 2. Special compiler-specific treatment of "native" flag -# -# 3. Disabling of "broken" features based on OFA_xxx_INTRINSICS_BROKEN options -# -# 4. 
Set compiler-specific flags +# Handling of CPU options #============================================================================= -macro(OFA_HandleX86Options) - set(_march_flag_list) - set(_available_vector_units_list) - - # Define macros for Intel - macro(_nehalem) - list(APPEND _march_flag_list "nehalem") - list(APPEND _march_flag_list "corei7") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "popcnt") - endmacro() - macro(_westmere) - list(APPEND _march_flag_list "westmere") - _nehalem() - list(APPEND _available_vector_units_list "aes" "pclmul") - endmacro() - macro(_sandybridge) - list(APPEND _march_flag_list "sandybridge") - list(APPEND _march_flag_list "corei7-avx") - _westmere() - list(APPEND _available_vector_units_list "avx") - endmacro() - macro(_ivybridge) - list(APPEND _march_flag_list "ivybridge") - list(APPEND _march_flag_list "core-avx-i") - _sandybridge() - list(APPEND _available_vector_units_list "rdrnd" "f16c" "fsgsbase") - endmacro() - macro(_haswell) - list(APPEND _march_flag_list "haswell") - list(APPEND _march_flag_list "core-avx2") - _ivybridge() - list(APPEND _available_vector_units_list "avx2" "fma" "bmi" "bmi2") - endmacro() - macro(_broadwell) - list(APPEND _march_flag_list "broadwell") - _haswell() - list(APPEND _available_vector_units_list "rdseed" "adcx" "prefetchw") - endmacro() - macro(_skylake) - list(APPEND _march_flag_list "skylake") - _broadwell() - list(APPEND _available_vector_units_list "clflushopt" "xsavec" "xsaves") - endmacro() - macro(_skylake_avx512) - list(APPEND _march_flag_list "skylake-avx512") - _skylake() - list(APPEND _available_vector_units_list "avx512bw" "avx512cd" "avx512dq" "avx512f" "avx512vl" "clwb" "pku") - endmacro() - macro(_cascadelake) - list(APPEND _march_flag_list "cascadelake") - _skylake_avx512() - list(APPEND _available_vector_units_list "avx512vnni") - endmacro() - macro(_cooperlake) - list(APPEND _march_flag_list 
"cooperlake") - _cascadelake() - list(APPEND _available_vector_units_list "avx512bf16") - endmacro() - macro(_cannonlake) - list(APPEND _march_flag_list "cannonlake") - _skylake() - list(APPEND _available_vector_units_list "avx512bw" "avx512cd" "avx512dq" "avx512f" "avx512vl" "clwb" "pku" "avx512ifma" "avx512vbmi" "sha" "umip") - endmacro() - macro(_icelake) - list(APPEND _march_flag_list "icelake-client") - _cannonlake() - list(APPEND _available_vector_units_list "avx512bitalg" "avx512vbmi2" "avx512vnni" "avx512vpopcntdq" "clwb" "gfni" "rdpid" "vaes" "vpclmulqdq") - endmacro() - macro(_icelake_avx512) - list(APPEND _march_flag_list "icelake-server") - _icelake() - list(APPEND _available_vector_units_list "pconfig" "wbnoinvd") - endmacro() - macro(_tigerlake) - list(APPEND _march_flag_list "tigerlake") - _icelake() - list(APPEND _available_vector_units_list "avx512vp2intersect" "keylocker" "movdir64b" "movdiri" "pconfig" "wbnoinvd") - endmacro() - macro(_alderlake) - list(APPEND _march_flag_list "alderlake") - _broadwell() - list(APPEND _available_vector_units_list "avx-vnni" "cldemote" "clwb" "gfni-sse" "hreset" "kl" "lzcnt" "movdir64b" "movdiri" "pconfig" "pku" "ptwrite" "rdpid" "serialize" "sgx" "umip" "vaes" "vpclmulqdq" "waitpkg" "widekl" "xsave" "xsavec" "xsaveopt" "xsaves") - endmacro() - macro(_sapphirerapids) - list(APPEND _march_flag_list "sapphirerapids") - _skylake_avx512() - list(APPEND _available_vector_units_list "amx-bf16" "amx-int8" "amx-tile" "avx-vnni" "avx512bf16" "avx512vnni" "avx512vp2intersect" "cldemote" "enqcmd" "movdir64b" "movdiri" "ptwrite" "serialize" "tsxldtrk" "uintr" "waitpkg") - endmacro() - macro(_rocketlake) - list(APPEND _march_flag_list "rocketlake") - _skylake_avx512() - list(APPEND _available_vector_units_list "avx512bitalg" "avx512ifma" "avx512vbmi" "avx512vbmi2" "avx512vnni" "avx512vpopcntdq" "gfni" "rdpid" "sha" "umip" "vaes" "vpclmulqdq") - endmacro() - macro(_knightslanding) - list(APPEND _march_flag_list "knl") - 
_broadwell() - list(APPEND _available_vector_units_list "avx512f" "avx512pf" "avx512er" "avx512cd") - endmacro() - macro(_knightsmill) - list(APPEND _march_flag_list "knm") - _broadwell() - list(APPEND _available_vector_units_list "avx512f" "avx512pf" "avx512er" "avx512cd" "avx5124fmaps" "avx5124vnni" "avx512vpopcntdq") - endmacro() - macro(_silvermont) - list(APPEND _march_flag_list "silvermont") - _westmere() - list(APPEND _available_vector_units_list "rdrnd") - endmacro() - macro(_goldmont) - list(APPEND _march_flag_list "goldmont") - _silvermont() - list(APPEND _available_vector_units_list "rdseed") - endmacro() - macro(_goldmont_plus) - list(APPEND _march_flag_list "goldmont-plus") - _goldmont() - list(APPEND _available_vector_units_list "rdpid") - endmacro() - macro(_tremont) - list(APPEND _march_flag_list "tremont") - _goldmont_plus() - endmacro() - - # TODO: Define similar macros for AMD - - # Intel - if(TARGET_ARCHITECTURE STREQUAL "core") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3") - elseif(TARGET_ARCHITECTURE STREQUAL "merom") - list(APPEND _march_flag_list "merom") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") - elseif(TARGET_ARCHITECTURE STREQUAL "penryn") - list(APPEND _march_flag_list "penryn") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") - message(STATUS "Sadly the Penryn architecture exists in variants with SSE4.1 and without SSE4.1.") - if(_cpu_flags MATCHES "sse4_1") - message(STATUS "SSE4.1: enabled (auto-detected from this computer's CPU flags)") - list(APPEND _available_vector_units_list "sse4.1") - else() - message(STATUS "SSE4.1: disabled (auto-detected from this computer's CPU flags)") - endif() - elseif(TARGET_ARCHITECTURE STREQUAL "knm") - _knightsmill() - elseif(TARGET_ARCHITECTURE STREQUAL "knl") - _knightslanding() - elseif(TARGET_ARCHITECTURE 
STREQUAL "rocketlake") - _rocketlake() - elseif(TARGET_ARCHITECTURE STREQUAL "sapphirerapids") - _sapphirerapids() - elseif(TARGET_ARCHITECTURE STREQUAL "alderlake") - _alderlake() - elseif(TARGET_ARCHITECTURE STREQUAL "tigerlake") - _tigerlake() - elseif(TARGET_ARCHITECTURE STREQUAL "icelake") - _icelake() - elseif(TARGET_ARCHITECTURE STREQUAL "icelake-xeon" OR TARGET_ARCHITECTURE STREQUAL "icelake-avx512") - _icelake_avx512() - elseif(TARGET_ARCHITECTURE STREQUAL "cannonlake") - _cannonlake() - elseif(TARGET_ARCHITECTURE STREQUAL "cooperlake") - _cooperlake() - elseif(TARGET_ARCHITECTURE STREQUAL "cascadelake") - _cascadelake() - elseif(TARGET_ARCHITECTURE STREQUAL "kabylake") - _skylake() - elseif(TARGET_ARCHITECTURE STREQUAL "skylake-xeon" OR TARGET_ARCHITECTURE STREQUAL "skylake-avx512") - _skylake_avx512() - elseif(TARGET_ARCHITECTURE STREQUAL "skylake") - _skylake() - elseif(TARGET_ARCHITECTURE STREQUAL "broadwell") - _broadwell() - elseif(TARGET_ARCHITECTURE STREQUAL "haswell") - _haswell() - elseif(TARGET_ARCHITECTURE STREQUAL "ivybridge") - _ivybridge() - elseif(TARGET_ARCHITECTURE STREQUAL "sandybridge") - _sandybridge() - elseif(TARGET_ARCHITECTURE STREQUAL "westmere") - _westmere() - elseif(TARGET_ARCHITECTURE STREQUAL "nehalem") - _nehalem() - elseif(TARGET_ARCHITECTURE STREQUAL "tremont") - _tremont() - elseif(TARGET_ARCHITECTURE STREQUAL "goldmont-plus") - _goldmont_plus() - elseif(TARGET_ARCHITECTURE STREQUAL "goldmont") - _goldmont() - elseif(TARGET_ARCHITECTURE STREQUAL "silvermont") - _silvermont() - elseif(TARGET_ARCHITECTURE STREQUAL "bonnell") - list(APPEND _march_flag_list "bonnell") - list(APPEND _march_flag_list "atom") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") - elseif(TARGET_ARCHITECTURE STREQUAL "atom") - list(APPEND _march_flag_list "atom") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3") - - # AMD - 
elseif(TARGET_ARCHITECTURE STREQUAL "k8") - list(APPEND _march_flag_list "k8") - list(APPEND _available_vector_units_list "sse" "sse2") - elseif(TARGET_ARCHITECTURE STREQUAL "k8-sse3") - list(APPEND _march_flag_list "k8-sse3") - list(APPEND _march_flag_list "k8") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3") - elseif(TARGET_ARCHITECTURE STREQUAL "amd16h") - list(APPEND _march_flag_list "btver2") - list(APPEND _march_flag_list "btver1") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "f16c") - elseif(TARGET_ARCHITECTURE STREQUAL "amd14h") - list(APPEND _march_flag_list "btver1") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "zen3") - list(APPEND _march_flag_list "znver2") - list(APPEND _march_flag_list "znver1") - _skylake() - list(APPEND _available_vector_units_list "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "zen2") - list(APPEND _march_flag_list "znver2") - list(APPEND _march_flag_list "znver1") - _skylake() - list(APPEND _available_vector_units_list "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "zen") - list(APPEND _march_flag_list "znver1") - _skylake() - list(APPEND _available_vector_units_list "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "excavator") - list(APPEND _march_flag_list "bdver4") - list(APPEND _march_flag_list "bdver3") - list(APPEND _march_flag_list "bdver2") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "avx2" "xop" "fma4" "fma" "f16c" "bmi" "bmi2" "rdrnd") - elseif(TARGET_ARCHITECTURE STREQUAL "steamroller") - list(APPEND _march_flag_list "bdver3") - list(APPEND _march_flag_list "bdver2") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - 
list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c" "bmi") - elseif(TARGET_ARCHITECTURE STREQUAL "piledriver") - list(APPEND _march_flag_list "bdver2") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c" "bmi") - elseif(TARGET_ARCHITECTURE STREQUAL "interlagos") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") - elseif(TARGET_ARCHITECTURE STREQUAL "bulldozer") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") - elseif(TARGET_ARCHITECTURE STREQUAL "barcelona") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "istanbul") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "magny-cours") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a") - - # Others - elseif(TARGET_ARCHITECTURE STREQUAL "generic") - list(APPEND 
_march_flag_list "generic") - elseif(TARGET_ARCHITECTURE STREQUAL "native") - list(APPEND _march_flag_list "native") - elseif(TARGET_ARCHITECTURE STREQUAL "none") - # add this clause to remove it from the else clause - - else() - message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.") - endif() - - # Special treatment for "native" - if(TARGET_ARCHITECTURE STREQUAL "native") - if(MSVC) - # MSVC (on Windows) - message(FATAL_ERROR, "MSVC does not support \"native\" flag.") - elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel" - OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") - if(WIN32) - # Intel (on Windows) - AddCompilerFlag("/QxHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - else() - # Intel (on Linux) - AddCompilerFlag("-xHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() - elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC" - OR CMAKE_CXX_COMPILER_ID MATCHES "PGI") - # NVidia HPC / PGI (on Linux/Windows - AddCompilerFlag("-tp=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(CMAKE_CXX_COMPILER_ID MATCHES "SunPro") - # Sun/Oracle Studio (on Linux/Sun OS) - AddCompilerFlag("-native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") - # Cray (on Linux) - message(FATAL_ERROR, "Cray compiler does not support \"native\" flag.") - else() - # Others: GNU, Clang and variants - AddCompilerFlag("-march=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() - - # Apply architecture flags - elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") - - # Disable "broken" features based on OFA_xxx_INTRINSICS_BROKEN options - set(_disable_vector_unit_list) - set(_enable_vector_unit_list) - - if(DEFINED OFA_SSE_INTRINSICS_BROKEN AND OFA_SSE_INTRINSICS_BROKEN) - message(STATUS "SSE disabled because of old/broken toolchain") - set(_sse_broken true) - set(_avx_broken true) - set(_avx2_broken true) - set(_avx512_broken true) - set(_fma4_broken true) - set(_xop_broken true) - elseif(DEFINED 
OFA_AVX_INTRINSICS_BROKEN AND OFA_AVX_INTRINSICS_BROKEN) - message(STATUS "AVX disabled because of old/broken toolchain") - set(_sse_broken false) - set(_avx_broken true) - set(_avx2_broken true) - set(_avx512_broken true) - set(_fma4_broken true) - set(_xop_broken true) - else() - set(_sse_broken false) - set(_avx_broken false) - if(DEFINED OFA_FMA4_INTRINSICS_BROKEN AND OFA_FMA4_INTRINSICS_BROKEN) - message(STATUS "FMA4 disabled because of old/broken toolchain") - set(_fma4_broken true) - else() - set(_fma4_broken false) - endif() - if(DEFINED OFA_XOP_INTRINSICS_BROKEN AND OFA_XOP_INTRINSICS_BROKEN) - message(STATUS "XOP disabled because of old/broken toolchain") - set(_xop_broken true) - else() - set(_xop_broken false) - endif() - if(DEFINED OFA_AVX2_INTRINSICS_BROKEN AND OFA_AVX2_INTRINSICS_BROKEN) - message(STATUS "AVX2 disabled because of old/broken toolchain") - set(_avx2_broken true) - else() - set(_avx2_broken false) - endif() - if(DEFINED OFA_AVX512_INTRINSICS_BROKEN AND OFA_AVX512_INTRINSICS_BROKEN) - message(STATUS "AVX512 disabled because of old/broken toolchain") - set(_avx512_broken true) - else() - set(_avx512_broken false) - endif() - endif() - - # Enable/disable macro - macro(_enable_or_disable _name _flag _documentation _broken) - if(_broken) - set(_found false) - else() - _my_find(_available_vector_units_list "${_flag}" _found) - endif() - set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) - mark_as_advanced(USE_${_name}) - if(USE_${_name}) - list(APPEND _enable_vector_unit_list "${_flag}") - else() - list(APPEND _disable_vector_unit_list "${_flag}") - endif() - endmacro() - - # Enable/disable features - _enable_or_disable(AVX "avx" "Use AVX. This will all floating-point vector sizes relative to SSE." _avx_broken) - _enable_or_disable(AVX2 "avx2" "Use AVX2. This will double all of the vector sizes relative to SSE." _avx2_broken) - _enable_or_disable(AVX512BF16 "avx512bf16" "Use AVX512BF16." 
_avx512_broken) - _enable_or_disable(AVX512BITALG "avx512bitalg" "Use AVX512BITALG." _avx512_broken) - _enable_or_disable(AVX512BW "avx512bw" "Use AVX512BW." _avx512_broken) - _enable_or_disable(AVX512CD "avx512cd" "Use AVX512CD." _avx512_broken) - _enable_or_disable(AVX512DQ "avx512dq" "Use AVX512DQ." _avx512_broken) - _enable_or_disable(AVX512ER "avx512er" "Use AVX512ER. This enables exponential and reciprocal instructions." _avx512_broken) - _enable_or_disable(AVX512F "avx512f" "Use AVX512F. This will double all floating-point vector sizes relative to AVX2." _avx512_broken) - _enable_or_disable(AVX512IFMA "avx512ifma" "Use AVX512IFMA." _avx512_broken) - _enable_or_disable(AVX512PF "avx512pf" "Use AVX512PF. This enables prefetch instructions for gathers and scatters." _avx512_broken) - _enable_or_disable(AVX512VBMI "avx512vbmi" "Use AVX512VBMI." _avx512_broken) - _enable_or_disable(AVX512VBMI2 "avx512vbmi2" "Use AVX512VBMI2." _avx512_broken) - _enable_or_disable(AVX512VL "avx512vl" "Use AVX512VL. This enables 128- and 256-bit vector length instructions with EVEX coding (improved write-masking & more vector registers)." _avx2_broken) - _enable_or_disable(AVX512VNNI "avx512vnni" "Use AVX512VNNI." _avx512_broken) - _enable_or_disable(AVX512VP2INTERSECT "avx512vp2intersect" "Use AVX512VP2INTERSECT." _avx512_broken) - _enable_or_disable(AVX512VPOPCNTDQ "avx512vpopcntdq" "Use AVX512VPOPCNTDQ." _avx512_broken) - _enable_or_disable(BMI "bmi2" "Use BMI." _avx_broken) - _enable_or_disable(BMI2 "bmi2" "Use BMI2." _avx_broken) - _enable_or_disable(FMA "fma" "Use FMA." _avx_broken) - _enable_or_disable(FMA4 "fma4" "Use FMA4." _fma4_broken) - _enable_or_disable(SSE2 "sse2" "Use SSE2. If SSE2 instructions are not enabled the SSE implementation will be disabled." _sse_broken) - _enable_or_disable(SSE3 "sse3" "Use SSE3. If SSE3 instructions are not enabled they will be emulated." _sse_broken) - _enable_or_disable(SSE4_1 "sse4.1" "Use SSE4.1. 
If SSE4.1 instructions are not enabled they will be emulated." _sse_broken) - _enable_or_disable(SSE4_2 "sse4.2" "Use SSE4.2. If SSE4.2 instructions are not enabled they will be emulated." _sse_broken) - _enable_or_disable(SSE4a "sse4a" "Use SSE4a. If SSE4a instructions are not enabled they will be emulated." _sse_broken) - _enable_or_disable(SSSE3 "ssse3" "Use SSSE3. If SSSE3 instructions are not enabled they will be emulated." _sse_broken) - _enable_or_disable(XOP "xop" "Use XOP." _xop_broken) - - # Add compiler flags - if(MSVC AND MSVC_VERSION GREATER 1700) - _my_find(_enable_vector_unit_list "avx512f" _found) - if(_found) - AddCompilerFlag("/arch:AVX512" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) - endif() - if(NOT _found) - _my_find(_enable_vector_unit_list "avx2" _found) - if(_found) - AddCompilerFlag("/arch:AVX2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) - endif() - endif() - if(NOT _found) - _my_find(_enable_vector_unit_list "avx" _found) - if(_found) - AddCompilerFlag("/arch:AVX" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) - endif() - endif() - if(NOT _found) - _my_find(_enable_vector_unit_list "sse2" _found) - if(_found) - AddCompilerFlag("/arch:SSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() - endif() - if(NOT _found) - _my_find(_enable_vector_unit_list "sse" _found) - if(_found) - AddCompilerFlag("/arch:SSE" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() - endif() - foreach(_flag ${_enable_vector_unit_list}) - string(TOUPPER "${_flag}" _flag) - string(REPLACE "." 
"_" _flag "__${_flag}__") - add_definitions("-D${_flag}") - endforeach(_flag) - - elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel" - OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") - if(WIN32) - # Intel (on Windows) - set(OFA_map_knl "-QxKNL;-QxMIC-AVX512") - set(OFA_map_knm "-QxKNM;-QxMIC-AVX512") - set(OFA_map_rocketlake "-QxROCKETLAKE;-QxCORE-AVX512") - set(OFA_map_sapphirerapids "-QxSAPPHIRERAPIDS;-QxCORE-AVX512") - set(OFA_map_alderlake "-QxALDERLAKE;-QxCORE-AVX512") - set(OFA_map_tigerlake "-QxTIGERLAKE;-QxCORE-AVX512") - set(OFA_map_icelake-server "-QxICELAKE-SERVER;-QxCORE-AVX512") - set(OFA_map_icelake-avx512 "-QxICELAKE-SERVER;-QxCORE-AVX512") - set(OFA_map_icelake-client "-QxICELAKE-CLIENT;-QxCORE-AVX512") - set(OFA_map_icelake "-QxICELAKE-CLIENT;-QxCORE-AVX512") - set(OFA_map_cannonlake "-QxCANNONLAKE;-QxCORE-AVX512") - set(OFA_map_cooperlake "-QxCOOPERLAKE;-QxCORE-AVX512") - set(OFA_map_cascadelake "-QxCASCADELAKE;-QxCORE-AVX512") - set(OFA_map_skylake-avx512 "-QxSKYLAKE-AVX512;-QxCORE-AVX512") - set(OFA_map_skylake "-QxSKYLAKE;-QxCORE-AVX2") - set(OFA_map_broadwell "-QxBROADWELL;-QxCORE-AVX2") - set(OFA_map_haswell "-QxHASWELL;-QxCORE-AVX2") - set(OFA_map_ivybridge "-QxIVYBRIDGE;-QxCORE-AVX-I") - set(OFA_map_sandybridge "-QxSANDYBRIDGE;-QxAVX") - set(OFA_map_westmere "-QxSSE4.2") - set(OFA_map_nehalem "-QxSSE4.2") - set(OFA_map_penryn "-QxSSSE3") - set(OFA_map_merom "-QxSSSE3") - set(OFA_map_core2 "-QxSSE3") - set(_ok FALSE) - else() - # Intel (in Linux) - set(OFA_map_knl "-xKNL;-xMIC-AVX512") - set(OFA_map_knm "-xKNM;-xMIC-AVX512") - set(OFA_map_rocketlake "-xROCKETLAKE;-xCORE-AVX512") - set(OFA_map_sapphirerapids "-xSAPPHIRERAPIDS;-xCORE-AVX512") - set(OFA_map_alderlake "-xALDERLAKE;-xCORE-AVX512") - set(OFA_map_tigerlake "-xTIGERLAKE;-xCORE-AVX512") - set(OFA_map_icelake-server "-xICELAKE-SERVER;-xCORE-AVX512") - set(OFA_map_icelake-avx512 "-xICELAKE-SERVER;-xCORE-AVX512") - set(OFA_map_icelake-client "-xICELAKE-CLIENT;-xCORE-AVX512") - 
set(OFA_map_icelake "-xICELAKE-CLIENT;-xCORE-AVX512") - set(OFA_map_cannonlake "-xCANNONLAKE;-xCORE-AVX512") - set(OFA_map_cooperlake "-xCOOPERLAKE;-xCORE-AVX512") - set(OFA_map_cascadelake "-xCASCADELAKE;-xCORE-AVX512") - set(OFA_map_skylake-avx512 "-xSKYLAKE-AVX512;-xCORE-AVX512") - set(OFA_map_skylake "-xSKYLAKE;-xCORE-AVX2") - set(OFA_map_broadwell "-xBROADWELL;-xCORE-AVX2") - set(OFA_map_haswell "-xHASWELL;-xCORE-AVX2") - set(OFA_map_ivybridge "-xIVYBRIDGE;-xCORE-AVX-I") - set(OFA_map_sandybridge "-xSANDYBRIDGE;-xAVX") - set(OFA_map_westmere "-xSSE4.2") - set(OFA_map_nehalem "-xSSE4.2") - set(OFA_map_penryn "-xSSSE3") - set(OFA_map_merom "-xSSSE3") - set(OFA_map_core2 "-xSSE3") - set(_ok FALSE) - endif() - foreach(arch ${_march_flag_list}) - if(DEFINED OFA_map_${arch}) - foreach(flag ${OFA_map_${arch}}) - AddCompilerFlag(${flag} CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _ok) - if(_ok) - break() - endif() - endforeach() - if(_ok) - break() - endif() - endif() - endforeach() - if(NOT _ok) - # This is the Intel compiler, so SSE2 is a very reasonable baseline. - message(STATUS "Did not recognize the requested architecture flag, falling back to SSE2") - if(WIN32) - AddCompilerFlag("-QxSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - else() - AddCompilerFlag("-xSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() - endif() - - # TODO PGI/Cray/SunPro ... 
- - else() - # Others: GNU, Clang and variants - - # Set -march flag - foreach(_flag ${_march_flag_list}) - AddCompilerFlag("-march=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - if(_good) - break() - endif(_good) - endforeach(_flag) - - # Set -mfeature flag for enabled features - foreach(_flag ${_enable_vector_unit_list}) - AddCompilerFlag("-m${_flag}" CXX_RESULT _result) - if(_result) - set(_header FALSE) - if(_flag STREQUAL "sse3") - set(_header "pmmintrin.h") - elseif(_flag STREQUAL "ssse3") - set(_header "tmmintrin.h") - elseif(_flag STREQUAL "sse4.1") - set(_header "smmintrin.h") - elseif(_flag STREQUAL "sse4.2") - set(_header "nmmintrin.h") - elseif(_flag STREQUAL "sse4a") - set(_header "ammintrin.h") - elseif(_flag STREQUAL "avx") - set(_header "immintrin.h") - elseif(_flag STREQUAL "avx2") - set(_header "immintrin.h") - elseif(_flag STREQUAL "avx512*") - set(_header "immintrin.h") - elseif(_flag STREQUAL "fma4") - set(_header "x86intrin.h") - elseif(_flag STREQUAL "xop") - set(_header "x86intrin.h") - elseif(_flag STREQUAL "bmi") - set(_header "ammintrin.h") - elseif(_flag STREQUAL "bmi2") - set(_header "ammintrin.h") - elseif(_flag STREQUAL "rdrnd") - set(_header "immintrin.h") - elseif(_flag STREQUAL "rdpid") - set(_header "immintrin.h") - elseif(_flag STREQUAL "rdseed") - set(_header "immintrin.h") - endif() - set(_resultVar "HAVE_${_header}") - string(REPLACE "." "_" _resultVar "${_resultVar}") - if(_header) - CHECK_INCLUDE_FILE_CXX("${_header}" ${_resultVar} "-m${_flag}") - if(NOT ${_resultVar}) - set(_useVar "USE_${_flag}") - string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "." 
"_" _useVar "${_useVar}") - message(STATUS "disabling ${_useVar} because ${_header} is missing") - set(${_useVar} FALSE) - list(APPEND _disable_vector_unit_list "${_flag}") - endif() - endif() - if(NOT _header OR ${_resultVar}) - list(APPEND OFA_ARCHITECTURE_FLAGS "-m${_flag}") - endif() - endif() - endforeach(_flag) - - # Set -mno-feature flag for disabled features - foreach(_flag ${_disable_vector_unit_list}) - AddCompilerFlag("-mno-${_flag}" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endforeach(_flag) - endif() - endif() - - # Compile code with profiling instrumentation - if(TARGET_PROFILER STREQUAL "gprof") - AddCompilerFlag("-pg" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(TARGET_PROFILER STREQUAL "vtune") - if (CMAKE_CXX_COMPILER_ID MATCHES "Intel") - # Need to check if this also works on Windows - AddCompilerFlag("-g" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-debug inline-debug-info" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-D TBB_USE_THREADING_TOOLS" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-parallel-source-info=2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-gline-tables-only" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-fdebug-info-for-profiling" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-Xsprofile" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() - endif() -endmacro(OFA_HandleX86Options) - -macro(OFA_HandleArmOptions) - set(_march_flag_list) - set(_mtune_flag_list) - set(_available_vector_units_list) - - # ARM - if(TARGET_ARCHITECTURE STREQUAL "strongarm") - list(APPEND _mtune_flag_list "strongarm") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "arm8") - list(APPEND _mtune_flag_list "arm8") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "arm810") - list(APPEND _mtune_flag_list "arm810") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "fa526") - list(APPEND _mtune_flag_list "fa526") - list(APPEND _march_flag_list 
"armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "fa626") - list(APPEND _mtune_flag_list "fa626") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "arm7tdmi") - list(APPEND _mtune_flag_list "arm7tdmi") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm7tdmi-s") - list(APPEND _mtune_flag_list "arm7tdmi-s") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm710t") - list(APPEND _mtune_flag_list "arm710t") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm720t") - list(APPEND _mtune_flag_list "arm720t") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm740t") - list(APPEND _mtune_flag_list "arm740t") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm9") - list(APPEND _mtune_flag_list "arm9") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm9tdmi") - list(APPEND _mtune_flag_list "arm9tdmi") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm920") - list(APPEND _mtune_flag_list "arm920") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm920t") - list(APPEND _mtune_flag_list "arm920t") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm922t") - list(APPEND _mtune_flag_list "arm922t") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm940t") - list(APPEND _mtune_flag_list "arm940t") - list(APPEND _march_flag_list "armv4t") - - elseif(TARGET_ARCHITECTURE STREQUAL "arm1020t") - list(APPEND _mtune_flag_list "arm1020t") - list(APPEND _march_flag_list "armv5t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm10tdmi") - list(APPEND _mtune_flag_list "arm10tdmi") - list(APPEND _march_flag_list "armv5t") - - elseif(TARGET_ARCHITECTURE STREQUAL "arm9e") - list(APPEND _mtune_flag_list "arm9e") - list(APPEND _march_flag_list "armv5te") - 
list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm946e-s") - list(APPEND _mtune_flag_list "arm946e-s") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm966e-s") - list(APPEND _mtune_flag_list "arm966e-s") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm968e-s") - list(APPEND _mtune_flag_list "arm968e-s") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm10e") - list(APPEND _mtune_flag_list "arm10e") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1020e") - list(APPEND _mtune_flag_list "arm1020e") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1022e") - list(APPEND _mtune_flag_list "arm1022e") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "xscale") - list(APPEND _mtune_flag_list "xscale") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "iwmmxt") - list(APPEND _mtune_flag_list "iwmmxt") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "iwmmxt2") - list(APPEND _mtune_flag_list "iwmmxt2") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "fa606te") - list(APPEND _mtune_flag_list "fa606te") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "fa626te") - list(APPEND _mtune_flag_list "fa626te") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "fmp626") - list(APPEND _mtune_flag_list "fmp626") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "fa726te") - 
list(APPEND _mtune_flag_list "fa726te") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "arm926ej-s") - list(APPEND _mtune_flag_list "arm926ej-s") - list(APPEND _march_flag_list "armv5tej") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1026ej-s") - list(APPEND _mtune_flag_list "arm1026ej-s") - list(APPEND _march_flag_list "armv5tej") - list(APPEND _available_vector_units_list "fp") - - elseif(TARGET_ARCHITECTURE STREQUAL "mpcore") - list(APPEND _mtune_flag_list "mpcore") - list(APPEND _march_flag_list "armv6k") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1136j-s") - list(APPEND _mtune_flag_list "arm1136j-s") - list(APPEND _march_flag_list "armv6j") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1136jf-s") - list(APPEND _mtune_flag_list "arm1136jf-s") - list(APPEND _march_flag_list "armv6j") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1156t2-s") - list(APPEND _mtune_flag_list "arm1156t2-s") - list(APPEND _march_flag_list "armv6t2") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1156t2f-s") - list(APPEND _mtune_flag_list "arm1156t2f-s") - list(APPEND _march_flag_list "armv6t2") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1176jz-s") - list(APPEND _mtune_flag_list "arm1176jz-s") - list(APPEND _march_flag_list "armv6kz") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1176jzf-s") - list(APPEND _mtune_flag_list "arm1176jzf-s") - list(APPEND _march_flag_list "armv6kz") - list(APPEND _available_vector_units_list "fp") - - elseif(TARGET_ARCHITECTURE STREQUAL "generic-armv7-a") - list(APPEND _mtune_flag_list "generic-armv7-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "mp" "sec" "vfpv3-d16" "vfpv3" "vfpv3-d16-fp16" "vfpv3-fp16" "vfpv4-d16" "vfpv4" "simd" "neon-fp16" "neon-vfpv4") - - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78") - 
list(APPEND _mtune_flag_list "cortex-a78") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a5") - list(APPEND _mtune_flag_list "cortex-a5") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "mp" "sec" "neon-fp16") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a7") - list(APPEND _mtune_flag_list "cortex-a7") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a8") - list(APPEND _mtune_flag_list "cortex-a8") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "sec" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a9") - list(APPEND _mtune_flag_list "cortex-a9") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "mp" "sec" "neon-fp16") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a12") - list(APPEND _mtune_flag_list "cortex-a12") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a15") - list(APPEND _mtune_flag_list "cortex-a15") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a15.cortex-a7") - list(APPEND _mtune_flag_list "cortex-a15.cortex-a7") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a17") - list(APPEND _mtune_flag_list "cortex-a17") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a17.cortex-a7") - list(APPEND _mtune_flag_list "cortex-a17.cortex-a7") - list(APPEND _march_flag_list 
"armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a32") - list(APPEND _mtune_flag_list "cortex-a32") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a34") - list(APPEND _mtune_flag_list "cortex-a34") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a35") - list(APPEND _mtune_flag_list "cortex-a35") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a53") - list(APPEND _mtune_flag_list "cortex-a53") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a55") - list(APPEND _mtune_flag_list "cortex-a55") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a57") - list(APPEND _mtune_flag_list "cortex-a57") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a57.cortext-a53") - list(APPEND _mtune_flag_list "cortex-a57.cortext-a53") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a72") - list(APPEND _mtune_flag_list "cortex-a72") - list(APPEND _march_flag_list 
"armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a72.cortext-a53") - list(APPEND _mtune_flag_list "cortex-a72.cortext-a53") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73") - list(APPEND _mtune_flag_list "cortex-a73") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortext-a35") - list(APPEND _mtune_flag_list "cortex-a73.cortext-a35") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortext-a53") - list(APPEND _mtune_flag_list "cortex-a73.cortext-a53") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a75") - list(APPEND _mtune_flag_list "cortex-a75") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a75.cortext-a55") - list(APPEND _mtune_flag_list "cortex-a75.cortext-a55") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76") - list(APPEND _mtune_flag_list "cortex-a76") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND 
_march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76.cortext-a55") - list(APPEND _mtune_flag_list "cortex-a76.cortext-a55") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76ae") - list(APPEND _mtune_flag_list "cortex-a76ae") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a77") - list(APPEND _mtune_flag_list "cortex-a77") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78") - list(APPEND _mtune_flag_list "cortex-a78") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78ae") - list(APPEND _mtune_flag_list "cortex-a78ae") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78c") - list(APPEND _mtune_flag_list "cortex-a78c") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") 
- list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a710") - list(APPEND _mtune_flag_list "cortex-a510") - list(APPEND _march_flag_list "armv9-a") - list(APPEND _march_flag_list "armv8.6-a") - list(APPEND _march_flag_list "armv8.5-a") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a710") - list(APPEND _mtune_flag_list "cortex-a710") - list(APPEND _march_flag_list "armv9-a") - list(APPEND _march_flag_list "armv8.6-a") - list(APPEND _march_flag_list "armv8.5-a") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") - - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m0") - list(APPEND _mtune_flag_list "cortex-m0") - list(APPEND _march_flag_list "armv6s-m") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m0plus") - list(APPEND _mtune_flag_list "cortex-m0plus") - list(APPEND _march_flag_list "armv6s-m") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m1") - list(APPEND _mtune_flag_list "cortex-m1") - list(APPEND _march_flag_list "armv6s-m") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m3") - list(APPEND _mtune_flag_list "cortex-m3") - list(APPEND _march_flag_list "armv7-m") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m4") - list(APPEND _mtune_flag_list "cortex-m4") - list(APPEND _march_flag_list "armv7e-m") - list(APPEND _available_vector_units_list 
"fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m7") - list(APPEND _mtune_flag_list "cortex-m7") - list(APPEND _march_flag_list "armv7e-m") - list(APPEND _available_vector_units_list "fp.dp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m23") - list(APPEND _mtune_flag_list "cortex-m23") - list(APPEND _march_flag_list "armv8-m.base") - list(APPEND _march_flag_list "armv7-m") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m33") - list(APPEND _mtune_flag_list "cortex-m33") - list(APPEND _march_flag_list "armv8-m.main") - list(APPEND _march_flag_list "armv7-m") - list(APPEND _available_vector_units_list "dsp" "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m35p") - list(APPEND _mtune_flag_list "cortex-m35p") - list(APPEND _march_flag_list "armv8-m.main") - list(APPEND _march_flag_list "armv7-m") - list(APPEND _available_vector_units_list "dsp" "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m55") - list(APPEND _mtune_flag_list "cortex-m55") - list(APPEND _march_flag_list "armv8.1-m.main") - list(APPEND _march_flag_list "armv8-m") - list(APPEND _march_flag_list "armv7-m") - list(APPEND _available_vector_units_list "mve.fp" "fp.dp") - - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4") - list(APPEND _mtune_flag_list "cortex-r4") - list(APPEND _march_flag_list "armv7-r") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4f") - list(APPEND _mtune_flag_list "cortex-r4f") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r5") - list(APPEND _mtune_flag_list "cortex-r5") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "idiv" "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r7") - list(APPEND _mtune_flag_list "cortex-r7") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "idiv" "vfpv3-d16-fp16") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r8") - list(APPEND _mtune_flag_list "cortex-r8") - list(APPEND 
_march_flag_list "armv7-r") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r52") - list(APPEND _mtune_flag_list "cortex-r52") - list(APPEND _march_flag_list "armv8-r") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "crc" "simd" "idiv" "vfpv3-d16-fp16") - - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-x1") - list(APPEND _mtune_flag_list "cortex-x1") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-x2") - list(APPEND _march_flag_list "armv9-a") - list(APPEND _march_flag_list "armv8.6-a") - list(APPEND _march_flag_list "armv8.5-a") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") - - elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-e1") - list(APPEND _mtune_flag_list "neoverse-e1") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n1") - list(APPEND _mtune_flag_list "neoverse-n1") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n2") - list(APPEND _mtune_flag_list "neoverse-n2") - list(APPEND _march_flag_list "armv8.5-a") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND 
_march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") - elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-v1") - list(APPEND _mtune_flag_list "neoverse-v1") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") - - # Broadcom - elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b15") - list(APPEND _mtune_flag_list "brahma-b15") - elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b53") - list(APPEND _mtune_flag_list "brahma-b53") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderx2") - list(APPEND _mtune_flag_list "thunderx2") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crypto") - - # Cavium - elseif(TARGET_ARCHITECTURE STREQUAL "thunderx") - list(APPEND _mtune_flag_list "thunderx") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt88") - list(APPEND _mtune_flag_list "thunderxt88") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt81") - list(APPEND _mtune_flag_list "thunderxt81") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt83") - list(APPEND _mtune_flag_list 
"thunderxt83") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderx2t99") - list(APPEND _mtune_flag_list "thunderx2t99") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto") - - # DEC - elseif(TARGET_ARCHITECTURE STREQUAL "strongarm110") - list(APPEND _mtune_flag_list "strongarm110") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "strongarm1100") - list(APPEND _mtune_flag_list "strongarm1100") - list(APPEND _march_flag_list "armv4") - - # FUJITSU - elseif(TARGET_ARCHITECTURE STREQUAL "a64fx") - list(APPEND _mtune_flag_list "a64fx") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "sve") - - # HiSilicon - elseif(TARGET_ARCHITECTURE STREQUAL "tsv110") - list(APPEND _mtune_flag_list "tsv110") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "aes" "crypto" "fp16" "sha2") - - # Nvidia - elseif(TARGET_ARCHITECTURE STREQUAL "denver") - list(APPEND _mtune_flag_list "denver") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - elseif(TARGET_ARCHITECTURE STREQUAL "denver2") - list(APPEND _mtune_flag_list "denver2") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - elseif(TARGET_ARCHITECTURE STREQUAL "carmel") - 
list(APPEND _mtune_flag_list "denver") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - - # APM - elseif(TARGET_ARCHITECTURE STREQUAL "xgene1") - list(APPEND _mtune_flag_list "xgene1") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - - # Qualcomm - elseif(TARGET_ARCHITECTURE STREQUAL "scorpion") - list(APPEND _mtune_flag_list "scorpion") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "krait") - list(APPEND _mtune_flag_list "krait") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "kryo") - list(APPEND _mtune_flag_list "kryo") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "kryo2") - list(APPEND _mtune_flag_list "kryo2") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "falkor") - list(APPEND _mtune_flag_list "falkor") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "saphira") - list(APPEND _mtune_flag_list "saphira") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - - # Samsung - elseif(TARGET_ARCHITECTURE STREQUAL "exynos-m1") - list(APPEND _mtune_flag_list "exynos-m1") - list(APPEND _march_flag_list "armv8-a") - list(APPEND 
_march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crypto" "simd") - - # Marvell - elseif(TARGET_ARCHITECTURE STREQUAL "marvell-f") - list(APPEND _mtune_flag_list "marvell-f") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "marvell-pj4") - list(APPEND _mtune_flag_list "marvell-pj4") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "mp" "sec" "fp") - - # Intel - elseif(TARGET_ARCHITECTURE STREQUAL "i80200") - list(APPEND _mtune_flag_list "i80200") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa250a") - list(APPEND _mtune_flag_list "pxa250a") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa210a") - list(APPEND _mtune_flag_list "pxa210a") - elseif(TARGET_ARCHITECTURE STREQUAL "i80321-400") - list(APPEND _mtune_flag_list "i80321-400") - elseif(TARGET_ARCHITECTURE STREQUAL "i80321-600") - list(APPEND _mtune_flag_list "i80321-600") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa250b") - list(APPEND _mtune_flag_list "pxa250b") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa210b") - list(APPEND _mtune_flag_list "pxa210b") - elseif(TARGET_ARCHITECTURE STREQUAL "i80321-400-b0") - list(APPEND _mtune_flag_list "i80321-400-b0") - elseif(TARGET_ARCHITECTURE STREQUAL "i80321-600-b0") - list(APPEND _mtune_flag_list "i80321-600-b0") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa250c") - list(APPEND _mtune_flag_list "pxa250c") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa210c") - list(APPEND _mtune_flag_list "pxa210c") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa27x") - list(APPEND _mtune_flag_list "pxa27x") - elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-533") - list(APPEND _mtune_flag_list "ipx425-533") - elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-400") - list(APPEND _mtune_flag_list "ipx425-400") - elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-266") - list(APPEND _mtune_flag_list "ipx425-266") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa32x") - list(APPEND _mtune_flag_list "pxa32x") - elseif(TARGET_ARCHITECTURE STREQUAL 
"pxa930") - list(APPEND _mtune_flag_list "pxa930") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa30x") - list(APPEND _mtune_flag_list "pxa30x") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa31x") - list(APPEND _mtune_flag_list "pxa31x") - elseif(TARGET_ARCHITECTURE STREQUAL "sa1110") - list(APPEND _mtune_flag_list "sa1110") - elseif(TARGET_ARCHITECTURE STREQUAL "ipx1200") - list(APPEND _mtune_flag_list "ipx1200") - - # Apple - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a6") - list(APPEND _mtune_flag_list "apple-a6") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a7") - list(APPEND _mtune_flag_list "apple-a7") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a8") - list(APPEND _mtune_flag_list "apple-a8") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a9") - list(APPEND _mtune_flag_list "apple-a9") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a10") - list(APPEND _mtune_flag_list "apple-a10") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "neon" "rdm" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a11") - list(APPEND _mtune_flag_list "apple-a11") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "lse" "neon" "ras" "rdm" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a12") - list(APPEND _mtune_flag_list "apple-a12") - list(APPEND _march_flag_list "armv8.3-a") - 
list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a13") - list(APPEND _mtune_flag_list "apple-a13") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "dotprod" "fp" "fp16" "fp16fml" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "sha3" "sm4" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-m1") - list(APPEND _mtune_flag_list "vortex") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") - - # Others - elseif(TARGET_ARCHITECTURE STREQUAL "generic") - list(APPEND _march_flag_list "generic") - elseif(TARGET_ARCHITECTURE STREQUAL "native") - list(APPEND _march_flag_list "native") - elseif(TARGET_ARCHITECTURE STREQUAL "none") - # add this clause to remove it from the else clause - - else() - message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". 
Please set TARGET_ARCHITECTURE to a supported value.") - endif() - - # Special treatment for "native" - if(TARGET_ARCHITECTURE STREQUAL "native") - if(MSVC) - # MSVC (on Windows) - message(FATAL_ERROR, "MSVC does not support \"native\" flag.") - elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC" - OR CMAKE_CXX_COMPILER_ID MATCHES "PGI") - # NVidia HPC / PGI (on Linux/Windows - AddCompilerFlag("-tp=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") - # Cray (on Linux) - message(FATAL_ERROR, "Cray compiler does not support \"native\" flag.") - else() - # Others: GNU, Clang and variants - AddCompilerFlag("-mcpu=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() - - # Apply architecture flags - elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") - - # Disable "broken" features based on OFA_xxx_INTRINSICS_BROKEN options - set(_disable_vector_unit_list) - set(_enable_vector_unit_list) - - # TODO: Add OFA_xxx_INTRINSICS_BROKEN rules - set(_aes_broken false) - set(_bf16_broken false) - set(_crc_broken false) - set(_crypto_broken false) - set(_dotprod_broken false) - set(_dsp_broken false) - set(_fp16_broken false) - set(_fp16fml_broken false) - set(_fp_broken false) - set(_fp_dp_broken false) - set(_fp_sp_broken false) - set(_i8mm_broken false) - set(_idiv_broken false) - set(_lse_broken false) - set(_mve_broken false) - set(_mve_fp_broken false) - set(_neon_broken false) - set(_neon_fp16_broken false) - set(_neon_vfpv4_broken false) - set(_ras_broken false) - set(_rcpc_broken false) - set(_rdm_broken false) - set(_rdma_broken false) - set(_sec_broken false) - set(_sha2_broken false) - set(_sha3_broken false) - set(_simd_broken false) - set(_sm4_broken false) - set(_sve_broken false) - set(_vfpv3_broken false) - set(_vfpv3_d16_broken false) - set(_vfpv3_d16_fp16_broken false) - set(_vfpv3_fp16_broken false) - set(_vfpv4_broken false) - set(_vfpv4_d16_broken false) - set(_zcm_broken false) - set(_zcz_broken false) - - # Enable/disable macro - 
macro(_enable_or_disable _name _flag _documentation _broken) - if(_broken) - set(_found false) - else() - _my_find(_available_vector_units_list "${_flag}" _found) - endif() - set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) - mark_as_advanced(USE_${_name}) - if(USE_${_name}) - list(APPEND _enable_vector_unit_list "${_flag}") - else() - list(APPEND _disable_vector_unit_list "${_flag}") - endif() - endmacro() - - # Enable/disable features - _enable_or_disable(AES "aes" "Use AES. This will enable the aes and pmull crypto extension." _aes_broken) - _enable_or_disable(BF16 "bf16" "Use BF16. This will enable the brain half-precision floating-point instructions." _bf16_broken) - _enable_or_disable(CRC "crc" "Use CRC. This will enable the Cyclic Redundancy Check (CRC) instructions." _crc_broken) - _enable_or_disable(CRYPTO "crypto" "Use CRYPTO. This will enable the cryptographic instructions." _crypto_broken) - _enable_or_disable(DOTPROD "dotprod" "Use DOTPROD. This will enable the Dot Product extension. This also enables Advanced SIMD instructions." _dotprod_broken) - _enable_or_disable(DSP "dsp" "Use DSP. This will enable the DSP instruction." _dsp_broken) - _enable_or_disable(FP "fp" "Use FP. This will enable the floating-point data processing instructions." _fp_broken) - _enable_or_disable(FP16 "fp16" "Use FP16. This will enable the half-precision floating-point data processing instructions." _fp16_broken) - _enable_or_disable(FP16FML "fp16fml" "Use FP16FML. This will enable the FP16 fmla extension." _fp16fml_broken) - _enable_or_disable(FP_DP "fp.dp" "Use FP.DP. This will enable the single- and double-precision floating-point instructions." _fp_dp_broken) - _enable_or_disable(FP_SP "fp.sp" "Use FP.SP. This will enable the single-precision floating-point instructions." _fp_sp_broken) - _enable_or_disable(I8MM "i8mm" "Use I8MM. This will enable the 8-bit Integer Matrix Multiply instructions." _i8mm_broken) - _enable_or_disable(IDIV "idiv" "Use IDIV. 
This will enable the ARM-state integer division instructions." _idiv_broken) - _enable_or_disable(LSE "lse" "Use LSE. This will enable the Large System Extension instructions." _lse_broken) - _enable_or_disable(MVE "mve" "Use MVE. This will enable the M-Profile Vector Extension (MVE) integer instructions." _mve_broken) - _enable_or_disable(MVE_FP "mve.fp" "Use MVE.FP. This will enable the M-Profile Vector Extension (MVE) integer and single precision floating-point instructions." _mve_fp_broken) - _enable_or_disable(NEON "neon" "Use NEON. This will enable the Advanced SIMD (Neon) v1." _neon_broken) - _enable_or_disable(NEON_FP16 "neon-fp16" "Use NEON-FP16. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions, with the half-precision floating-point conversion operations." _neon_fp16_broken) - _enable_or_disable(NEON_VFPV4 "neon-vfpv4" "Use NEON-VFPV4. This will enable the Advanced SIMD (Neon) v2 and the VFPv4 floating-point instructions." _neon_vfpv4_broken) - _enable_or_disable(RAS "ras" "Use RAS. This will enable the Reliability, Availability, and Serviceability extension." _ras_broken) - _enable_or_disable(RCPC "rcpc" "Use RCPC. This will enable the RcPc extension." _rcpc_broken) - _enable_or_disable(RDM "rdm" "Use RDM. This will enable the RDM extension." _rdm_broken) - _enable_or_disable(RDMA "rdma" "Use RDMA. This will enable the Round Double Multiply Accumulate instructions." _rdma_broken) - _enable_or_disable(SEC "sec" "Use SEC. This will enable the security extension." _sec_broken) - _enable_or_disable(SHA2 "sha2" "Use SHA2. This will enable the sha2 crypto extension." _sha2_broken) - _enable_or_disable(SHA3 "sha3" "Use SHA3. This will enable the sha512 and sha3 crypto extension." _sha3_broken) - _enable_or_disable(SIMD "simd" "Use SIMD. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions." _simd_broken) - _enable_or_disable(SM4 "sm4" "Use SM4. 
This will enable the the sm3 and sm4 crypto extension." _sm4_broken) - _enable_or_disable(SVE "sve" "Use SVE. This will enable the Scalable Vector Extension instructions." _sve_broken) - _enable_or_disable(VFPV3 "vfpv3" "Use VPFV3. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers." _vfpv3_broken) - _enable_or_disable(VFPV3_D16 "vfpv3-d16" "Use VPFV3-16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_broken) - _enable_or_disable(VFPV3_D16_FP16 "vfpv3-d16-fp16" "Use VPFV3-D16-FP16. This will enable VFPv3 floating-point instructions, with 16 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_fp16_broken) - _enable_or_disable(VFPV3_FP16 "vfpv3-fp16" "Use VPFV3-FP16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_fp16_broken) - _enable_or_disable(VFPV4 "vfpv4" "Use VPFV4. This will enable the VFPv4 floating-point instructions, with 32 double-precision registers." _vfpv4_broken) - _enable_or_disable(VFPV4_D16 "vfpv4-d16" "Use VPFV4-D16. This will enable the VFPv4 floating-point instructions, with 16 double-precision registers." _vfpv4_dp16_broken) - _enable_or_disable(ZCM "zcm" "Use ZCM. This will enable the ZCM extension." _zcm_broken) - _enable_or_disable(ZCZ "zcz" "Use ZCZ. This will enable the ZCZ extension." 
_zcz_broken) - - # Add compiler flags - if(MSVC AND MSVC_VERSION GREATER 1900) - _my_find(_enable_vector_unit_list "vfpv4" _found) - if(_found) - AddCompilerFlag("/arch:VFPv4" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) - endif() - if(NOT _found) - _my_find(_enable_vector_unit_list "simd" _found) - if(_found) - AddCompilerFlag("/arch:ARMv7VE" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) - endif() - endif() - foreach(_flag ${_enable_vector_unit_list}) - string(TOUPPER "${_flag}" _flag) - string(REPLACE "." "_" _flag "__${_flag}__") - add_definitions("-D${_flag}") - endforeach(_flag) - - elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") - - # TODO: Add Cray flags - - elseif(CMAKE_CXX_COMPILER_ID MATCHES "Fujitsu") - - # TODO: Add Fujitsu flags - - elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC") - - # TODO: Add NVHPC flags - - else() - # Others: GNU, Clang and variants - - # Following the recommendation from - # https://community.arm.com/developer/tools-software/tools/b/tools-software-ides-blog/posts/compiler-flags-across-architectures-march-mtune-and-mcpu - # we first try to only use the -mcpu flag. If that fails, e.g., if - # the compiler does not yet support the specified target, we try to - # set the -march and -mtune flags as fallback option. 
- foreach(_flag ${_mtune_flag_list}) - AddCompilerFlag("-mcpu=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - if(_good) - break() - endif(_good) - endforeach(_flag) - - # Fallback: set -march, -mtune flags - if(NOT _good) - # Set -march flag - foreach(_march ${_march_flag_list}) - AddCompilerFlag("-march=${_march}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) - if(_good) - set(_march_plus_extensions "${_march}") - foreach(_flag ${_enable_vector_unit_list}) - AddCompilerFlag("-march=${_march_plus_extensions}+${_flag}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) - if(_good) - set(_march_plus_extensions "${_march_plus_extensions}+${_flag}") - endif(_good) - endforeach(_flag) - AddCompilerFlag("-march=${_march_plus_extensions}" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - break() - endif(_good) - endforeach(_march) - - # Set -mtune flag - foreach(_mtune ${_mtune_flag_list}) - AddCompilerFlag("-mtune=${_mtune}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - if(_good) - break() - endif(_good) - endforeach(_mtune) - endif(NOT _good) - - # Note that ARM does not support -mfeature and -mno-feature to - # enable and disable specific features. Hence, there are no - # loops over the _enable_vector_unit_list and - # _disable_vector_unit_list lists here(!) 
- endif() - endif() - - # Compile code with profiling instrumentation - if(TARGET_PROFILER STREQUAL "gprof") - AddCompilerFlag("-pg" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() -endmacro(OFA_HandleArmOptions) - -macro(OFA_HandlePpcOptions) - set(_march_flag_list) - set(_available_vector_units_list) - - # Define macros for PowerPC64 - macro(_power3) - list(APPEND _march_flag_list "power3") - endmacro() - macro(_power4) - list(APPEND _march_flag_list "power4") - _power3() - endmacro() - macro(_power5) - list(APPEND _march_flag_list "power5") - _power4() - endmacro() - macro(_power5plus) - list(APPEND _march_flag_list "power5+") - _power5() - endmacro() - macro(_power6) - list(APPEND _march_flag_list "power6") - _power5() - endmacro() - macro(_power6x) - list(APPEND _march_flag_list "power6x") - _power6() - endmacro() - macro(_power7) - list(APPEND _march_flag_list "power7") - _power6() - endmacro() - macro(_power8) - list(APPEND _march_flag_list "power8") - list(APPEND _march_flag_list "pwr8") - _power7() - endmacro() - macro(_power9) - list(APPEND _march_flag_list "power9") - list(APPEND _march_flag_list "pwr9") - _power8() - endmacro() - macro(_power10) - list(APPEND _march_flag_list "power10") - list(APPEND _march_flag_list "pwr10") - _power9() - endmacro() - - # PowerPC64 - if(TARGET_ARCHITECTURE STREQUAL "power3") - _power3() - elseif(TARGET_ARCHITECTURE STREQUAL "power4") - _power4() - elseif(TARGET_ARCHITECTURE STREQUAL "power5") - _power5() - elseif(TARGET_ARCHITECTURE STREQUAL "power5+") - _power5plus() - elseif(TARGET_ARCHITECTURE STREQUAL "power6") - _power6() - elseif(TARGET_ARCHITECTURE STREQUAL "power6x") - _power6x() - elseif(TARGET_ARCHITECTURE STREQUAL "power7") - _power7() - elseif(TARGET_ARCHITECTURE STREQUAL "power8") - _power8() - elseif(TARGET_ARCHITECTURE STREQUAL "power9") - _power9() - elseif(TARGET_ARCHITECTURE STREQUAL "power10") - _power10() - - # Others - elseif(TARGET_ARCHITECTURE STREQUAL "generic") - list(APPEND _march_flag_list 
"generic") - elseif(TARGET_ARCHITECTURE STREQUAL "native") - list(APPEND _march_flag_list "native") - elseif(TARGET_ARCHITECTURE STREQUAL "none") - # add this clause to remove it from the else clause - - else() - message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.") - endif() - - # Special treatment for "native" - if(TARGET_ARCHITECTURE STREQUAL "native") - - # Apply architecture flags - elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") - - # Disable "broken" features based on OFA_xxx_INTRINSICS_BROKEN options - set(_disable_vector_unit_list) - set(_enable_vector_unit_list) - - # Enable/disable macro - macro(_enable_or_disable _name _flag _documentation _broken) - if(_broken) - set(_found false) - else() - _my_find(_available_vector_units_list "${_flag}" _found) - endif() - set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) - mark_as_advanced(USE_${_name}) - if(USE_${_name}) - list(APPEND _enable_vector_unit_list "${_flag}") - else() - list(APPEND _disable_vector_unit_list "${_flag}") - endif() - endmacro() - - # Enable/disable features - - # Add compiler flags - if(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC") - - elseif(CMAKE_CXX_COMPILER_ID MATCHES "XL") - - else() - # Others: GNU, Clang and variants - - - endif() - endif() -endmacro(OFA_HandlePpcOptions) - macro(OptimizeForArchitecture) + message(STATUS "Optimizing for target architecture") if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "i686.*|i386.*|x86.*|amd64.*|x86_64.*|AMD64.*") - set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. Setting the value to \"native\" bypasses all checks and uses \"-march=native\" or the compiler equivalent flag. 
Other supported values are: \"none\", \"generic\", \"core\", \"merom\" (65nm Core2), \"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandybridge\", \"ivybridge\", \"haswell\", \"broadwell\", \"skylake\", \"skylake-xeon\", \"kabylake\", \"cannonlake\", \"cascadelake\", \"cooperlake\", \"icelake\", \"icelake-xeon\", \"tigerlake\", \"alderlake\", \"sapphirerapids\", \"bonnell\", \"silvermont\", \"goldmont\", \"goldmont-plus\", \"tremont\", \"knl\" (Knights Landing), \"knm\" (Knights Mill), \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\", \"piledriver\", \"steamroller\", \"excavator\", \"amd14h\", \"amd16h\", \"zen\", \"zen2\", \"zen3\"." ) + set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. Setting the value to \"native\" bypasses all checks and uses \"-march=native\" or the compiler equivalent flag. Other supported values are: \"none\", \"generic\", \"core\", \"core2\", \"merom\" (65nm Core2), \"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandybridge\", \"ivybridge\", \"haswell\", \"broadwell\", \"skylake\", \"skylake-xeon\", \"kabylake\", \"cannonlake\", \"cascadelake\", \"cooperlake\", \"icelake\", \"icelake-xeon\", \"tigerlake\", \"alderlake\", \"sapphirerapids\", \"bonnell\", \"silvermont\", \"goldmont\", \"goldmont-plus\", \"tremont\", \"knl\" (Knights Landing), \"knm\" (Knights Mill), \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\", \"piledriver\", \"steamroller\", \"excavator\", \"amd14h\", \"amd16h\", \"zen\", \"zen2\", \"zen3\"." ) elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(arm.*|ARM.*|aarch64.*|AARCH64.*)") set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. 
Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. Setting the value to \"native\" bypasses all checks and uses \"-march=native\" or the compiler equivalent flag. Other supported values are: \"none\", \"generic\", \"a64fx\", \"apple-a6\", \"apple-a7\", \"apple-a8\", \"apple-a9\", \"apple-a10\", \"apple-a11\", \"apple-a12\", \"apple-a13\", \"apple-m1\", \"arm1020e\", \"arm1020t\", \"arm1022e\", \"arm1026ej-s\", \"arm10e\", \"arm10tdmi\", \"arm1136j-s\", \"arm1136jf-s\", \"arm1156t2-s\", \"arm1156t2f-s\", \"arm1176jz-s\", \"arm1176jzf-s\", \"arm710t\", \"arm720t\", \"arm740t\", \"arm7tdmi-s\", \"arm7tdmi\", \"arm810\", \"arm8\", \"arm920\", \"arm920t\", \"arm922t\", \"arm926ej-s\", \"arm940t\", \"arm946e-s\", \"arm966e-s\", \"arm968e-s\", \"arm9\", \"arm9e\", \"arm9tdmi\", \"brahma-b15\", \"brahma-b53\", \"carmel\", \"cortex-a7\", \"cortex-a8\", \"cortex-a9\", \"cortex-a12\", \"cortex-a15.cortex-a7\", \"cortex-a15\", \"cortex-a17.cortex-a7\", \"cortex-a17\", \"cortex-a32\", \"cortex-a34\", \"cortex-a35\", \"cortex-a53\", \"cortex-a55\", \"cortex-a57.cortext-a53\", \"cortex-a57\", \"cortex-a5\", \"cortex-a72.cortext-a53\", \"cortex-a72\", \"cortex-a73.cortext-a35\", \"cortex-a73.cortext-a53\", \"cortex-a73\", \"cortex-a75.cortext-a55\", \"cortex-a75\", \"cortex-a76.cortext-a55\", \"cortex-a76\", \"cortex-a76ae\", \"cortex-a77\", \"cortex-a78\", \"cortex-a78ae\", \"cortex-a76c\", \"cortex-a510\", \"cortex-a710\", \"cortex-m0\", \"cortex-m0plus\", \"cortex-m1\", \"cortex-m23\", \"cortex-m33\", \"cortex-m35p\", \"cortex-m3\", \"cortex-m4\", \"cortex-m55\", \"cortex-m7\", \"cortex-r4\", \"cortex-r4f\", \"cortex-r52\", \"cortex-r5\", \"cortex-r7\", \"cortex-r8\", \"cortex-x1\", \"cortex-x2\", \"denver2\", \"denver\", \"exynos-m1\", \"fa526\", \"fa606te\", \"fa626\", \"fa626te\", \"fa726te\", \"falkor\", \"fmp626\", 
\"generic-armv7-a\", \"i80200\", \"i80321-400-b0\", \"i80321-400\", \"i80321-600-b0\", \"i80321-600\", \"ipx1200\", \"ipx425-266\", \"ipx425-400\", \"ipx425-533\", \"iwmmxt2\", \"iwmmxt\", \"krait\", \"kryo2\", \"kryo\", \"marvell-f\", \"marvell-pj4\", \"mpcore\", \"neoverse-e1\", \"neoverse-n1\", \"neoverse-n2\", \"neoverse-v1\", \"pxa210a\", \"pxa210b\", \"pxa210c\", \"pxa250a\", \"pxa250b\", \"pxa250c\", \"pxa27x\", \"pxa30x\", \"pxa31x\", \"pxa32x\", \"pxa930\", \"sa1110\", \"saphira\", \"scorpion\", \"strongarm1100\", \"strongarm110\", \"strongarm\", \"thunderx2\", \"thunderx2t99\", \"thunderx\", \"thunderxt81\", \"thunderxt83\", \"thunderxt88\", \"tsv110\", \"xgene1\", \"xscale\".") elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(powerpc|ppc)64.*") set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. 
Other supported values are: \"none\", \"generic\", \"power8\", \"power9\", \"power10\".") else() - message(WARNING "The CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}' is not supported by OptimizeForArchitecture.cmake") + message(WARNING "The CMAKE_SYSTEM_PROCESSOR '${CMAKE_SYSTEM_PROCESSOR}' is not supported by OptimizeForArchitecture") + endif() + + if(NOT OFA_VERBOSE) + set(CMAKE_REQUIRED_QUIET true) endif() + set(_force) if(NOT _last_target_arch STREQUAL "${TARGET_ARCHITECTURE}") message(STATUS "Target architecture changed from \"${_last_target_arch}\" to \"${TARGET_ARCHITECTURE}\"") @@ -2565,4 +138,6 @@ macro(OptimizeForArchitecture) elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(powerpc|ppc)64.*") OFA_HandlePpcOptions() endif() + + message(STATUS "Optimizing for target architecture - done") endmacro(OptimizeForArchitecture) diff --git a/cmake/AddCompilerFlag.cmake b/cmake/ofa/AddCompilerFlag.cmake similarity index 96% rename from cmake/AddCompilerFlag.cmake rename to cmake/ofa/AddCompilerFlag.cmake index a2cf7b5c79..5d989b16e6 100644 --- a/cmake/AddCompilerFlag.cmake +++ b/cmake/ofa/AddCompilerFlag.cmake @@ -34,11 +34,10 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#============================================================================= -get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH) -include("${_currentDir}/CheckCCompilerFlag.cmake") -include("${_currentDir}/CheckCXXCompilerFlag.cmake") -include("${_currentDir}/CheckMicCCompilerFlag.cmake") -include("${_currentDir}/CheckMicCXXCompilerFlag.cmake") +include(ofa/CheckCCompilerFlag) +include(ofa/CheckCXXCompilerFlag) +include(ofa/CheckMicCCompilerFlag) +include(ofa/CheckMicCXXCompilerFlag) macro(AddCompilerFlag _flag) string(REGEX REPLACE "[-.+/:= ]" "_" _flag_esc "${_flag}") diff --git a/cmake/ofa/AutodetectArm.cmake b/cmake/ofa/AutodetectArm.cmake new file mode 100644 index 0000000000..9285d3ab1e --- /dev/null +++ b/cmake/ofa/AutodetectArm.cmake @@ -0,0 +1,390 @@ +#============================================================================= +# Autodetection of ARM / ARM64 CPUs +# +# This is a two-step process: +# +# 1. Get the CPUID from the system by reading /proc/cpuinfo (on +# Linux), the system registry (on Windows), or executing an +# OS-specific command (macOS, BSD, SunOS, ...) +# +# 2. 
Determine the specific CPU from the CPUID +#============================================================================= + +macro(OFA_AutodetectArm) + set(_cpu_implementer) + set(_cpu_architecture) + set(_cpu_variant) + set(_cpu_part) + set(_cpu_revision) + + # Get CPUID from system + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + + # Linux + file(READ "/proc/cpuinfo" _cpuinfo) + string(REGEX REPLACE ".*CPU implementer[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_implementer "${_cpuinfo}") + string(REGEX REPLACE ".*CPU architecture[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_architecture "${_cpuinfo}") + string(REGEX REPLACE ".*CPU variant[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_variant "${_cpuinfo}") + string(REGEX REPLACE ".*CPU part[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_part "${_cpuinfo}") + string(REGEX REPLACE ".*CPU revision[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_revision "${_cpuinfo}") + string(REGEX REPLACE ".*Features[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") + elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + exec_program("/usr/sbin/sysctl -n hw.cputype hw.cputype hw.cpusubtype hw.cpufamily hw.cpusubfamily" + OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) + if(NOT _error) + string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) + list(GET _sysctl_output 0 _cpu_implementer) + list(GET _sysctl_output 1 _cpu_architecture) + list(GET _sysctl_output 2 _cpu_variant) + list(GET _sysctl_output 3 _cpu_part) + list(GET _sysctl_output 4 _cpu_revision) + endif() + if(_error) + message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") + endif() + + # TODO: Windows, FreeBSD, ... 
+ + else() + message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") + endif() + + # Determine CPU from CPUID + # Taken from https://github.com/karelzak/util-linux/blob/master/sys-utils/lscpu-arm.c + # and https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html + + # ARM + if(_cpu_implementer STREQUAL "0x41") + if(_cpu_part STREQUAL "0x810") + set(TARGET_ARCHITECTURE "arm810") + elseif(_cpu_part STREQUAL "0x920") + set(TARGET_ARCHITECTURE "arm920t") + elseif(_cpu_part STREQUAL "0x922") + set(TARGET_ARCHITECTURE "arm922t") + elseif(_cpu_part STREQUAL "0x926") + set(TARGET_ARCHITECTURE "arm926ej-s") + elseif(_cpu_part STREQUAL "0x940") + set(TARGET_ARCHITECTURE "arm940t") + elseif(_cpu_part STREQUAL "0x946") + set(TARGET_ARCHITECTURE "arm946e-s") + elseif(_cpu_part STREQUAL "0x966") + set(TARGET_ARCHITECTURE "arm966e-s") + elseif(_cpu_part STREQUAL "0xa20") + set(TARGET_ARCHITECTURE "arm1020e") + elseif(_cpu_part STREQUAL "0xa22") + set(TARGET_ARCHITECTURE "arm1022e") + elseif(_cpu_part STREQUAL "0xa26") + set(TARGET_ARCHITECTURE "arm1026ej-s") + elseif(_cpu_part STREQUAL "0xb02") + set(TARGET_ARCHITECTURE "mpcore") + elseif(_cpu_part STREQUAL "0xb36") + set(TARGET_ARCHITECTURE "arm1136jf-s") + elseif(_cpu_part STREQUAL "0xb56") + set(TARGET_ARCHITECTURE "arm1156t2f-s") + elseif(_cpu_part STREQUAL "0xb76") + set(TARGET_ARCHITECTURE "arm1176jzf-s") + elseif(_cpu_part STREQUAL "0xc05") + set(TARGET_ARCHITECTURE "cortex-a5") + elseif(_cpu_part STREQUAL "0xc07") + set(TARGET_ARCHITECTURE "cortex-a7") + elseif(_cpu_part STREQUAL "0xc08") + set(TARGET_ARCHITECTURE "cortex-a8") + elseif(_cpu_part STREQUAL "0xc09") + set(TARGET_ARCHITECTURE "cortex-a9") + elseif(_cpu_part STREQUAL "0xc0d") + set(TARGET_ARCHITECTURE "cortex-a12") + elseif(_cpu_part STREQUAL "0xc0f") + set(TARGET_ARCHITECTURE "cortex-a15") + elseif(_cpu_part STREQUAL "0xc0e") + set(TARGET_ARCHITECTURE "cortex-a17") + elseif(_cpu_part STREQUAL 
"0xc14") + set(TARGET_ARCHITECTURE "cortex-r4f") + elseif(_cpu_part STREQUAL "0xc15") + set(TARGET_ARCHITECTURE "cortex-r5") + elseif(_cpu_part STREQUAL "0xc17") + set(TARGET_ARCHITECTURE "cortex-r7") + elseif(_cpu_part STREQUAL "0xc18") + set(TARGET_ARCHITECTURE "cortex-r8") + elseif(_cpu_part STREQUAL "0xc20") + set(TARGET_ARCHITECTURE "cortex-m0") + elseif(_cpu_part STREQUAL "0xc21") + set(TARGET_ARCHITECTURE "cortex-m1") + elseif(_cpu_part STREQUAL "0xc23") + set(TARGET_ARCHITECTURE "cortex-m3") + elseif(_cpu_part STREQUAL "0xc24") + set(TARGET_ARCHITECTURE "cortex-m4") + elseif(_cpu_part STREQUAL "0xc27") + set(TARGET_ARCHITECTURE "cortex-m7") + elseif(_cpu_part STREQUAL "0xc60") + set(TARGET_ARCHITECTURE "cortex-m0plus") + elseif(_cpu_part STREQUAL "0xd01") + set(TARGET_ARCHITECTURE "cortex-a32") + elseif(_cpu_part STREQUAL "0xd02") + set(TARGET_ARCHITECTURE "cortex-a34") + elseif(_cpu_part STREQUAL "0xd03") + set(TARGET_ARCHITECTURE "cortex-a53") + elseif(_cpu_part STREQUAL "0xd04") + set(TARGET_ARCHITECTURE "cortex-a35") + elseif(_cpu_part STREQUAL "0xd05") + set(TARGET_ARCHITECTURE "cortex-a55") + elseif(_cpu_part STREQUAL "0xd07") + set(TARGET_ARCHITECTURE "cortex-a57") + elseif(_cpu_part STREQUAL "0xd08") + set(TARGET_ARCHITECTURE "cortex-a72") + elseif(_cpu_part STREQUAL "0xd09") + set(TARGET_ARCHITECTURE "cortex-a73") + elseif(_cpu_part STREQUAL "0xd0a") + set(TARGET_ARCHITECTURE "cortex-a75") + elseif(_cpu_part STREQUAL "0xd0b") + set(TARGET_ARCHITECTURE "cortex-a76") + elseif(_cpu_part STREQUAL "0xd0c") + set(TARGET_ARCHITECTURE "neoverse-n1") + elseif(_cpu_part STREQUAL "0xd0d") + set(TARGET_ARCHITECTURE "cortex-a77") + elseif(_cpu_part STREQUAL "0xd0e") + set(TARGET_ARCHITECTURE "cortex-a76ae") + elseif(_cpu_part STREQUAL "0xd13") + set(TARGET_ARCHITECTURE "cortex-r52") + elseif(_cpu_part STREQUAL "0xd20") + set(TARGET_ARCHITECTURE "cortex-m23") + elseif(_cpu_part STREQUAL "0xd21") + set(TARGET_ARCHITECTURE "cortex-m33") + elseif(_cpu_part STREQUAL 
"0xd40") + set(TARGET_ARCHITECTURE "neoverse-v1") + elseif(_cpu_part STREQUAL "0xd41") + set(TARGET_ARCHITECTURE "cortex-a78") + elseif(_cpu_part STREQUAL "0xd42") + set(TARGET_ARCHITECTURE "cortex-a78ae") + elseif(_cpu_part STREQUAL "0xd44") + set(TARGET_ARCHITECTURE "cortex-x1") + elseif(_cpu_part STREQUAL "0xd46") + set(TARGET_ARCHITECTURE "cortex-a510") + elseif(_cpu_part STREQUAL "0xd47") + set(TARGET_ARCHITECTURE "cortex-a710") + elseif(_cpu_part STREQUAL "0xd48") + set(TARGET_ARCHITECTURE "cortex-x2") + elseif(_cpu_part STREQUAL "0xd49") + set(TARGET_ARCHITECTURE "neoverse-n2") + elseif(_cpu_part STREQUAL "0xd4a") + set(TARGET_ARCHITECTURE "neoverse-e1") + elseif(_cpu_part STREQUAL "0xd4b") + set(TARGET_ARCHITECTURE "cortex-a78c") + endif() + + # Broadcom + elseif(_cpu_implementer STREQUAL "0x42") + if(_cpu_part STREQUAL "0x0f") + set(TARGET_ARCHITECTURE "brahma-b15") + elseif(_cpu_part STREQUAL "0x100") + set(TARGET_ARCHITECTURE "brahma-b53") + elseif(_cpu_part STREQUAL "0x516") + set(TARGET_ARCHITECTURE "thunderx2") + endif() + + # Cavium + elseif(_cpu_implementer STREQUAL "0x43") + if(_cpu_part STREQUAL "0x0a0") + set(TARGET_ARCHITECTURE "thunderx") + elseif(_cpu_part STREQUAL "0x0a1") + set(TARGET_ARCHITECTURE "thunderxt88") + elseif(_cpu_part STREQUAL "0x0a2") + set(TARGET_ARCHITECTURE "thunderxt81") + elseif(_cpu_part STREQUAL "0x0a3") + set(TARGET_ARCHITECTURE "thunderxt83") + elseif(_cpu_part STREQUAL "0x0af") + set(TARGET_ARCHITECTURE "thunderx2t99") + endif() + + # DEC + elseif(_cpu_implementer STREQUAL "0x44") + if(_cpu_part STREQUAL "0xa10") + set(TARGET_ARCHITECTURE "strongarm110") + elseif(_cpu_part STREQUAL "0xa11") + set(TARGET_ARCHITECTURE "strongarm1100") + endif() + + # FUJITSU + elseif(_cpu_implementer STREQUAL "0x46") + if(_cpu_part STREQUAL "0x001") + set(TARGET_ARCHITECTURE "a64fx") + endif() + + # HiSilicon + elseif(_cpu_implementer STREQUAL "0x48") + if(_cpu_part STREQUAL "0xd01") + set(TARGET_ARCHITECTURE "tsv110") + endif() + + # 
Infineon + elseif(_cpu_implementer STREQUAL "0x49") + + # Motorola/Freescale + elseif(_cpu_implementer STREQUAL "0x4d") + + # Nvidia + elseif(_cpu_implementer STREQUAL "0x4e") + if(_cpu_part STREQUAL "0x000") + set(TARGET_ARCHITECTURE "denver") + elseif(_cpu_part STREQUAL "0x003") + set(TARGET_ARCHITECTURE "denver2") + elseif(_cpu_part STREQUAL "0x004") + set(TARGET_ARCHITECTURE "carmel") + endif() + + # APM + elseif(_cpu_implementer STREQUAL "0x50") + if(_cpu_part STREQUAL "0x000") + set(TARGET_ARCHITECTURE "xgene1") + endif() + + # Qualcomm + elseif(_cpu_implementer STREQUAL "0x51") + if(_cpu_part STREQUAL "0x00f") + set(TARGET_ARCHITECTURE "scorpion") + elseif(_cpu_part STREQUAL "0x02d") + set(TARGET_ARCHITECTURE "scorpion") + elseif(_cpu_part STREQUAL "0x04d") + set(TARGET_ARCHITECTURE "krait") + elseif(_cpu_part STREQUAL "0x06f") + set(TARGET_ARCHITECTURE "krait") + elseif(_cpu_part STREQUAL "0x201") + set(TARGET_ARCHITECTURE "kryo") + elseif(_cpu_part STREQUAL "0x205") + set(TARGET_ARCHITECTURE "kryo") + elseif(_cpu_part STREQUAL "0x211") + set(TARGET_ARCHITECTURE "kryo") + elseif(_cpu_part STREQUAL "0x800") + set(TARGET_ARCHITECTURE "falkor") + elseif(_cpu_part STREQUAL "0x801") + set(TARGET_ARCHITECTURE "kryo2") + elseif(_cpu_part STREQUAL "0xc00") + set(TARGET_ARCHITECTURE "falkor") + elseif(_cpu_part STREQUAL "0xc01") + set(TARGET_ARCHITECTURE "saphira") + endif() + + # Samsung + elseif(_cpu_implementer STREQUAL "0x53") + if(_cpu_part STREQUAL "0x001") + set(TARGET_ARCHITECTURE "exynos-m1") + endif() + + # Marvell + elseif(_cpu_implementer STREQUAL "0x56") + if(_cpu_part STREQUAL "0x131") + set(TARGET_ARCHITECTURE "marvell-f") + elseif(_cpu_part STREQUAL "0x581") + set(TARGET_ARCHITECTURE "marvell-pj4") + elseif(_cpu_part STREQUAL "0x584") + set(TARGET_ARCHITECTURE "marvell-pj4") + endif() + + # Apple + elseif(_cpu_implementer STREQUAL "0x61") + if(_cpu_part STREQUAL "0x022") + set(TARGET_ARCHITECTURE "icestorm") + elseif(_cpu_part STREQUAL "0x023") + 
set(TARGET_ARCHITECTURE "firestorm") + endif() + + # Faraday + elseif(_cpu_implementer STREQUAL "0x66") + if(_cpu_part STREQUAL "0x526") + set(TARGET_ARCHITECTURE "fa526") + elseif(_cpu_part STREQUAL "0x626") + set(TARGET_ARCHITECTURE "fa626") + endif() + + # Intel + elseif(_cpu_implementer STREQUAL "0x69") + if(_cpu_part STREQUAL "0x200") + set(TARGET_ARCHITECTURE "i80200") + elseif(_cpu_part STREQUAL "0x210") + set(TARGET_ARCHITECTURE "pxa250a") + elseif(_cpu_part STREQUAL "0x212") + set(TARGET_ARCHITECTURE "pxa210a") + elseif(_cpu_part STREQUAL "0x242") + set(TARGET_ARCHITECTURE "i80321-400") + elseif(_cpu_part STREQUAL "0x243") + set(TARGET_ARCHITECTURE "i80321-600") + elseif(_cpu_part STREQUAL "0x290") + set(TARGET_ARCHITECTURE "pxa250b") + elseif(_cpu_part STREQUAL "0x292") + set(TARGET_ARCHITECTURE "pxa210b") + elseif(_cpu_part STREQUAL "0x2c2") + set(TARGET_ARCHITECTURE "i80321-400-b0") + elseif(_cpu_part STREQUAL "0x2c3") + set(TARGET_ARCHITECTURE "i80321-600-b0") + elseif(_cpu_part STREQUAL "0x2d0") + set(TARGET_ARCHITECTURE "pxa250c") + elseif(_cpu_part STREQUAL "0x2d2") + set(TARGET_ARCHITECTURE "pxa210c") + elseif(_cpu_part STREQUAL "0x411") + set(TARGET_ARCHITECTURE "pxa27x") + elseif(_cpu_part STREQUAL "0x41c") + set(TARGET_ARCHITECTURE "ipx425-533") + elseif(_cpu_part STREQUAL "0x41d") + set(TARGET_ARCHITECTURE "ipx425-400") + elseif(_cpu_part STREQUAL "0x41f") + set(TARGET_ARCHITECTURE "ipx425-266") + elseif(_cpu_part STREQUAL "0x682") + set(TARGET_ARCHITECTURE "pxa32x") + elseif(_cpu_part STREQUAL "0x683") + set(TARGET_ARCHITECTURE "pxa930") + elseif(_cpu_part STREQUAL "0x688") + set(TARGET_ARCHITECTURE "pxa30x") + elseif(_cpu_part STREQUAL "0x689") + set(TARGET_ARCHITECTURE "pxa31x") + elseif(_cpu_part STREQUAL "0xb11") + set(TARGET_ARCHITECTURE "sa1110") + elseif(_cpu_part STREQUAL "0xc12") + set(TARGET_ARCHITECTURE "ipx1200") + endif() + + # Phytium + elseif(_cpu_implementer STREQUAL "0x70") + if(_cpu_part STREQUAL "0x662") + 
set(TARGET_ARCHITECTURE "ftc662") + elseif(_cpu_part STREQUAL "0x663") + set(TARGET_ARCHITECTURE "ftc663") + endif() + + # Ampere + elseif(_cpu_implementer STREQUAL "0xc0") + + # Taken from /Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/System/Library/Frameworks/Kernel.framework/Versions/A/Headers/mach/machine.h + elseif(_cpu_implementer STREQUAL "16777228" OR _cpu_implementer STREQUAL "0x100000C") # Apple ARM64 + if( _cpu_part STREQUAL "0x1e2d6381" OR _cpu_part STREQUAL "506291073") # Swift (A6) + set(TARGET_ARCHITECTURE "apple-a6") + elseif(_cpu_part STREQUAL "0x37a09642" OR _cpu_part STREQUAL "933271106") # Cyclone (A7) + set(TARGET_ARCHITECTURE "apple-a7") + elseif(_cpu_part STREQUAL "0x2c91a47e" OR _cpu_part STREQUAL "747742334") # Typhoon (A8) + set(TARGET_ARCHITECTURE "apple-a8") + elseif(_cpu_part STREQUAL "0x92fb37c8" OR _cpu_part STREQUAL "2465937352") # Twister (A9) + set(TARGET_ARCHITECTURE "apple-a9") + elseif(_cpu_part STREQUAL "0x67ceee93" OR _cpu_part STREQUAL "1741614739") # Hurrican (A10) + set(TARGET_ARCHITECTURE "apple-a10") + elseif(_cpu_part STREQUAL "0xe81e7ef6" OR _cpu_part STREQUAL "3894312694") # Monsoon Mistral (A11) + set(TARGET_ARCHITECTURE "apple-a11") + elseif(_cpu_part STREQUAL "0x07d34b9f" OR _cpu_part STREQUAL "131287967") # Vortex Tempest (A12) + set(TARGET_ARCHITECTURE "apple-a12") + elseif(_cpu_part STREQUAL "0x462504d2" OR _cpu_part STREQUAL "1176831186") # Lightning Thunder (A13) + set(TARGET_ARCHITECTURE "apple-a13") + elseif(_cpu_part STREQUAL "0x1b588bb3" OR _cpu_part STREQUAL "458787763") # Firestorm Icestorm (A14 / M1 / M1 Pro / M1 Max) + set(TARGET_ARCHITECTURE "apple-m1") + elseif(_cpu_part STREQUAL "0xda33d83d" OR _cpu_part STREQUAL "3660830781") # Blizzard Avalanche (A15) + endif() + + else() + message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") + return() + endif() + + if(OFA_VERBOSE) + message(STATUS "CPU implementer: ${_cpu_implementer}") + message(STATUS 
"CPU architecture: ${_cpu_architecture}") + message(STATUS "CPU variant: ${_cpu_variant}") + message(STATUS "CPU part: ${_cpu_part}") + message(STATUS "CPU revision: ${_cpu_revision}") + endif() +endmacro(OFA_AutodetectArm) diff --git a/cmake/ofa/AutodetectPpc.cmake b/cmake/ofa/AutodetectPpc.cmake new file mode 100644 index 0000000000..4e66d1e7e8 --- /dev/null +++ b/cmake/ofa/AutodetectPpc.cmake @@ -0,0 +1,55 @@ +#============================================================================= +# Autodetection of PPC / PPC64 CPUs +# +# This is a two-step process: +# +# 1. Get the CPUID from the system by reading /proc/cpuinfo (on +# Linux), the system registry (on Windows), or executing an +# OS-specific command (macOS, BSD, SunOS, ...) +# +# 2. Determine the specific CPU from the CPUID +#============================================================================= + +macro(OFA_AutodetectPpc) + set(_cpu) + + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + + # Linux + file(READ "/proc/cpuinfo" _cpuinfo) + string(REGEX REPLACE ".*cpu[ \t]*:[ \t]+([a-zA-Z0-9_+-]+).*" "\\1" _cpu "${_cpuinfo}") # the plus sign is part of the character class so that names such as POWER5+ are captured intact + if(_cpu STREQUAL "POWER3") + set(TARGET_ARCHITECTURE "power3") + elseif(_cpu STREQUAL "POWER4" OR _cpu STREQUAL "POWER4+") + set(TARGET_ARCHITECTURE "power4") + elseif(_cpu STREQUAL "POWER5") + set(TARGET_ARCHITECTURE "power5") + elseif(_cpu STREQUAL "POWER5+") + set(TARGET_ARCHITECTURE "power5+") + elseif(_cpu STREQUAL "POWER6") + set(TARGET_ARCHITECTURE "power6") + elseif(_cpu STREQUAL "POWER6X") + set(TARGET_ARCHITECTURE "power6x") + elseif(_cpu STREQUAL "POWER7" OR _cpu STREQUAL "POWER7+") + set(TARGET_ARCHITECTURE "power7") + elseif(_cpu STREQUAL "POWER8" OR _cpu STREQUAL "POWER8NVL") + set(TARGET_ARCHITECTURE "power8") + elseif(_cpu STREQUAL "POWER9" OR _cpu STREQUAL "POWER9NVL") + set(TARGET_ARCHITECTURE "power9") + elseif(_cpu STREQUAL "POWER10" OR _cpu STREQUAL "POWER10NVL") + set(TARGET_ARCHITECTURE "power10") + else() + message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") + 
endif() + + # TODO: AIX, FreeBSD, ... + + else() + message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") + return() + endif() + + if(OFA_VERBOSE) + message(STATUS "CPU: ${_cpu}") + endif() +endmacro(OFA_AutodetectPpc) diff --git a/cmake/ofa/AutodetectX86.cmake b/cmake/ofa/AutodetectX86.cmake new file mode 100644 index 0000000000..a720e8caf7 --- /dev/null +++ b/cmake/ofa/AutodetectX86.cmake @@ -0,0 +1,356 @@ +#============================================================================= +# Autodetection of X86 / X86_64 CPUs +# +# This is a two-step process: +# +# 1. Get the CPUID from the system by reading /proc/cpuinfo (on +# Linux), the system registry (on Windows), or executing an +# OS-specific command (macOS, BSD, SunOS, ...) +# +# 2. Determine the specific CPU from the CPUID +#============================================================================= + +macro(OFA_AutodetectX86) + set(_vendor_id) + set(_cpu_family) + set(_cpu_model) + set(_cpu_stepping) + + # Get CPUID from system + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + + # Linux + file(READ "/proc/cpuinfo" _cpuinfo) + string(REGEX REPLACE ".*vendor_id[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _vendor_id "${_cpuinfo}") + string(REGEX REPLACE ".*cpu family[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_family "${_cpuinfo}") + string(REGEX REPLACE ".*model[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_model "${_cpuinfo}") + string(REGEX REPLACE ".*stepping[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_stepping "${_cpuinfo}") + string(REGEX REPLACE ".*flags[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") + + elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + + # macOS + exec_program("/usr/sbin/sysctl -n machdep.cpu.vendor machdep.cpu.family machdep.cpu.model machdep.cpu.stepping machdep.cpu.features" + OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) + if(NOT _error) + string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) + 
list(GET _sysctl_output 0 _vendor_id) + list(GET _sysctl_output 1 _cpu_family) + list(GET _sysctl_output 2 _cpu_model) + list(GET _sysctl_output 3 _cpu_stepping) + list(GET _sysctl_output 4 _cpu_flags) + string(TOLOWER "${_cpu_flags}" _cpu_flags) + string(REPLACE "." "_" _cpu_flags "${_cpu_flags}") + else() + # Apple Silicon (ARM64) running in Rosetta 2 mode + # + # The regular detection mechanism for macOS-x86_64 does not work + # because the emulated CPU does not provide the required + # information via the sysctl command. We therefore generate fake + # vendor, model, and stepping information based on the + # macOS-specific CPU codes. + exec_program("/usr/sbin/sysctl -n hw.cputype machdep.cpu.family hw.cpufamily machdep.cpu.features" + OUTPUT_VARIABLE _sysctl_output_string RETURN_VALUE _error) + if(NOT _error) + string(REPLACE "\n" ";" _sysctl_output ${_sysctl_output_string}) + list(GET _sysctl_output 0 _cpu_implementer) + list(GET _sysctl_output 1 _cpu_family) + list(GET _sysctl_output 2 _cpu_model) + list(GET _sysctl_output 3 _cpu_flags) + string(TOLOWER "${_cpu_flags}" _cpu_flags) + string(REPLACE "." 
"_" _cpu_flags "${_cpu_flags}") + + # Fake vendor + if(_cpu_implementer STREQUAL "0x7" OR _cpu_implementer STREQUAL "7") + set(_vendor_id "GenuineIntel") + else() + set(_vendor_id "Unknown") + endif() + + # Fake stepping + set(_cpu_stepping "Unknown") + + # Fake model + # Taken from /Library/Developer/CommandLineTools/SDKs/MacOSX12.sdk/System/Library/Frameworks/Kernel.framework/Versions/A/Headers/mach/machine.h + if( _cpu_model STREQUAL "0x78ea4fbc" OR _cpu_model STREQUAL "2028621756") # Penryn + set(_cpu_model "23") + elseif(_cpu_model STREQUAL "0x6b5a4cd2" OR _cpu_model STREQUAL "1801080018") # Nehalem + set(_cpu_model "26") + elseif(_cpu_model STREQUAL "0x573b5eec" OR _cpu_model STREQUAL "1463508716") # Westmere + set(_cpu_model "37") + elseif(_cpu_model STREQUAL "0x5490b78c" OR _cpu_model STREQUAL "1418770316") # Sandybridge + set(_cpu_model "42") + elseif(_cpu_model STREQUAL "0x1f65e835" OR _cpu_model STREQUAL "526772277") # Ivybridge + set(_cpu_model "58") + elseif(_cpu_model STREQUAL "0x10b282dc" OR _cpu_model STREQUAL "280134364") # Haswell + set(_cpu_model "60") + elseif(_cpu_model STREQUAL "0x582ed09c" OR _cpu_model STREQUAL "1479463068") # Broadwell + set(_cpu_model "61") + elseif(_cpu_model STREQUAL "0x37fc219f" OR _cpu_model STREQUAL "939270559") # Skylake + set(_cpu_model "78") + elseif(_cpu_model STREQUAL "0x0f817246" OR _cpu_model STREQUAL "260141638") # Kabylake + set(_cpu_model "142") + elseif(_cpu_model STREQUAL "0x38435547" OR _cpu_model STREQUAL "943936839") # Icelake + set(_cpu_model "125") + elseif(_cpu_model STREQUAL "0x1cf8a03e" OR _cpu_model STREQUAL "486055998") # Cometlake + set(_cpu_model "142") + else() + set(_cpu_model "Unknown") + endif() + endif() + endif() + if(_error) + message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") + endif() + + elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") + + # Windows + get_filename_component(_vendor_id 
"[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE) + get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE) + mark_as_advanced(_vendor_id _cpu_id) + string(REGEX REPLACE ".* Family ([0-9]+) .*" "\\1" _cpu_family "${_cpu_id}") + string(REGEX REPLACE ".* Model ([0-9]+) .*" "\\1" _cpu_model "${_cpu_id}") + string(REGEX REPLACE ".* Stepping ([0-9]+).*" "\\1" _cpu_stepping "${_cpu_id}") # the stepping is the last field of the identifier, so no trailing text is required + + # TODO: BSD, Android, QNX, ... + + else() + message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") + endif() + + # Determine CPU from CPUID + if(_vendor_id STREQUAL "GenuineIntel") + if(_cpu_family EQUAL 6) + # taken from the Intel ORM + # http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html + # CPUID Signature Values of Recent Intel Microarchitectures + # 4E 5E | Skylake microarchitecture + # 3D 47 56 | Broadwell microarchitecture + # 3C 45 46 3F | Haswell microarchitecture + # 3A 3E | Ivy Bridge microarchitecture + # 2A 2D | Sandy Bridge microarchitecture + # 25 2C 2F | Intel microarchitecture Westmere + # 1A 1E 1F 2E | Intel microarchitecture Nehalem + # 17 1D | Enhanced Intel Core microarchitecture + # 0F | Intel Core microarchitecture + # + # Intel SDM Vol. 3C 35-1 / December 2016: + # 57 | Xeon Phi 3200, 5200, 7200 [Knights Landing] + # 85 | Future Xeon Phi + # 8E 9E | 7th gen. Core [Kaby Lake] + # 55 | Future Xeon [Skylake w/ AVX512] + # 4E 5E | 6th gen. Core / E3 v5 [Skylake w/o AVX512] + # 56 | Xeon D-1500 [Broadwell] + # 4F | Xeon E5 v4, E7 v4, i7-69xx [Broadwell] + # 47 | 5th gen. Core / Xeon E3 v4 [Broadwell] + # 3D | M-5xxx / 5th gen. [Broadwell] + # 3F | Xeon E5 v3, E7 v3, i7-59xx [Haswell-E] + # 3C 45 46 | 4th gen. 
Core, Xeon E3 v3 [Haswell] + # 3E | Xeon E5 v2, E7 v2, i7-49xx [Ivy Bridge-E] + # 3A | 3rd gen. 
Core, Xeon E3 v2 [Ivy Bridge] + # 2D | Xeon E5, i7-39xx [Sandy Bridge] + # 2F | Xeon E7 + # 2A | Xeon E3, 2nd gen. Core [Sandy Bridge] + # 2E | Xeon 7500, 6500 series + # 25 2C | Xeon 3600, 5600 series, Core i7, i5 and i3 + # + # Values from the Intel SDE: + # 5C | Goldmont + # 5A | Silvermont + # 57 | Knights Landing + # 66 | Cannonlake + # 55 | Skylake Server + # 4E | Skylake Client + # 3C | Broadwell (likely a bug in the SDE) + # 3C | Haswell + # + # Latest updates taken from https://en.wikichip.org/wiki/intel/cpuid + + # MIC architecture + if(_cpu_model EQUAL 133) + set(TARGET_ARCHITECTURE "knm") # Knights Mill + + elseif(_cpu_model EQUAL 87) + set(TARGET_ARCHITECTURE "knl") # Knights Landing + + # Small cores + elseif(_cpu_model EQUAL 134) + set(TARGET_ARCHITECTURE "tremont") + + elseif(_cpu_model EQUAL 122) + set(TARGET_ARCHITECTURE "goldmont-plus") + + elseif(_cpu_model EQUAL 92 OR _cpu_model EQUAL 95) + set(TARGET_ARCHITECTURE "goldmont") + + elseif(_cpu_model EQUAL 90 OR _cpu_model EQUAL 93 OR _cpu_model EQUAL 74 OR _cpu_model EQUAL 76 OR _cpu_model EQUAL 77 OR _cpu_model EQUAL 55) + set(TARGET_ARCHITECTURE "silvermont") + + elseif(_cpu_model EQUAL 28 OR _cpu_model EQUAL 38 OR _cpu_model EQUAL 39 OR _cpu_model EQUAL 53 OR _cpu_model EQUAL 54) + set(TARGET_ARCHITECTURE "bonnell") + + # Big cores + elseif(_cpu_model EQUAL 167) + set(TARGET_ARCHITECTURE "rocketlake") + + elseif(_cpu_model EQUAL 151 OR _cpu_model EQUAL 154) + set(TARGET_ARCHITECTURE "alderlake") + + elseif(_cpu_model EQUAL 143) + set(TARGET_ARCHITECTURE "sapphirerapids") + + elseif(_cpu_model EQUAL 142 OR _cpu_model EQUAL 158 OR _cpu_model EQUAL 165) + set(TARGET_ARCHITECTURE "kabylake") + + elseif(_cpu_model EQUAL 140) + set(TARGET_ARCHITECTURE "tigerlake") + + elseif(_cpu_model EQUAL 125 OR _cpu_model EQUAL 126) + set(TARGET_ARCHITECTURE "icelake") + + elseif(_cpu_model EQUAL 106 OR _cpu_model EQUAL 108) + set(TARGET_ARCHITECTURE "icelake-avx512") + + elseif(_cpu_model EQUAL 102) + 
set(TARGET_ARCHITECTURE "cannonlake") + + elseif(_cpu_model EQUAL 85) + if(_cpu_stepping LESS 5) + set(TARGET_ARCHITECTURE "skylake-avx512") + elseif(_cpu_stepping LESS 8) + set(TARGET_ARCHITECTURE "cascadelake") + else() + set(TARGET_ARCHITECTURE "cooperlake") + endif() + + elseif(_cpu_model EQUAL 78 OR _cpu_model EQUAL 94) + set(TARGET_ARCHITECTURE "skylake") + + elseif(_cpu_model EQUAL 61 OR _cpu_model EQUAL 71 OR _cpu_model EQUAL 79 OR _cpu_model EQUAL 86) + set(TARGET_ARCHITECTURE "broadwell") + + elseif(_cpu_model EQUAL 60 OR _cpu_model EQUAL 69 OR _cpu_model EQUAL 70 OR _cpu_model EQUAL 63) + set(TARGET_ARCHITECTURE "haswell") + + elseif(_cpu_model EQUAL 58 OR _cpu_model EQUAL 62) + set(TARGET_ARCHITECTURE "ivybridge") + + elseif(_cpu_model EQUAL 42 OR _cpu_model EQUAL 45) + set(TARGET_ARCHITECTURE "sandybridge") + + elseif(_cpu_model EQUAL 37 OR _cpu_model EQUAL 44 OR _cpu_model EQUAL 47) + set(TARGET_ARCHITECTURE "westmere") + + elseif(_cpu_model EQUAL 26 OR _cpu_model EQUAL 30 OR _cpu_model EQUAL 31 OR _cpu_model EQUAL 46) + set(TARGET_ARCHITECTURE "nehalem") + + elseif(_cpu_model EQUAL 23 OR _cpu_model EQUAL 29) + set(TARGET_ARCHITECTURE "penryn") + + elseif(_cpu_model EQUAL 15 OR _cpu_model EQUAL 22) + set(TARGET_ARCHITECTURE "merom") + + elseif(_cpu_model EQUAL 28) # NOTE: unreachable, model 28 is already handled by the "bonnell" branch above + set(TARGET_ARCHITECTURE "atom") + + elseif(_cpu_model EQUAL 14) + set(TARGET_ARCHITECTURE "core") + + elseif(_cpu_model LESS 14) + message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. Auto-detection of optimization flags failed and will use the generic CPU settings with SSE2.") + set(TARGET_ARCHITECTURE "generic") + else() + message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. 
Auto-detection of optimization flags failed and will use the 65nm Core 2 CPU settings.") + set(TARGET_ARCHITECTURE "merom") + endif() + + elseif(_cpu_family EQUAL 7) # Itanium (not supported) + message(WARNING "Your CPU (Itanium: family ${_cpu_family}, model ${_cpu_model}) is not supported by OptimizeForArchitecture.cmake.") + + elseif(_cpu_family EQUAL 15) # NetBurst + list(APPEND _available_vector_units_list "sse" "sse2") + if(_cpu_model GREATER 2) # Not sure whether this must be 3 or even 4 instead + list(APPEND _available_vector_units_list "sse" "sse2" "sse3") + endif() + + endif() + + elseif(_vendor_id STREQUAL "AuthenticAMD") + # taken from the list of AMD CPU microarchitectures + # https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures + # CPUID Signature Values of Recent AMD Microarchitectures + # 05 05h | K6 + # 06 06h | K7 + # 15 0Fh | K8 / Hammer + # 16 10h | K10 + # 17 11h | K8 & K10 "hybrid" + # 18 12h | K10 (Llano) / K12 (ARM based AMD cpu) + # 20 14h | Bobcat + # 21 15h | Bulldozer / Piledriver / Steamroller / Excavator + # 22 16h | Jaguar / Puma + # 23 17h | Zen / Zen+ / Zen 2 + # 24 18h | Hygon Dhyana + # 25 19h | Zen 3 + + if(_cpu_family EQUAL 25) # 19h + set(TARGET_ARCHITECTURE "zen3") + + elseif(_cpu_family EQUAL 24) # 18h + set(TARGET_ARCHITECTURE "zen") + + elseif(_cpu_family EQUAL 23) # 17h + if(_cpu_model LESS 49) + set(TARGET_ARCHITECTURE "zen") + else() + set(TARGET_ARCHITECTURE "zen2") + endif() + + elseif(_cpu_family EQUAL 22) # 16h + set(TARGET_ARCHITECTURE "amd16h") + + elseif(_cpu_family EQUAL 21) # 15h + if(_cpu_model LESS 16) + set(TARGET_ARCHITECTURE "bulldozer") + elseif(_cpu_model LESS 32) + set(TARGET_ARCHITECTURE "piledriver") + elseif(_cpu_model LESS 64) + set(TARGET_ARCHITECTURE "steamroller") + else() + set(TARGET_ARCHITECTURE "excavator") + endif() + + elseif(_cpu_family EQUAL 20) # 14h + set(TARGET_ARCHITECTURE "amd14h") + + elseif(_cpu_family EQUAL 18) # 12h (K10 / K12) + + elseif(_cpu_family EQUAL 17) # 11h 
(K8 & K10 hybrid) + + elseif(_cpu_family EQUAL 16) # 10h (K10) + set(TARGET_ARCHITECTURE "barcelona") + + elseif(_cpu_family EQUAL 15) # 0Fh (K8 / Hammer) + if(_cpu_model LESS 39) + set(TARGET_ARCHITECTURE "k8") + else() + set(TARGET_ARCHITECTURE "k8-sse3") + endif() + + elseif(_cpu_family EQUAL 6) # 06h (K7) + elseif(_cpu_family EQUAL 5) # 05h (K6) + + endif() + + else() + message(WARNING "Auto-detection of optimization flags failed and will use the generic CPU settings.") + return() + endif() + + if(OFA_VERBOSE) + message(STATUS "Vendor id: ${_vendor_id}") + message(STATUS "CPU family: ${_cpu_family}") + message(STATUS "CPU model: ${_cpu_model}") + message(STATUS "CPU stepping: ${_cpu_stepping}") + endif() +endmacro(OFA_AutodetectX86) diff --git a/cmake/CheckCCompilerFlag.cmake b/cmake/ofa/CheckCCompilerFlag.cmake similarity index 100% rename from cmake/CheckCCompilerFlag.cmake rename to cmake/ofa/CheckCCompilerFlag.cmake diff --git a/cmake/CheckCXXCompilerFlag.cmake b/cmake/ofa/CheckCXXCompilerFlag.cmake similarity index 100% rename from cmake/CheckCXXCompilerFlag.cmake rename to cmake/ofa/CheckCXXCompilerFlag.cmake diff --git a/cmake/CheckMicCCompilerFlag.cmake b/cmake/ofa/CheckMicCCompilerFlag.cmake similarity index 100% rename from cmake/CheckMicCCompilerFlag.cmake rename to cmake/ofa/CheckMicCCompilerFlag.cmake diff --git a/cmake/CheckMicCXXCompilerFlag.cmake b/cmake/ofa/CheckMicCXXCompilerFlag.cmake similarity index 100% rename from cmake/CheckMicCXXCompilerFlag.cmake rename to cmake/ofa/CheckMicCXXCompilerFlag.cmake diff --git a/cmake/ofa/HandleArmOptions.cmake b/cmake/ofa/HandleArmOptions.cmake new file mode 100644 index 0000000000..a29280c476 --- /dev/null +++ b/cmake/ofa/HandleArmOptions.cmake @@ -0,0 +1,926 @@ +#============================================================================= +# Handling of ARM / ARM64 options +# +# This is a two-step process: +# +# 1. Generate a list of compiler flags for the specific CPU +# +# 2. 
Special compiler-specific treatment of "native" flag +# +# 3. Disabling of "broken" features based on OFA_xxx_INTRINSICS_BROKEN options +# +# 4. Set compiler-specific flags +#============================================================================= + +include(ofa/AddCompilerFlag) +include(CheckIncludeFileCXX) + +macro(OFA_HandleArmOptions) + set(_march_flag_list) + set(_mtune_flag_list) + set(_available_vector_units_list) + + # ARM + if(TARGET_ARCHITECTURE STREQUAL "strongarm") + list(APPEND _mtune_flag_list "strongarm") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "arm8") + list(APPEND _mtune_flag_list "arm8") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "arm810") + list(APPEND _mtune_flag_list "arm810") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "fa526") + list(APPEND _mtune_flag_list "fa526") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "fa626") + list(APPEND _mtune_flag_list "fa626") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "arm7tdmi") + list(APPEND _mtune_flag_list "arm7tdmi") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm7tdmi-s") + list(APPEND _mtune_flag_list "arm7tdmi-s") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm710t") + list(APPEND _mtune_flag_list "arm710t") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm720t") + list(APPEND _mtune_flag_list "arm720t") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm740t") + list(APPEND _mtune_flag_list "arm740t") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm9") + list(APPEND _mtune_flag_list "arm9") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm9tdmi") + list(APPEND _mtune_flag_list "arm9tdmi") + list(APPEND _march_flag_list 
"armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm920") + list(APPEND _mtune_flag_list "arm920") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm920t") + list(APPEND _mtune_flag_list "arm920t") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm922t") + list(APPEND _mtune_flag_list "arm922t") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm940t") + list(APPEND _mtune_flag_list "arm940t") + list(APPEND _march_flag_list "armv4t") + + elseif(TARGET_ARCHITECTURE STREQUAL "arm1020t") + list(APPEND _mtune_flag_list "arm1020t") + list(APPEND _march_flag_list "armv5t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm10tdmi") + list(APPEND _mtune_flag_list "arm10tdmi") + list(APPEND _march_flag_list "armv5t") + + elseif(TARGET_ARCHITECTURE STREQUAL "arm9e") + list(APPEND _mtune_flag_list "arm9e") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_vector_units_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm946e-s") + list(APPEND _mtune_flag_list "arm946e-s") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_vector_units_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm966e-s") + list(APPEND _mtune_flag_list "arm966e-s") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_vector_units_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm968e-s") + list(APPEND _mtune_flag_list "arm968e-s") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_vector_units_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm10e") + list(APPEND _mtune_flag_list "arm10e") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_vector_units_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1020e") + list(APPEND _mtune_flag_list "arm1020e") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_vector_units_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1022e") + list(APPEND _mtune_flag_list 
"arm1022e") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_vector_units_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "xscale") + list(APPEND _mtune_flag_list "xscale") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "iwmmxt") + list(APPEND _mtune_flag_list "iwmmxt") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "iwmmxt2") + list(APPEND _mtune_flag_list "iwmmxt2") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "fa606te") + list(APPEND _mtune_flag_list "fa606te") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "fa626te") + list(APPEND _mtune_flag_list "fa626te") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "fmp626") + list(APPEND _mtune_flag_list "fmp626") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "fa726te") + list(APPEND _mtune_flag_list "fa726te") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "arm926ej-s") + list(APPEND _mtune_flag_list "arm926ej-s") + list(APPEND _march_flag_list "armv5tej") + list(APPEND _available_vector_units_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1026ej-s") + list(APPEND _mtune_flag_list "arm1026ej-s") + list(APPEND _march_flag_list "armv5tej") + list(APPEND _available_vector_units_list "fp") + + elseif(TARGET_ARCHITECTURE STREQUAL "mpcore") + list(APPEND _mtune_flag_list "mpcore") + list(APPEND _march_flag_list "armv6k") + list(APPEND _available_vector_units_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1136j-s") + list(APPEND _mtune_flag_list "arm1136j-s") + list(APPEND _march_flag_list "armv6j") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1136jf-s") + list(APPEND _mtune_flag_list "arm1136jf-s") + list(APPEND _march_flag_list "armv6j") + list(APPEND _available_vector_units_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1156t2-s") + list(APPEND _mtune_flag_list 
"arm1156t2-s") + list(APPEND _march_flag_list "armv6t2") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1156t2f-s") + list(APPEND _mtune_flag_list "arm1156t2f-s") + list(APPEND _march_flag_list "armv6t2") + list(APPEND _available_vector_units_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1176jz-s") + list(APPEND _mtune_flag_list "arm1176jz-s") + list(APPEND _march_flag_list "armv6kz") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1176jzf-s") + list(APPEND _mtune_flag_list "arm1176jzf-s") + list(APPEND _march_flag_list "armv6kz") + list(APPEND _available_vector_units_list "fp") + + elseif(TARGET_ARCHITECTURE STREQUAL "generic-armv7-a") + list(APPEND _mtune_flag_list "generic-armv7-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "mp" "sec" "vfpv3-d16" "vfpv3" "vfpv3-d16-fp16" "vfpv3-fp16" "vfpv4-d16" "vfpv4" "simd" "neon-fp16" "neon-vfpv4") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78") + list(APPEND _mtune_flag_list "cortex-a78") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a5") + list(APPEND _mtune_flag_list "cortex-a5") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "mp" "sec" "neon-fp16") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a7") + list(APPEND _mtune_flag_list "cortex-a7") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_vector_units_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a8") + list(APPEND _mtune_flag_list "cortex-a8") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "sec" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a9") + list(APPEND _mtune_flag_list "cortex-a9") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "mp" 
"sec" "neon-fp16") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a12") + list(APPEND _mtune_flag_list "cortex-a12") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_vector_units_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a15") + list(APPEND _mtune_flag_list "cortex-a15") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_vector_units_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a15.cortex-a7") + list(APPEND _mtune_flag_list "cortex-a15.cortex-a7") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_vector_units_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a17") + list(APPEND _mtune_flag_list "cortex-a17") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_vector_units_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a17.cortex-a7") + list(APPEND _mtune_flag_list "cortex-a17.cortex-a7") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_vector_units_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a32") + list(APPEND _mtune_flag_list "cortex-a32") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a34") + list(APPEND _mtune_flag_list "cortex-a34") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "crypto" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a35") + list(APPEND _mtune_flag_list "cortex-a35") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "crypto" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a53") + list(APPEND _mtune_flag_list "cortex-a53") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "crypto" "simd") + 
elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a55") + list(APPEND _mtune_flag_list "cortex-a55") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a57") + list(APPEND _mtune_flag_list "cortex-a57") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a57.cortex-a53" OR TARGET_ARCHITECTURE STREQUAL "cortex-a57.cortext-a53") # the misspelled "cortext" name is still accepted for backward compatibility + list(APPEND _mtune_flag_list "cortex-a57.cortex-a53") # value handed to -mcpu/-mtune must use the compiler's spelling + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a72") + list(APPEND _mtune_flag_list "cortex-a72") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "crypto" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a72.cortex-a53" OR TARGET_ARCHITECTURE STREQUAL "cortex-a72.cortext-a53") # also accepts legacy "cortext" spelling + list(APPEND _mtune_flag_list "cortex-a72.cortex-a53") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73") + list(APPEND _mtune_flag_list "cortex-a73") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "crypto" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortex-a35" OR TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortext-a35") # also accepts legacy "cortext" spelling + list(APPEND _mtune_flag_list "cortex-a73.cortex-a35") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortex-a53" OR TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortext-a53") # also accepts legacy "cortext" spelling + list(APPEND _mtune_flag_list "cortex-a73.cortex-a53") + list(APPEND _march_flag_list
"armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a75") + list(APPEND _mtune_flag_list "cortex-a75") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a75.cortex-a55" OR TARGET_ARCHITECTURE STREQUAL "cortex-a75.cortext-a55") # the misspelled "cortext" name is still accepted for backward compatibility + list(APPEND _mtune_flag_list "cortex-a75.cortex-a55") # value handed to -mcpu/-mtune must use the compiler's spelling + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76") + list(APPEND _mtune_flag_list "cortex-a76") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76.cortex-a55" OR TARGET_ARCHITECTURE STREQUAL "cortex-a76.cortext-a55") # also accepts legacy "cortext" spelling + list(APPEND _mtune_flag_list "cortex-a76.cortex-a55") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76ae") + list(APPEND _mtune_flag_list "cortex-a76ae") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a77") + list(APPEND _mtune_flag_list "cortex-a77") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") +
list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78") + list(APPEND _mtune_flag_list "cortex-a78") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78ae") + list(APPEND _mtune_flag_list "cortex-a78ae") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78c") + list(APPEND _mtune_flag_list "cortex-a78c") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a510") # condition must name the same core as the tuning flag below + list(APPEND _mtune_flag_list "cortex-a510") + list(APPEND _march_flag_list "armv9-a") + list(APPEND _march_flag_list "armv8.6-a") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a710") + list(APPEND _mtune_flag_list "cortex-a710") + list(APPEND _march_flag_list "armv9-a") + list(APPEND _march_flag_list "armv8.6-a") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list
"armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m0") + list(APPEND _mtune_flag_list "cortex-m0") + list(APPEND _march_flag_list "armv6s-m") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m0plus") + list(APPEND _mtune_flag_list "cortex-m0plus") + list(APPEND _march_flag_list "armv6s-m") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m1") + list(APPEND _mtune_flag_list "cortex-m1") + list(APPEND _march_flag_list "armv6s-m") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m3") + list(APPEND _mtune_flag_list "cortex-m3") + list(APPEND _march_flag_list "armv7-m") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m4") + list(APPEND _mtune_flag_list "cortex-m4") + list(APPEND _march_flag_list "armv7e-m") + list(APPEND _available_vector_units_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m7") + list(APPEND _mtune_flag_list "cortex-m7") + list(APPEND _march_flag_list "armv7e-m") + list(APPEND _available_vector_units_list "fp.dp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m23") + list(APPEND _mtune_flag_list "cortex-m23") + list(APPEND _march_flag_list "armv8-m.base") + list(APPEND _march_flag_list "armv7-m") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m33") + list(APPEND _mtune_flag_list "cortex-m33") + list(APPEND _march_flag_list "armv8-m.main") + list(APPEND _march_flag_list "armv7-m") + list(APPEND _available_vector_units_list "dsp" "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m35p") + list(APPEND _mtune_flag_list "cortex-m35p") + list(APPEND _march_flag_list "armv8-m.main") + list(APPEND _march_flag_list "armv7-m") + list(APPEND _available_vector_units_list "dsp" "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m55") + list(APPEND _mtune_flag_list "cortex-m55") + list(APPEND _march_flag_list "armv8.1-m.main") + list(APPEND _march_flag_list 
"armv8-m") + list(APPEND _march_flag_list "armv7-m") + list(APPEND _available_vector_units_list "mve.fp" "fp.dp") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4") + list(APPEND _mtune_flag_list "cortex-r4") + list(APPEND _march_flag_list "armv7-r") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4f") + list(APPEND _mtune_flag_list "cortex-r4f") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_vector_units_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r5") + list(APPEND _mtune_flag_list "cortex-r5") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_vector_units_list "idiv" "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r7") + list(APPEND _mtune_flag_list "cortex-r7") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_vector_units_list "idiv" "vfpv3-d16-fp16") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r8") + list(APPEND _mtune_flag_list "cortex-r8") + list(APPEND _march_flag_list "armv7-r") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r52") + list(APPEND _mtune_flag_list "cortex-r52") + list(APPEND _march_flag_list "armv8-r") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_vector_units_list "crc" "simd" "idiv" "vfpv3-d16-fp16") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-x1") + list(APPEND _mtune_flag_list "cortex-x1") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-x2") + list(APPEND _march_flag_list "armv9-a") + list(APPEND _march_flag_list "armv8.6-a") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list 
"armv7-a") + list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") + + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-e1") + list(APPEND _mtune_flag_list "neoverse-e1") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n1") + list(APPEND _mtune_flag_list "neoverse-n1") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n2") + list(APPEND _mtune_flag_list "neoverse-n2") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-v1") + list(APPEND _mtune_flag_list "neoverse-v1") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") + + # Broadcom + elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b15") + list(APPEND _mtune_flag_list "brahma-b15") + elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b53") + list(APPEND _mtune_flag_list "brahma-b53") + elseif(TARGET_ARCHITECTURE STREQUAL "thunderx2") + list(APPEND _mtune_flag_list "thunderx2") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND 
_march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crypto") + + # Cavium + elseif(TARGET_ARCHITECTURE STREQUAL "thunderx") + list(APPEND _mtune_flag_list "thunderx") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "crypto") + elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt88") + list(APPEND _mtune_flag_list "thunderxt88") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "crypto") + elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt81") + list(APPEND _mtune_flag_list "thunderxt81") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "crypto") + elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt83") + list(APPEND _mtune_flag_list "thunderxt83") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "crypto") + elseif(TARGET_ARCHITECTURE STREQUAL "thunderx2t99") + list(APPEND _mtune_flag_list "thunderx2t99") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "crypto") + + # DEC + elseif(TARGET_ARCHITECTURE STREQUAL "strongarm110") + list(APPEND _mtune_flag_list "strongarm110") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "strongarm1100") + list(APPEND _mtune_flag_list "strongarm1100") + list(APPEND _march_flag_list "armv4") + + # FUJITSU + elseif(TARGET_ARCHITECTURE STREQUAL "a64fx") + list(APPEND _mtune_flag_list "a64fx") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list 
"fp16" "sve") + + # HiSilicon + elseif(TARGET_ARCHITECTURE STREQUAL "tsv110") + list(APPEND _mtune_flag_list "tsv110") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "aes" "crypto" "fp16" "sha2") + + # Nvidia + elseif(TARGET_ARCHITECTURE STREQUAL "denver") + list(APPEND _mtune_flag_list "denver") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") + elseif(TARGET_ARCHITECTURE STREQUAL "denver2") + list(APPEND _mtune_flag_list "denver2") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") + elseif(TARGET_ARCHITECTURE STREQUAL "carmel") + list(APPEND _mtune_flag_list "denver") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") + + # APM + elseif(TARGET_ARCHITECTURE STREQUAL "xgene1") + list(APPEND _mtune_flag_list "xgene1") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + + # Qualcomm + elseif(TARGET_ARCHITECTURE STREQUAL "scorpion") + list(APPEND _mtune_flag_list "scorpion") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "krait") + list(APPEND _mtune_flag_list "krait") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "kryo") + list(APPEND _mtune_flag_list "kryo") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "kryo2") + list(APPEND _mtune_flag_list "kryo2") + list(APPEND _march_flag_list "armv8.2-a") + 
list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "falkor") + list(APPEND _mtune_flag_list "falkor") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "saphira") + list(APPEND _mtune_flag_list "saphira") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") + + # Samsung + elseif(TARGET_ARCHITECTURE STREQUAL "exynos-m1") + list(APPEND _mtune_flag_list "exynos-m1") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "crypto" "simd") + + # Marvell + elseif(TARGET_ARCHITECTURE STREQUAL "marvell-f") + list(APPEND _mtune_flag_list "marvell-f") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "marvell-pj4") + list(APPEND _mtune_flag_list "marvell-pj4") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_vector_units_list "mp" "sec" "fp") + + # Intel + elseif(TARGET_ARCHITECTURE STREQUAL "i80200") + list(APPEND _mtune_flag_list "i80200") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa250a") + list(APPEND _mtune_flag_list "pxa250a") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa210a") + list(APPEND _mtune_flag_list "pxa210a") + elseif(TARGET_ARCHITECTURE STREQUAL "i80321-400") + list(APPEND _mtune_flag_list "i80321-400") + elseif(TARGET_ARCHITECTURE STREQUAL "i80321-600") + list(APPEND _mtune_flag_list "i80321-600") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa250b") + list(APPEND _mtune_flag_list "pxa250b") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa210b") + 
list(APPEND _mtune_flag_list "pxa210b") + elseif(TARGET_ARCHITECTURE STREQUAL "i80321-400-b0") + list(APPEND _mtune_flag_list "i80321-400-b0") + elseif(TARGET_ARCHITECTURE STREQUAL "i80321-600-b0") + list(APPEND _mtune_flag_list "i80321-600-b0") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa250c") + list(APPEND _mtune_flag_list "pxa250c") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa210c") + list(APPEND _mtune_flag_list "pxa210c") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa27x") + list(APPEND _mtune_flag_list "pxa27x") + elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-533") + list(APPEND _mtune_flag_list "ipx425-533") + elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-400") + list(APPEND _mtune_flag_list "ipx425-400") + elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-266") + list(APPEND _mtune_flag_list "ipx425-266") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa32x") + list(APPEND _mtune_flag_list "pxa32x") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa930") + list(APPEND _mtune_flag_list "pxa930") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa30x") + list(APPEND _mtune_flag_list "pxa30x") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa31x") + list(APPEND _mtune_flag_list "pxa31x") + elseif(TARGET_ARCHITECTURE STREQUAL "sa1110") + list(APPEND _mtune_flag_list "sa1110") + elseif(TARGET_ARCHITECTURE STREQUAL "ipx1200") + list(APPEND _mtune_flag_list "ipx1200") + + # Apple + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a6") + list(APPEND _mtune_flag_list "apple-a6") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a7") + list(APPEND _mtune_flag_list "apple-a7") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a8") + list(APPEND _mtune_flag_list "apple-a8") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a9") + 
list(APPEND _mtune_flag_list "apple-a9") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a10") + list(APPEND _mtune_flag_list "apple-a10") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "neon" "rdm" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a11") + list(APPEND _mtune_flag_list "apple-a11") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "lse" "neon" "ras" "rdm" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a12") + list(APPEND _mtune_flag_list "apple-a12") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a13") + list(APPEND _mtune_flag_list "apple-a13") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_vector_units_list "aes" "crc" "crypto" "dotprod" "fp" "fp16" "fp16fml" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "sha3" "sm4" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-m1") + list(APPEND _mtune_flag_list "vortex") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp"
"fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") + + # Others + elseif(TARGET_ARCHITECTURE STREQUAL "generic") + list(APPEND _march_flag_list "generic") + elseif(TARGET_ARCHITECTURE STREQUAL "native") + list(APPEND _march_flag_list "native") + elseif(TARGET_ARCHITECTURE STREQUAL "none") + # add this clause to remove it from the else clause + + else() + message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.") + endif() + + # Special treatment for "native": the flag is compiler-specific and is set directly, bypassing the per-CPU flag lists + if(TARGET_ARCHITECTURE STREQUAL "native") + if(MSVC) + # MSVC (on Windows): no "native" switch exists, so abort the configuration + message(FATAL_ERROR "MSVC does not support \"native\" flag.") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC" + OR CMAKE_CXX_COMPILER_ID MATCHES "PGI") + # NVidia HPC / PGI (on Linux/Windows) + AddCompilerFlag("-tp=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") + # Cray (on Linux): no "native" switch exists, so abort the configuration + message(FATAL_ERROR "Cray compiler does not support \"native\" flag.") + else() + # Others: GNU, Clang and variants + AddCompilerFlag("-mcpu=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + + # Apply architecture flags + elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") + + # Disable "broken" features based on OFA_xxx_INTRINSICS_BROKEN options + set(_disable_vector_unit_list) + set(_enable_vector_unit_list) + + # TODO: Add OFA_xxx_INTRINSICS_BROKEN rules + set(_aes_broken false) + set(_bf16_broken false) + set(_crc_broken false) + set(_crypto_broken false) + set(_dotprod_broken false) + set(_dsp_broken false) + set(_fp16_broken false) + set(_fp16fml_broken false) + set(_fp_broken false) + set(_fp_dp_broken false) + set(_fp_sp_broken false) + set(_i8mm_broken false) + set(_idiv_broken false) + set(_lse_broken false) + set(_mve_broken false) + set(_mve_fp_broken false) + set(_neon_broken false) + set(_neon_fp16_broken false) + set(_neon_vfpv4_broken false) + set(_ras_broken false) + set(_rcpc_broken false) + set(_rdm_broken false) +
set(_rdma_broken false) + set(_sec_broken false) + set(_sha2_broken false) + set(_sha3_broken false) + set(_simd_broken false) + set(_sm4_broken false) + set(_sve_broken false) + set(_vfpv3_broken false) + set(_vfpv3_d16_broken false) + set(_vfpv3_d16_fp16_broken false) + set(_vfpv3_fp16_broken false) + set(_vfpv4_broken false) + set(_vfpv4_d16_broken false) + set(_zcm_broken false) + set(_zcz_broken false) + + # Enable/disable macro: defines cache option USE_<_name> (default: is _flag in _available_vector_units_list and not marked broken?) and files _flag into the enable or disable list + macro(_enable_or_disable _name _flag _documentation _broken) + if(${_broken}) # macro arguments are not variables; expand to the name of the _xxx_broken variable so if() evaluates that variable + set(_found false) + else() + _my_find(_available_vector_units_list "${_flag}" _found) + endif() + set(USE_${_name} ${_found} CACHE BOOL "${_documentation}" ${_force}) + mark_as_advanced(USE_${_name}) + if(USE_${_name}) + list(APPEND _enable_vector_unit_list "${_flag}") + else() + list(APPEND _disable_vector_unit_list "${_flag}") + endif() + endmacro() + + # Enable/disable features + _enable_or_disable(AES "aes" "Use AES. This will enable the aes and pmull crypto extension." _aes_broken) + _enable_or_disable(BF16 "bf16" "Use BF16. This will enable the brain half-precision floating-point instructions." _bf16_broken) + _enable_or_disable(CRC "crc" "Use CRC. This will enable the Cyclic Redundancy Check (CRC) instructions." _crc_broken) + _enable_or_disable(CRYPTO "crypto" "Use CRYPTO. This will enable the cryptographic instructions." _crypto_broken) + _enable_or_disable(DOTPROD "dotprod" "Use DOTPROD. This will enable the Dot Product extension. This also enables Advanced SIMD instructions." _dotprod_broken) + _enable_or_disable(DSP "dsp" "Use DSP. This will enable the DSP instruction." _dsp_broken) + _enable_or_disable(FP "fp" "Use FP. This will enable the floating-point data processing instructions." _fp_broken) + _enable_or_disable(FP16 "fp16" "Use FP16. This will enable the half-precision floating-point data processing instructions." _fp16_broken) + _enable_or_disable(FP16FML "fp16fml" "Use FP16FML. This will enable the FP16 fmla extension."
_fp16fml_broken) + _enable_or_disable(FP_DP "fp.dp" "Use FP.DP. This will enable the single- and double-precision floating-point instructions." _fp_dp_broken) + _enable_or_disable(FP_SP "fp.sp" "Use FP.SP. This will enable the single-precision floating-point instructions." _fp_sp_broken) + _enable_or_disable(I8MM "i8mm" "Use I8MM. This will enable the 8-bit Integer Matrix Multiply instructions." _i8mm_broken) + _enable_or_disable(IDIV "idiv" "Use IDIV. This will enable the ARM-state integer division instructions." _idiv_broken) + _enable_or_disable(LSE "lse" "Use LSE. This will enable the Large System Extension instructions." _lse_broken) + _enable_or_disable(MVE "mve" "Use MVE. This will enable the M-Profile Vector Extension (MVE) integer instructions." _mve_broken) + _enable_or_disable(MVE_FP "mve.fp" "Use MVE.FP. This will enable the M-Profile Vector Extension (MVE) integer and single precision floating-point instructions." _mve_fp_broken) + _enable_or_disable(NEON "neon" "Use NEON. This will enable the Advanced SIMD (Neon) v1." _neon_broken) + _enable_or_disable(NEON_FP16 "neon-fp16" "Use NEON-FP16. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions, with the half-precision floating-point conversion operations." _neon_fp16_broken) + _enable_or_disable(NEON_VFPV4 "neon-vfpv4" "Use NEON-VFPV4. This will enable the Advanced SIMD (Neon) v2 and the VFPv4 floating-point instructions." _neon_vfpv4_broken) + _enable_or_disable(RAS "ras" "Use RAS. This will enable the Reliability, Availability, and Serviceability extension." _ras_broken) + _enable_or_disable(RCPC "rcpc" "Use RCPC. This will enable the RcPc extension." _rcpc_broken) + _enable_or_disable(RDM "rdm" "Use RDM. This will enable the RDM extension." _rdm_broken) + _enable_or_disable(RDMA "rdma" "Use RDMA. This will enable the Round Double Multiply Accumulate instructions." _rdma_broken) + _enable_or_disable(SEC "sec" "Use SEC. This will enable the security extension." 
_sec_broken) + _enable_or_disable(SHA2 "sha2" "Use SHA2. This will enable the sha2 crypto extension." _sha2_broken) + _enable_or_disable(SHA3 "sha3" "Use SHA3. This will enable the sha512 and sha3 crypto extension." _sha3_broken) + _enable_or_disable(SIMD "simd" "Use SIMD. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions." _simd_broken) + _enable_or_disable(SM4 "sm4" "Use SM4. This will enable the sm3 and sm4 crypto extension." _sm4_broken) + _enable_or_disable(SVE "sve" "Use SVE. This will enable the Scalable Vector Extension instructions." _sve_broken) + _enable_or_disable(VFPV3 "vfpv3" "Use VFPV3. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers." _vfpv3_broken) + _enable_or_disable(VFPV3_D16 "vfpv3-d16" "Use VFPV3-D16. This will enable the VFPv3 floating-point instructions, with 16 double-precision registers." _vfpv3_d16_broken) + _enable_or_disable(VFPV3_D16_FP16 "vfpv3-d16-fp16" "Use VFPV3-D16-FP16. This will enable VFPv3 floating-point instructions, with 16 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_fp16_broken) + _enable_or_disable(VFPV3_FP16 "vfpv3-fp16" "Use VFPV3-FP16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_fp16_broken) + _enable_or_disable(VFPV4 "vfpv4" "Use VFPV4. This will enable the VFPv4 floating-point instructions, with 32 double-precision registers." _vfpv4_broken) + _enable_or_disable(VFPV4_D16 "vfpv4-d16" "Use VFPV4-D16. This will enable the VFPv4 floating-point instructions, with 16 double-precision registers." _vfpv4_d16_broken) # last argument must name the variable created by set(_vfpv4_d16_broken ...) + _enable_or_disable(ZCM "zcm" "Use ZCM. This will enable the ZCM extension." _zcm_broken) + _enable_or_disable(ZCZ "zcz" "Use ZCZ. This will enable the ZCZ extension."
_zcz_broken) + + # Add compiler flags + if(MSVC AND MSVC_VERSION GREATER 1900) + _my_find(_enable_vector_unit_list "vfpv4" _found) + if(_found) + AddCompilerFlag("/arch:VFPv4" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() + if(NOT _found) + _my_find(_enable_vector_unit_list "simd" _found) + if(_found) + AddCompilerFlag("/arch:ARMv7VE" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() + endif() + foreach(_flag ${_enable_vector_unit_list}) + string(TOUPPER "${_flag}" _flag) + string(REPLACE "." "_" _flag "__${_flag}__") + add_definitions("-D${_flag}") + endforeach(_flag) + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") + + # TODO: Add Cray flags + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Fujitsu") + + # TODO: Add Fujitsu flags + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC") + + # TODO: Add NVHPC flags + + else() + # Others: GNU, Clang and variants + + # Following the recommendation from + # https://community.arm.com/developer/tools-software/tools/b/tools-software-ides-blog/posts/compiler-flags-across-architectures-march-mtune-and-mcpu + # we first try to only use the -mcpu flag. If that fails, e.g., if + # the compiler does not yet support the specified target, we try to + # set the -march and -mtune flags as fallback option. 
+ foreach(_flag ${_mtune_flag_list}) + AddCompilerFlag("-mcpu=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + if(_good) + break() + endif(_good) + endforeach(_flag) + + # Fallback: set -march, -mtune flags + if(NOT _good) + # Set -march flag + foreach(_march ${_march_flag_list}) + AddCompilerFlag("-march=${_march}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) + if(_good) + set(_march_plus_extensions "${_march}") + foreach(_flag ${_enable_vector_unit_list}) + AddCompilerFlag("-march=${_march_plus_extensions}+${_flag}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) + if(_good) + set(_march_plus_extensions "${_march_plus_extensions}+${_flag}") + endif(_good) + endforeach(_flag) + AddCompilerFlag("-march=${_march_plus_extensions}" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + break() + endif(_good) + endforeach(_march) + + # Set -mtune flag + foreach(_mtune ${_mtune_flag_list}) + AddCompilerFlag("-mtune=${_mtune}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + if(_good) + break() + endif(_good) + endforeach(_mtune) + endif(NOT _good) + + # Note that ARM does not support -mfeature and -mno-feature to + # enable and disable specific features. Hence, there are no + # loops over the _enable_vector_unit_list and + # _disable_vector_unit_list lists here(!) + endif() + endif() + + # Compile code with profiling instrumentation + if(TARGET_PROFILER STREQUAL "gprof") + AddCompilerFlag("-pg" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() +endmacro(OFA_HandleArmOptions) diff --git a/cmake/ofa/HandlePpcOptions.cmake b/cmake/ofa/HandlePpcOptions.cmake new file mode 100644 index 0000000000..eb5b2d9c2c --- /dev/null +++ b/cmake/ofa/HandlePpcOptions.cmake @@ -0,0 +1,139 @@ +#============================================================================= +# Handling of PPC / PPC64 options +# +# This is a two-step process: +# +# 1. Generate a list of compiler flags for the specific CPU +# +# 2. Special compiler-specific treatment of "native" flag +# +# 3. 
Disabling of "broken" features based on OFA_xxx_INTRINSICS_BROKEN options
+#
+# 4. Set compiler-specific flags
+#=============================================================================
+
+include(ofa/AddCompilerFlag)
+include(CheckIncludeFileCXX)
+
+macro(OFA_HandlePpcOptions)
+  set(_march_flag_list)             # CPU names matching TARGET_ARCHITECTURE, most specific first
+  set(_available_vector_units_list) # searched by _enable_or_disable(); nothing in this macro appends to it yet
+
+  # Define macros for PowerPC64: each one appends its own CPU name(s) and then chains to an older generation
+  macro(_power3)
+    list(APPEND _march_flag_list "power3")
+  endmacro()
+  macro(_power4)
+    list(APPEND _march_flag_list "power4")
+    _power3()
+  endmacro()
+  macro(_power5)
+    list(APPEND _march_flag_list "power5")
+    _power4()
+  endmacro()
+  macro(_power5plus)
+    list(APPEND _march_flag_list "power5+")
+    _power5()
+  endmacro()
+  macro(_power6)
+    list(APPEND _march_flag_list "power6")
+    _power5()
+  endmacro()
+  macro(_power6x)
+    list(APPEND _march_flag_list "power6x")
+    _power6()
+  endmacro()
+  macro(_power7)
+    list(APPEND _march_flag_list "power7")
+    _power6()
+  endmacro()
+  macro(_power8)
+    list(APPEND _march_flag_list "power8")
+    list(APPEND _march_flag_list "pwr8")
+    _power7()
+  endmacro()
+  macro(_power9)
+    list(APPEND _march_flag_list "power9")
+    list(APPEND _march_flag_list "pwr9")
+    _power8()
+  endmacro()
+  macro(_power10)
+    list(APPEND _march_flag_list "power10")
+    list(APPEND _march_flag_list "pwr10")
+    _power9()
+  endmacro()
+
+  # PowerPC64: select the macro chain that matches TARGET_ARCHITECTURE
+  if(TARGET_ARCHITECTURE STREQUAL "power3")
+    _power3()
+  elseif(TARGET_ARCHITECTURE STREQUAL "power4")
+    _power4()
+  elseif(TARGET_ARCHITECTURE STREQUAL "power5")
+    _power5()
+  elseif(TARGET_ARCHITECTURE STREQUAL "power5+")
+    _power5plus()
+  elseif(TARGET_ARCHITECTURE STREQUAL "power6")
+    _power6()
+  elseif(TARGET_ARCHITECTURE STREQUAL "power6x")
+    _power6x()
+  elseif(TARGET_ARCHITECTURE STREQUAL "power7")
+    _power7()
+  elseif(TARGET_ARCHITECTURE STREQUAL "power8")
+    _power8()
+  elseif(TARGET_ARCHITECTURE STREQUAL "power9")
+    _power9()
+  elseif(TARGET_ARCHITECTURE STREQUAL "power10")
+    _power10()
+
+  # Others
+  elseif(TARGET_ARCHITECTURE STREQUAL "generic")
+    list(APPEND _march_flag_list "generic")
+  elseif(TARGET_ARCHITECTURE STREQUAL "native")
+    list(APPEND _march_flag_list "native")
+  elseif(TARGET_ARCHITECTURE STREQUAL "none")
+    # "none" is listed explicitly so that it does not reach the FATAL_ERROR in the else() branch
+
+  else()
+    message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.")
+  endif()
+
+  # Special treatment for "native"
+  if(TARGET_ARCHITECTURE STREQUAL "native")
+    # TODO: not implemented yet; no compiler flag is added for "native"
+  # Apply architecture flags
+  elseif(NOT TARGET_ARCHITECTURE STREQUAL "none")
+
+    # Disable "broken" features based on OFA_xxx_INTRINSICS_BROKEN options
+    set(_disable_vector_unit_list)
+    set(_enable_vector_unit_list)
+
+    # Enable/disable macro: defines the cache option USE_<_name> (help text <_documentation>) and appends <_flag> to the enable or disable list
+    macro(_enable_or_disable _name _flag _documentation _broken)
+      if(${_broken}) # macro arguments are string replacements, not variables, so the argument must be dereferenced
+        set(_found false)
+      else()
+        _my_find(_available_vector_units_list "${_flag}" _found)
+      endif()
+      set(USE_${_name} ${_found} CACHE BOOL "${_documentation}" ${_force}) # NOTE(review): _force is not set in this macro - confirm where it comes from
+      mark_as_advanced(USE_${_name})
+      if(USE_${_name})
+        list(APPEND _enable_vector_unit_list "${_flag}")
+      else()
+        list(APPEND _disable_vector_unit_list "${_flag}")
+      endif()
+    endmacro()
+
+    # Enable/disable features (TODO: there are no _enable_or_disable() calls yet, so both lists stay empty)
+
+    # Add compiler flags
+    if(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC")
+      # TODO: Add NVHPC flags
+    elseif(CMAKE_CXX_COMPILER_ID MATCHES "XL")
+      # TODO: Add XL flags
+    else()
+      # Others: GNU, Clang and variants
+      # TODO: Add GNU/Clang flags
+
+    endif()
+  endif()
+endmacro(OFA_HandlePpcOptions)
diff --git a/cmake/ofa/HandleX86Options.cmake b/cmake/ofa/HandleX86Options.cmake
new file mode 100644
index 0000000000..2a6ece54f1
--- /dev/null
+++ b/cmake/ofa/HandleX86Options.cmake
@@ -0,0 +1,803 @@
+#=============================================================================
+# Handling of X86 / X86_64 options
+#
+# This is a two-step process:
+#
+# 1. Generate a list of compiler flags for the specific CPU
+#
+# 2. Apply compiler flags
+#
+# 2.1 Enable/disable features based on compiler capabilities
+#
+# 4.
Set compiler-specific flags +#============================================================================= + +include(ofa/AddCompilerFlag) +include(CheckIncludeFileCXX) + +macro(OFA_HandleX86Options) + + # Special treatment for "native" flag + if(TARGET_ARCHITECTURE STREQUAL "native") + if(MSVC) + # MSVC (on Windows) + message(FATAL_ERROR, "[OFA] MSVC does not support \"native\" flag.") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel" + OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") + if(WIN32) + # Intel (on Windows) + AddCompilerFlag("/QxHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + else() + # Intel (on Linux) + AddCompilerFlag("-xHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC" + OR CMAKE_CXX_COMPILER_ID MATCHES "PGI") + # NVidia HPC / PGI (on Linux/Windows + AddCompilerFlag("-tp=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "SunPro") + # Sun/Oracle Studio (on Linux/Sun OS) + AddCompilerFlag("-native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") + # Cray (on Linux) + message(FATAL_ERROR, "[OFA] Cray compiler does not support \"native\" flag.") + else() + # Others: GNU, Clang and variants + _my_find(OFA_ARCHITECTURE_FLAGS "-march=native" _found) + if(NOT _found) + AddCompilerFlag("-march=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + endif() + return() + endif() + + # Step 1: Generate a list of compiler flags for the specific CPU + set(_march_flag_list) + set(_available_extension_list) + + # Define macros for Intel + macro(_nehalem) + list(APPEND _march_flag_list "nehalem") + list(APPEND _march_flag_list "corei7") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "popcnt") + endmacro() + macro(_westmere) + list(APPEND _march_flag_list "westmere") + _nehalem() + list(APPEND _available_extension_list "aes" "pclmul") + endmacro() + macro(_sandybridge) + list(APPEND 
_march_flag_list "sandybridge") + list(APPEND _march_flag_list "corei7-avx") + _westmere() + list(APPEND _available_extension_list "avx") + endmacro() + macro(_ivybridge) + list(APPEND _march_flag_list "ivybridge") + list(APPEND _march_flag_list "core-avx-i") + _sandybridge() + list(APPEND _available_extension_list "rdrnd" "f16c" "fsgsbase") + endmacro() + macro(_haswell) + list(APPEND _march_flag_list "haswell") + list(APPEND _march_flag_list "core-avx2") + _ivybridge() + list(APPEND _available_extension_list "avx2" "fma" "bmi" "bmi2") + endmacro() + macro(_broadwell) + list(APPEND _march_flag_list "broadwell") + _haswell() + list(APPEND _available_extension_list "rdseed" "adcx" "prfchw") + endmacro() + macro(_skylake) + list(APPEND _march_flag_list "skylake") + _broadwell() + list(APPEND _available_extension_list "clflushopt" "xsavec" "xsaves") + endmacro() + macro(_skylake_avx512) + list(APPEND _march_flag_list "skylake-avx512") + _skylake() + list(APPEND _available_extension_list "avx512bw" "avx512cd" "avx512dq" "avx512f" "avx512vl" "clwb" "pku") + endmacro() + macro(_cascadelake) + list(APPEND _march_flag_list "cascadelake") + _skylake_avx512() + list(APPEND _available_extension_list "avx512vnni") + endmacro() + macro(_cooperlake) + list(APPEND _march_flag_list "cooperlake") + _cascadelake() + list(APPEND _available_extension_list "avx512bf16") + endmacro() + macro(_cannonlake) + list(APPEND _march_flag_list "cannonlake") + _skylake() + list(APPEND _available_extension_list "avx512bw" "avx512cd" "avx512dq" "avx512f" "avx512vl" "clwb" "pku" "avx512ifma" "avx512vbmi" "sha" "umip") + endmacro() + macro(_icelake) + list(APPEND _march_flag_list "icelake-client") + _cannonlake() + list(APPEND _available_extension_list "avx512bitalg" "avx512vbmi2" "avx512vnni" "avx512vpopcntdq" "clwb" "gfni" "rdpid" "vaes" "vpclmulqdq") + endmacro() + macro(_icelake_avx512) + list(APPEND _march_flag_list "icelake-server") + _icelake() + list(APPEND _available_extension_list "pconfig" 
"wbnoinvd") + endmacro() + macro(_tigerlake) + list(APPEND _march_flag_list "tigerlake") + _icelake() + list(APPEND _available_extension_list "avx512vp2intersect" "keylocker" "movdir64b" "movdiri" "pconfig" "wbnoinvd") + endmacro() + macro(_alderlake) + list(APPEND _march_flag_list "alderlake") + _broadwell() + list(APPEND _available_extension_list "avxvnni" "cldemote" "clwb" "gfni" "hreset" "kl" "lzcnt" "movdir64b" "movdiri" "pconfig" "pku" "ptwrite" "rdpid" "serialize" "sgx" "umip" "vaes" "vpclmulqdq" "waitpkg" "widekl" "xsave" "xsavec" "xsaveopt" "xsaves") + endmacro() + macro(_sapphirerapids) + list(APPEND _march_flag_list "sapphirerapids") + _skylake_avx512() + list(APPEND _available_extension_list "amx-bf16" "amx-int8" "amx-tile" "avxvnni" "avx512bf16" "avx512vnni" "avx512vp2intersect" "cldemote" "enqcmd" "movdir64b" "movdiri" "ptwrite" "serialize" "tsxldtrk" "uintr" "waitpkg") + endmacro() + macro(_rocketlake) + list(APPEND _march_flag_list "rocketlake") + _skylake_avx512() + list(APPEND _available_extension_list "avx512bitalg" "avx512ifma" "avx512vbmi" "avx512vbmi2" "avx512vnni" "avx512vpopcntdq" "gfni" "rdpid" "sha" "umip" "vaes" "vpclmulqdq") + endmacro() + macro(_knightslanding) + list(APPEND _march_flag_list "knl") + _broadwell() + list(APPEND _available_extension_list "avx512f" "avx512pf" "avx512er" "avx512cd") + endmacro() + macro(_knightsmill) + list(APPEND _march_flag_list "knm") + _broadwell() + list(APPEND _available_extension_list "avx512f" "avx512pf" "avx512er" "avx512cd" "avx5124fmaps" "avx5124vnni" "avx512vpopcntdq") + endmacro() + macro(_silvermont) + list(APPEND _march_flag_list "silvermont") + _westmere() + list(APPEND _available_extension_list "rdrnd") + endmacro() + macro(_goldmont) + list(APPEND _march_flag_list "goldmont") + _silvermont() + list(APPEND _available_extension_list "rdseed") + endmacro() + macro(_goldmont_plus) + list(APPEND _march_flag_list "goldmont-plus") + _goldmont() + list(APPEND _available_extension_list "rdpid") + 
endmacro() + macro(_tremont) + list(APPEND _march_flag_list "tremont") + _goldmont_plus() + endmacro() + + # TODO: Define similar macros for AMD + + # Intel + if(TARGET_ARCHITECTURE STREQUAL "core") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3") + elseif(TARGET_ARCHITECTURE STREQUAL "merom") + list(APPEND _march_flag_list "merom") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3") + elseif(TARGET_ARCHITECTURE STREQUAL "penryn") + list(APPEND _march_flag_list "penryn") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3") + message(STATUS "[OFA] Sadly the Penryn architecture exists in variants with SSE4.1 and without SSE4.1.") + if(_cpu_flags MATCHES "sse4_1") + message(STATUS "[OFA] SSE4.1: enabled (auto-detected from this computer's CPU flags)") + list(APPEND _available_extension_list "sse4.1") + else() + message(STATUS "[OFA] SSE4.1: disabled (auto-detected from this computer's CPU flags)") + endif() + elseif(TARGET_ARCHITECTURE STREQUAL "knm") + _knightsmill() + elseif(TARGET_ARCHITECTURE STREQUAL "knl") + _knightslanding() + elseif(TARGET_ARCHITECTURE STREQUAL "rocketlake") + _rocketlake() + elseif(TARGET_ARCHITECTURE STREQUAL "sapphirerapids") + _sapphirerapids() + elseif(TARGET_ARCHITECTURE STREQUAL "alderlake") + _alderlake() + elseif(TARGET_ARCHITECTURE STREQUAL "tigerlake") + _tigerlake() + elseif(TARGET_ARCHITECTURE STREQUAL "icelake") + _icelake() + elseif(TARGET_ARCHITECTURE STREQUAL "icelake-xeon" OR TARGET_ARCHITECTURE STREQUAL "icelake-avx512") + _icelake_avx512() + elseif(TARGET_ARCHITECTURE STREQUAL "cannonlake") + _cannonlake() + elseif(TARGET_ARCHITECTURE STREQUAL "cooperlake") + _cooperlake() + elseif(TARGET_ARCHITECTURE STREQUAL "cascadelake") + _cascadelake() + elseif(TARGET_ARCHITECTURE STREQUAL "kabylake") + _skylake() + elseif(TARGET_ARCHITECTURE STREQUAL "skylake-xeon" OR 
TARGET_ARCHITECTURE STREQUAL "skylake-avx512") + _skylake_avx512() + elseif(TARGET_ARCHITECTURE STREQUAL "skylake") + _skylake() + elseif(TARGET_ARCHITECTURE STREQUAL "broadwell") + _broadwell() + elseif(TARGET_ARCHITECTURE STREQUAL "haswell") + _haswell() + elseif(TARGET_ARCHITECTURE STREQUAL "ivybridge") + _ivybridge() + elseif(TARGET_ARCHITECTURE STREQUAL "sandybridge") + _sandybridge() + elseif(TARGET_ARCHITECTURE STREQUAL "westmere") + _westmere() + elseif(TARGET_ARCHITECTURE STREQUAL "nehalem") + _nehalem() + elseif(TARGET_ARCHITECTURE STREQUAL "tremont") + _tremont() + elseif(TARGET_ARCHITECTURE STREQUAL "goldmont-plus") + _goldmont_plus() + elseif(TARGET_ARCHITECTURE STREQUAL "goldmont") + _goldmont() + elseif(TARGET_ARCHITECTURE STREQUAL "silvermont") + _silvermont() + elseif(TARGET_ARCHITECTURE STREQUAL "bonnell") + list(APPEND _march_flag_list "bonnell") + list(APPEND _march_flag_list "atom") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3") + elseif(TARGET_ARCHITECTURE STREQUAL "atom") + list(APPEND _march_flag_list "atom") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3") + + # AMD + elseif(TARGET_ARCHITECTURE STREQUAL "k8") + list(APPEND _march_flag_list "k8") + list(APPEND _available_extension_list "sse" "sse2") + elseif(TARGET_ARCHITECTURE STREQUAL "k8-sse3") + list(APPEND _march_flag_list "k8-sse3") + list(APPEND _march_flag_list "k8") + list(APPEND _available_extension_list "sse" "sse2" "sse3") + elseif(TARGET_ARCHITECTURE STREQUAL "amd16h") + list(APPEND _march_flag_list "btver2") + list(APPEND _march_flag_list "btver1") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "f16c") + elseif(TARGET_ARCHITECTURE STREQUAL "amd14h") + list(APPEND _march_flag_list "btver1") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a") + elseif(TARGET_ARCHITECTURE STREQUAL 
"zen3") + list(APPEND _march_flag_list "znver2") + list(APPEND _march_flag_list "znver1") + _skylake() + list(APPEND _available_extension_list "sse4a") + elseif(TARGET_ARCHITECTURE STREQUAL "zen2") + list(APPEND _march_flag_list "znver2") + list(APPEND _march_flag_list "znver1") + _skylake() + list(APPEND _available_extension_list "sse4a") + elseif(TARGET_ARCHITECTURE STREQUAL "zen") + list(APPEND _march_flag_list "znver1") + _skylake() + list(APPEND _available_extension_list "sse4a") + elseif(TARGET_ARCHITECTURE STREQUAL "excavator") + list(APPEND _march_flag_list "bdver4") + list(APPEND _march_flag_list "bdver3") + list(APPEND _march_flag_list "bdver2") + list(APPEND _march_flag_list "bdver1") + list(APPEND _march_flag_list "bulldozer") + list(APPEND _march_flag_list "barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "avx2" "xop" "fma4" "fma" "f16c" "bmi" "bmi2" "rdrnd") + elseif(TARGET_ARCHITECTURE STREQUAL "steamroller") + list(APPEND _march_flag_list "bdver3") + list(APPEND _march_flag_list "bdver2") + list(APPEND _march_flag_list "bdver1") + list(APPEND _march_flag_list "bulldozer") + list(APPEND _march_flag_list "barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c" "bmi") + elseif(TARGET_ARCHITECTURE STREQUAL "piledriver") + list(APPEND _march_flag_list "bdver2") + list(APPEND _march_flag_list "bdver1") + list(APPEND _march_flag_list "bulldozer") + list(APPEND _march_flag_list "barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c" "bmi") + elseif(TARGET_ARCHITECTURE STREQUAL "interlagos") + list(APPEND _march_flag_list "bdver1") + list(APPEND _march_flag_list "bulldozer") + list(APPEND _march_flag_list 
"barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") + elseif(TARGET_ARCHITECTURE STREQUAL "bulldozer") + list(APPEND _march_flag_list "bdver1") + list(APPEND _march_flag_list "bulldozer") + list(APPEND _march_flag_list "barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") + elseif(TARGET_ARCHITECTURE STREQUAL "barcelona") + list(APPEND _march_flag_list "barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "sse4a") + elseif(TARGET_ARCHITECTURE STREQUAL "istanbul") + list(APPEND _march_flag_list "barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "sse4a") + elseif(TARGET_ARCHITECTURE STREQUAL "magny-cours") + list(APPEND _march_flag_list "barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "sse4a") + + # Others + elseif(TARGET_ARCHITECTURE STREQUAL "generic") + list(APPEND _march_flag_list "generic") + elseif(TARGET_ARCHITECTURE STREQUAL "none") + # add this clause to remove it from the else clause + + else() + message(FATAL_ERROR "[OFA] Unknown target architecture: \"${TARGET_ARCHITECTURE}\". 
Please set TARGET_ARCHITECTURE to a supported value.") + endif() + + list(SORT _available_extension_list) + list(REMOVE_DUPLICATES _available_extension_list) + + if(OFA_VERBOSE) + string(REPLACE ";" ", " _str "${_march_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] CPU architectures: " ${_str}) + string(REPLACE ";" ", " _str "${_available_extension_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (available): ${_str}") + endif() + + # Step 2: Apply compiler flags + if(NOT TARGET_ARCHITECTURE STREQUAL "none") + set(_check_extension_list) + set(_disable_extension_list) + set(_enable_extension_list) + + # Step 2.1: Enable/disable features based on compiler capabilities + file(READ ${CMAKE_SOURCE_DIR}/cmake/ofa/IntrinsicsX86.txt _intrinsics) + string(REPLACE ";" "|" _intrinsics "${_intrinsics}") + string(REPLACE "\n" ";" _intrinsics "${_intrinsics}") + foreach (_intrinsic ${_intrinsics}) + if ("${_intrinsic}" MATCHES "^#" ) # Skip comment + continue() + endif() + string(REPLACE "|" ";" _intrinsic "${_intrinsic}") + list(GET _intrinsic 0 _intrinsic_flags) + list(GET _intrinsic 1 _intrinsic_header) + list(GET _intrinsic 2 _intrinsic_name) + list(GET _intrinsic 3 _intrinsic_params) + + string(REPLACE "," ";" _intrinsic_flags "${_intrinsic_flags}") + list(GET _intrinsic_flags 0 _flag) + string(REPLACE ";" " -m" _intrinsic_flags "-m${_intrinsic_flags}") + list(APPEND _check_extension_list "${_flag}") + + # Check if include file is available + set(_resultVar "HAVE_${_intrinsic_header}") + string(REPLACE "." "_" _resultVar "${_resultVar}") + check_include_file_cxx( + ${_intrinsic_header} + ${_resultVar} + ${_intrinsic_flags}) + if(NOT ${_resultVar}) + set(_useVar "USE_${_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "." "_" _useVar "${_useVar}") + message(STATUS "[OFA] Disabling ${_useVar} because ${_intrinsic_header} is missing") + set(${_useVar} false CACHE BOOL "Use ${_flag} extension." 
FORCE) + mark_as_advanced(${_useVar}) + continue() + endif() + + # Check if compiler supports flag and can compile code + set(_resultVar "HAVE_${_flag}_${_intrinsic_name}") + string(REPLACE "." "_" _resultVar "${_resultVar}") + set(CMAKE_REQUIRED_FLAGS "${_intrinsic_flags}") + check_cxx_source_compiles( + "#include<${_intrinsic_header}> + int main() { + ${_intrinsic_name}(${_intrinsic_params}); + return 0; + }" + ${_resultVar}) + set(_useVar "USE_${_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "." "_" _useVar "${_useVar}") + if (NOT ${_resultVar}) + message(STATUS "[OFA] Disabling ${_useVar} because -m${_flag} is not supported by compiler and/or ${_intrinsic_name} intrinsics fails to compile") + set(${_useVar} false CACHE BOOL "Use ${_flag} extension." FORCE) + mark_as_advanced(${_useVar}) + else() + set(${_useVar} true CACHE BOOL "Use ${_flag} extension.") + mark_as_advanced(${_useVar}) + endif() + endforeach() + + # Generate lists of enabled/disabled flags + list(REMOVE_DUPLICATES _check_extension_list) + foreach(_flag ${_check_extension_list}) + _my_find(_available_extension_list "${_flag}" _found) + set(_useVar "USE_${_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "." 
"_" _useVar "${_useVar}") + if(_found AND ${_useVar}) + list(APPEND _enable_extension_list "${_flag}") + else() + list(APPEND _disable_extension_list "${_flag}") + endif() + endforeach() + + if(OFA_VERBOSE) + string(REPLACE ";" ", " _str "${_enable_extension_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (enabled): ${_str}") + string(REPLACE ";" ", " _str "${_disable_extension_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (disabled): ${_str}") + endif() + +if(FALSE) + set(_generic_broken false) + set(_amx_broken false) + set(_avxvnni_broken false) + + + if(DEFINED OFA_SSE_INTRINSICS_BROKEN AND OFA_SSE_INTRINSICS_BROKEN) + message(STATUS "SSE disabled because of old/broken toolchain") + set(_sse_broken true) + set(_avx_broken true) + set(_avx2_broken true) + set(_avx512_broken true) + set(_fma4_broken true) + set(_xop_broken true) + elseif(DEFINED OFA_AVX_INTRINSICS_BROKEN AND OFA_AVX_INTRINSICS_BROKEN) + message(STATUS "AVX disabled because of old/broken toolchain") + set(_sse_broken false) + set(_avx_broken true) + set(_avx2_broken true) + set(_avx512_broken true) + set(_fma4_broken true) + set(_xop_broken true) + else() + set(_sse_broken false) + set(_avx_broken false) + if(DEFINED OFA_FMA4_INTRINSICS_BROKEN AND OFA_FMA4_INTRINSICS_BROKEN) + message(STATUS "FMA4 disabled because of old/broken toolchain") + set(_fma4_broken true) + else() + set(_fma4_broken false) + endif() + if(DEFINED OFA_XOP_INTRINSICS_BROKEN AND OFA_XOP_INTRINSICS_BROKEN) + message(STATUS "XOP disabled because of old/broken toolchain") + set(_xop_broken true) + else() + set(_xop_broken false) + endif() + if(DEFINED OFA_AVX2_INTRINSICS_BROKEN AND OFA_AVX2_INTRINSICS_BROKEN) + message(STATUS "AVX2 disabled because of old/broken toolchain") + set(_avx2_broken true) + else() + set(_avx2_broken false) + endif() + if(DEFINED OFA_AVX512_INTRINSICS_BROKEN AND OFA_AVX512_INTRINSICS_BROKEN) + message(STATUS "AVX512 disabled because of old/broken 
toolchain") + set(_avx512_broken true) + else() + set(_avx512_broken false) + endif() + endif() + + # Enable/disable macro + macro(_enable_or_disable _name _flag _documentation _broken) + if(_broken) + set(_found false) + else() + _my_find(_available_extension_list "${_flag}" _found) + endif() + set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) + mark_as_advanced(USE_${_name}) + if(USE_${_name}) + list(APPEND _enable_extension_list "${_flag}") + else() + list(APPEND _disable_extension_list "${_flag}") + endif() + endmacro() + + # Enable/disable features + _enable_or_disable(AES "aes" "Use AES." _generic_broken) + _enable_or_disable(AMX-BF16 "amx-bf16" "Use AMX-BF16." _amx_broken) + _enable_or_disable(AMX-INT8 "amx-int8" "Use AMX-INT8." _amx_broken) + _enable_or_disable(AMX-TILE "amx-tile" "Use AMX-TILE." _amx_broken) + _enable_or_disable(AVX "avx" "Use AVX. This will all floating-point vector sizes relative to SSE." _avx_broken) + _enable_or_disable(AVX-VNNI "avx-vnni" "Use AVX-VNNI." _avxvnni_broken) + _enable_or_disable(AVX2 "avx2" "Use AVX2. This will double all of the vector sizes relative to SSE." _avx2_broken) + _enable_or_disable(AVX5124FMAPS "avx5124fmaps" "Use AVX5124FMAPS." _avx512_broken) + _enable_or_disable(AVX5124VNNIW "avx5124vnniw" "Use AVX5124VNNIW." _avx512_broken) + _enable_or_disable(AVX512BF16 "avx512bf16" "Use AVX512BF16." _avx512_broken) + _enable_or_disable(AVX512BITALG "avx512bitalg" "Use AVX512BITALG." _avx512_broken) + _enable_or_disable(AVX512BW "avx512bw" "Use AVX512BW." _avx512_broken) + _enable_or_disable(AVX512CD "avx512cd" "Use AVX512CD." _avx512_broken) + _enable_or_disable(AVX512DQ "avx512dq" "Use AVX512DQ." _avx512_broken) + _enable_or_disable(AVX512ER "avx512er" "Use AVX512ER. This enables exponential and reciprocal instructions." _avx512_broken) + _enable_or_disable(AVX512F "avx512f" "Use AVX512F. This will double all floating-point vector sizes relative to AVX2." 
_avx512_broken) + _enable_or_disable(AVX512IFMA "avx512ifma" "Use AVX512IFMA." _avx512_broken) + _enable_or_disable(AVX512PF "avx512pf" "Use AVX512PF. This enables prefetch instructions for gathers and scatters." _avx512_broken) + _enable_or_disable(AVX512VBMI "avx512vbmi" "Use AVX512VBMI." _avx512_broken) + _enable_or_disable(AVX512VBMI2 "avx512vbmi2" "Use AVX512VBMI2." _avx512_broken) + _enable_or_disable(AVX512VL "avx512vl" "Use AVX512VL. This enables 128- and 256-bit vector length instructions with EVEX coding (improved write-masking & more vector registers)." _avx2_broken) + _enable_or_disable(AVX512VNNI "avx512vnni" "Use AVX512VNNI." _avx512_broken) + _enable_or_disable(AVX512VP2INTERSECT "avx512vp2intersect" "Use AVX512VP2INTERSECT." _avx512_broken) + _enable_or_disable(AVX512VPOPCNTDQ "avx512vpopcntdq " "Use AVX512VPOPCNTDQ ." _avx512_broken) + _enable_or_disable(AVX512VPOPCNTDQ "avx512vpopcntdq" "Use AVX512VPOPCNTDQ." _avx512_broken) + _enable_or_disable(BMI "bmi2" "Use BMI." _avx_broken) + _enable_or_disable(BMI2 "bmi2" "Use BMI2." _avx_broken) + _enable_or_disable(CLDEMOTE "cldemote" "Use CLDEMOTE." _generic_broken) + _enable_or_disable(CLFLUSHOPT "clflushopt" "Use CLFLUSHOPT." _generic_broken) + _enable_or_disable(CLWB "clwb" "Use CLWB." _generic_broken) + _enable_or_disable(ENQCMD "enqcmd" "Use ENQCMD." _generic_broken) + _enable_or_disable(F16C "f16c" "Use F16C." _xop_broken) + _enable_or_disable(FMA "fma" "Use FMA." _avx_broken) + _enable_or_disable(FMA4 "fma4" "Use FMA4." _fma4_broken) + _enable_or_disable(FSGSBASE "fsgsbase" "Use FSGSBASE." _generic_broken) + _enable_or_disable(GFNI "gfni" "Use GFNI." _avx512_broken) + _enable_or_disable(HRESET "hreset" "Use ." _avx512_broken) + _enable_or_disable(LZCNT "lzcnt" "Use LZCNT." _sse_broken) + _enable_or_disable(MOVDIR64B "movdir64b" "Use MOVDIR64B." _generic_broken) + _enable_or_disable(MOVDIRI "movdiri" "Use MOVDIRI." _generic_broken) + _enable_or_disable(PCLMUL "pclmul" "Use PCLMUL." 
_generic_broken) + _enable_or_disable(PCONFIG "pconfig" "Use PCONFIG." _generic_broken) + _enable_or_disable(PKU "pku" "Use PKU." _generic_broken) + _enable_or_disable(POPCNT "popcnt" "Use POPCNT." _sse_broken) + _enable_or_disable(PREFETCHWT1 "prefetchwt1" "Use PREFETCHWT1." _generic_broken) + _enable_or_disable(PRFCHW "prfchw" "Use PRFCHW." _generic_broken) + _enable_or_disable(PTWRITE "ptwrite" "Use PTWRITE." _generic_broken) + _enable_or_disable(RDPID "rdpid " "Use RDPID ." _generic_broken) + _enable_or_disable(RDRND "rdrnd" "Use RDRND." _generic_broken) + _enable_or_disable(RDSEED "rdseed" "Use RDSEED." _generic_broken) + _enable_or_disable(SERIALIZE "serialize" "Use SERIALIZE." _generic_broken) + _enable_or_disable(SGX "sgx" "Use SGX." _generic_broken) + _enable_or_disable(SHA "sha" "Use SHA." _generic_broken) + _enable_or_disable(SSE2 "sse2" "Use SSE2. If SSE2 instructions are not enabled the SSE implementation will be disabled." _sse_broken) + _enable_or_disable(SSE3 "sse3" "Use SSE3. If SSE3 instructions are not enabled they will be emulated." _sse_broken) + _enable_or_disable(SSE4_1 "sse4.1" "Use SSE4.1. If SSE4.1 instructions are not enabled they will be emulated." _sse_broken) + _enable_or_disable(SSE4_2 "sse4.2" "Use SSE4.2. If SSE4.2 instructions are not enabled they will be emulated." _sse_broken) + _enable_or_disable(SSE4a "sse4a" "Use SSE4a. If SSE4a instructions are not enabled they will be emulated." _sse_broken) + _enable_or_disable(SSSE3 "ssse3" "Use SSSE3. If SSSE3 instructions are not enabled they will be emulated." _sse_broken) + _enable_or_disable(TSXLDTRK "tsxldtrk" "Use TSXLDTRK." _generic_broken) + _enable_or_disable(VAES "vaes" "Use VAES." _avx512_broken) + _enable_or_disable(VPCLMULQDQ "vpclmulqdq" "Use VPCLMULQDQ." _avx512_broken) + _enable_or_disable(WAITPKG "waitpkg" "Use WAITPKG." _generic_broken) + _enable_or_disable(WBNOINVD "wbnoinvd" "Use WBNOINVD." _generic_broken) + _enable_or_disable(XOP "xop" "Use XOP." 
_xop_broken) + _enable_or_disable(XSAVE "xsave" "Use XSAVE." _generic_broken) + _enable_or_disable(XSAVEC "xsavec" "Use XSAVEC." _generic_broken) + _enable_or_disable(XSAVEOPT "xsaveopt" "Use XSAVEOPT." _generic_broken) + _enable_or_disable(XSAVES "xsaves" "Use XSAVES." _generic_broken) +endif(FALSE) + + # Add compiler flags + if(MSVC AND MSVC_VERSION GREATER 1700) + _my_find(_enable_extension_list "avx512f" _found) + if(_found) + AddCompilerFlag("/arch:AVX512" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() + if(NOT _found) + _my_find(_enable_extension_list "avx2" _found) + if(_found) + AddCompilerFlag("/arch:AVX2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() + endif() + if(NOT _found) + _my_find(_enable_extension_list "avx" _found) + if(_found) + AddCompilerFlag("/arch:AVX" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() + endif() + if(NOT _found) + _my_find(_enable_extension_list "sse2" _found) + if(_found) + AddCompilerFlag("/arch:SSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + endif() + if(NOT _found) + _my_find(_enable_extension_list "sse" _found) + if(_found) + AddCompilerFlag("/arch:SSE" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + endif() + foreach(_flag ${_enable_extension_list}) + string(TOUPPER "${_flag}" _flag) + string(REPLACE "." 
"_" _flag "__${_flag}__") + add_definitions("-D${_flag}") + endforeach(_flag) + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel" + OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") + if(WIN32) + # Intel (on Windows) + set(OFA_map_knl "-QxKNL;-QxMIC-AVX512") + set(OFA_map_knm "-QxKNM;-QxMIC-AVX512") + set(OFA_map_rocketlake "-QxROCKETLAKE;-QxCORE-AVX512") + set(OFA_map_sapphirerapids "-QxSAPPHIRERAPIDS;-QxCORE-AVX512") + set(OFA_map_alderlake "-QxALDERLAKE;-QxCORE-AVX512") + set(OFA_map_tigerlake "-QxTIGERLAKE;-QxCORE-AVX512") + set(OFA_map_icelake-server "-QxICELAKE-SERVER;-QxCORE-AVX512") + set(OFA_map_icelake-avx512 "-QxICELAKE-SERVER;-QxCORE-AVX512") + set(OFA_map_icelake-client "-QxICELAKE-CLIENT;-QxCORE-AVX512") + set(OFA_map_icelake "-QxICELAKE-CLIENT;-QxCORE-AVX512") + set(OFA_map_cannonlake "-QxCANNONLAKE;-QxCORE-AVX512") + set(OFA_map_cooperlake "-QxCOOPERLAKE;-QxCORE-AVX512") + set(OFA_map_cascadelake "-QxCASCADELAKE;-QxCORE-AVX512") + set(OFA_map_skylake-avx512 "-QxSKYLAKE-AVX512;-QxCORE-AVX512") + set(OFA_map_skylake "-QxSKYLAKE;-QxCORE-AVX2") + set(OFA_map_broadwell "-QxBROADWELL;-QxCORE-AVX2") + set(OFA_map_haswell "-QxHASWELL;-QxCORE-AVX2") + set(OFA_map_ivybridge "-QxIVYBRIDGE;-QxCORE-AVX-I") + set(OFA_map_sandybridge "-QxSANDYBRIDGE;-QxAVX") + set(OFA_map_westmere "-QxSSE4.2") + set(OFA_map_nehalem "-QxSSE4.2") + set(OFA_map_penryn "-QxSSSE3") + set(OFA_map_merom "-QxSSSE3") + set(OFA_map_core2 "-QxSSE3") + set(_ok FALSE) + else() + # Intel (in Linux) + set(OFA_map_knl "-xKNL;-xMIC-AVX512") + set(OFA_map_knm "-xKNM;-xMIC-AVX512") + set(OFA_map_rocketlake "-xROCKETLAKE;-xCORE-AVX512") + set(OFA_map_sapphirerapids "-xSAPPHIRERAPIDS;-xCORE-AVX512") + set(OFA_map_alderlake "-xALDERLAKE;-xCORE-AVX512") + set(OFA_map_tigerlake "-xTIGERLAKE;-xCORE-AVX512") + set(OFA_map_icelake-server "-xICELAKE-SERVER;-xCORE-AVX512") + set(OFA_map_icelake-avx512 "-xICELAKE-SERVER;-xCORE-AVX512") + set(OFA_map_icelake-client "-xICELAKE-CLIENT;-xCORE-AVX512") + 
set(OFA_map_icelake "-xICELAKE-CLIENT;-xCORE-AVX512") + set(OFA_map_cannonlake "-xCANNONLAKE;-xCORE-AVX512") + set(OFA_map_cooperlake "-xCOOPERLAKE;-xCORE-AVX512") + set(OFA_map_cascadelake "-xCASCADELAKE;-xCORE-AVX512") + set(OFA_map_skylake-avx512 "-xSKYLAKE-AVX512;-xCORE-AVX512") + set(OFA_map_skylake "-xSKYLAKE;-xCORE-AVX2") + set(OFA_map_broadwell "-xBROADWELL;-xCORE-AVX2") + set(OFA_map_haswell "-xHASWELL;-xCORE-AVX2") + set(OFA_map_ivybridge "-xIVYBRIDGE;-xCORE-AVX-I") + set(OFA_map_sandybridge "-xSANDYBRIDGE;-xAVX") + set(OFA_map_westmere "-xSSE4.2") + set(OFA_map_nehalem "-xSSE4.2") + set(OFA_map_penryn "-xSSSE3") + set(OFA_map_merom "-xSSSE3") + set(OFA_map_core2 "-xSSE3") + set(_ok FALSE) + endif() + + message(${_march_flag_list}) + foreach(_arch ${_march_flag_list}) + message("arch ${_arch}") + if(DEFINED OFA_map_${_arch}) + message("${OFA_map_${_arch}}") + foreach(_flag ${OFA_map_${_arch}}) + message("flag ${_flag}") + AddCompilerFlag(${_flag} CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _ok) + if(_ok) + message("OKAY") + break() + endif() + endforeach() + if(_ok) + break() + endif() + endif() + endforeach() + if(NOT _ok) + # This is the Intel compiler, so SSE2 is a very reasonable baseline. + message(STATUS "[OFA] Did not recognize the requested architecture flag ${arch}, falling back to SSE2") + if(WIN32) + AddCompilerFlag("-QxSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + else() + AddCompilerFlag("-xSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + endif() + + # TODO PGI/Cray/SunPro ... 
+ + else() + # Others: GNU, Clang and variants + + # Set -march flag + foreach(_flag ${_march_flag_list}) + AddCompilerFlag("-march=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + if(_good) + break() + endif(_good) + endforeach(_flag) + + # Set -m flag for enabled features + foreach(_flag ${_enable_extension_list}) + AddCompilerFlag("-m${_flag}" CXX_RESULT _result) + if(_result) + set(_header FALSE) + if(_flag STREQUAL "sse3") + set(_header "pmmintrin.h") + elseif(_flag STREQUAL "ssse3") + set(_header "tmmintrin.h") + elseif(_flag STREQUAL "sse4.1") + set(_header "smmintrin.h") + elseif(_flag STREQUAL "sse4.2") + set(_header "nmmintrin.h") + elseif(_flag STREQUAL "sse4a") + set(_header "ammintrin.h") + elseif(_flag STREQUAL "avx") + set(_header "immintrin.h") + elseif(_flag STREQUAL "avx2") + set(_header "immintrin.h") + elseif(_flag STREQUAL "avx512*") + set(_header "immintrin.h") + elseif(_flag STREQUAL "fma4") + set(_header "x86intrin.h") + elseif(_flag STREQUAL "xop") + set(_header "x86intrin.h") + elseif(_flag STREQUAL "bmi") + set(_header "ammintrin.h") + elseif(_flag STREQUAL "bmi2") + set(_header "ammintrin.h") + elseif(_flag STREQUAL "rdrnd") + set(_header "immintrin.h") + elseif(_flag STREQUAL "rdpid") + set(_header "immintrin.h") + elseif(_flag STREQUAL "rdseed") + set(_header "immintrin.h") + endif() + set(_resultVar "HAVE_${_header}") + string(REPLACE "." "_" _resultVar "${_resultVar}") + if(_header) + CHECK_INCLUDE_FILE_CXX("${_header}" ${_resultVar} "-m${_flag}") + if(NOT ${_resultVar}) + set(_useVar "USE_${_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "." 
"_" _useVar "${_useVar}") + message(STATUS "disabling ${_useVar} because ${_header} is missing") + set(${_useVar} FALSE) + list(APPEND _disable_extension_list "${_flag}") + endif() + endif() + if(NOT _header OR ${_resultVar}) + list(APPEND OFA_ARCHITECTURE_FLAGS "-m${_flag}") + endif() + endif() + endforeach(_flag) + + # Set -mno-feature flag for disabled features + foreach(_flag ${_disable_extension_list}) + AddCompilerFlag("-mno-${_flag}" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endforeach(_flag) + endif() + endif() + + # Compile code with profiling instrumentation + if(TARGET_PROFILER STREQUAL "gprof") + AddCompilerFlag("-pg" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + elseif(TARGET_PROFILER STREQUAL "vtune") + if (CMAKE_CXX_COMPILER_ID MATCHES "Intel") + # Need to check if this also works on Windows + AddCompilerFlag("-g" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-debug inline-debug-info" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-D TBB_USE_THREADING_TOOLS" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-parallel-source-info=2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-gline-tables-only" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-fdebug-info-for-profiling" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCompilerFlag("-Xsprofile" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + endif() + + if(OFA_VERBOSE) + string(REPLACE ";" ", " _str "${OFA_ARCHITECTURE_FLAGS}") + message(STATUS "OFA_ARCHITECTURE_FLAGS: " ${_str}) + endif() + +endmacro(OFA_HandleX86Options) diff --git a/cmake/ofa/IntrinsicsX86.txt b/cmake/ofa/IntrinsicsX86.txt new file mode 100644 index 0000000000..fa3ee90d0b --- /dev/null +++ b/cmake/ofa/IntrinsicsX86.txt @@ -0,0 +1,92 @@ +# List of x86 intrisics to check +# Format: [,];
;; +# line starting with # are comments + +# MMX +mmx;mmintrin.h;_mm_add_pi16;__m64(),__m64() + +# SSE/SSE2/SSE3/SSE4.1/SSE4.2/SSE4A/FMA +sse;xmmintrin.h;_mm_add_ps;__m128(),__m128() +sse2;emmintrin.h;_mm_add_epi16;__m128i(),__m128i() +sse3;pmmintrin.h;_mm_addsub_pd;__m128d(),__m128d() +ssse3;tmmintrin.h;_mm_hadd_epi16;__m128i(),__m128i() +sse4.1;smmintrin.h;_mm_max_epi32;__m128i(),__m128i() +sse4.2;nmmintrin.h;_mm_cmpgt_epi64;__m128i(),__m128i() +sse4a;ammintrin.h;_mm_extract_si64;__m128i(),__m128i() +avx;immintrin.h;_mm256_add_pd;__m256d(),__m256d() +avx2;immintrin.h;_mm256_hadd_epi16;__m256i(),__m256i() +fma;immintrin.h;_mm_fmadd_pd;__m128d(),__m128d(),__m128d() + +# AVX-VNNI +avxvnni;immintrin.h;_mm_dpbusd_avx_epi32;__m128i(),__m128i(),__m128i() + +# AVX-512 +avx512f;immintrin.h;_mm512_abs_epi32;__m512i() +avx512bw;immintrin.h;_mm512_abs_epi16;__m512i() +avx512cd;immintrin.h;_mm512_broadcastmb_epi64;__mmask8() +avx512dq;immintrin.h;_mm512_and_pd;__m512d(),__m512d() +avx512er;immintrin.h;_mm512_exp2a23_pd;__m512d() +avx512ifma;immintrin.h;_mm512_maskz_madd52hi_epu64;__mmask8(),__m512i(),__m512i(),__m512i() +avx512pf;immintrin.h;_mm512_prefetch_i32scatter_pd;NULL,__m256i(),(int)1,_MM_HINT_T0 +avx512vl,avx512f;immintrin.h;_mm_abs_epi64;__m128i() +avx512vpopcntdq,avx512vl;immintrin.h;_mm_popcnt_epi64;__m128i() +avx5124fmaps;immintrin.h;_mm_4fmadd_ss;__m128(),__m128(),__m128(),__m128(),__m128(),new __m128[1] +avx5124vnniw;immintrin.h;_mm512_4dpwssd_epi32;__m512i(),__m512i(),__m512i(),__m512i(),__m512i(),new __m128i[1] +avx512bf16,avx512vl;immintrin.h;_mm_cvtne2ps_pbh;__m128(),__m128() +avx512bitalg,avx512vl;immintrin.h;_mm_popcnt_epi16;__m128i() +avx512vbmi;immintrin.h;_mm512_permutex2var_epi8;__m512i(),__m512i(),__m512i() +avx512vbmi2,avx512vl;immintrin.h;_mm_mask_compress_epi16;__m128i(),__mmask8(),__m128i() +avx512vnni,avx512vl;immintrin.h;_mm_dpbusd_epi32;__m128i(),__m128i(),__m128i() 
+avx512vp2intersect,avx512vl;immintrin.h;_mm_2intersect_epi32;__m128i(),__m128i(),new __mmask8[1],new __mmask8[1] +avx512fp16,avx512vl;immintrin.h;_mm_add_ph;__m128h(),__m128h() + +# AMX +amx-bf16;immintrin.h;_tile_dpbf16ps;__tile(),__tile(),__tile() +amx-int8;immintrin.h;_tile_dpbssd;__tile(),__tile(),__tile() +amx-tile;immintrin.h;_tile_zero;__tile() + +# Other +adx;immintrin.h;_addcarryx_u32;(unsigned char)0,(unsigned int)1,(unsigned int)1,new unsigned int[1] +aes;wmmintrin.h;_mm_aesdec_si128;__m128i(),__m128i() +bmi;immintrin.h;_andn_u32;(unsigned int)1,(unsigned int)1 +bmi2;immintrin.h;_bzhi_u32;(unsigned int)1,(unsigned int)1 +cldemote;immintrin.h;_mm_cldemote;(void const*)NULL +clflushopt;immintrin.h;_mm_clflushopt;(void const*)NULL +clwb;immintrin.h;_mm_clwb;(void const*)NULL +enqcmd;immintrin.h;_enqcmd;(void*)NULL,(void const*)NULL +fp16c;immintrin.h;_mm_cvtph_ps;__m128i() +fsgsbase;immintrin.h;_readfsbase_u32; +fxsr;immintrin.h;_fxrstor;(void*)NULL +gfni,avx512vl;immintrin.h;_mm_gf2p8mul_epi8;__m128i(),__m128i() +hreset;immintrin.h;_hreset;1 +invpcid;immintrin.h;_invpcid;(unsigned int)1,(void*)NULL +keylocker;immintrin.h;_mm_aesdec128kl_u8;new __m128i[1],__m128i(),(const void*)NULL +keylocker_wide;immintrin.h;_mm_aesdecwide128kl_u8;new __m128i[1],(const __m128i*)new __m128i[1], (const void*)NULL +lzcnt;immintrin.h;_lzcnt_u32;(unsigned int)1 +monitor;pmmintrin.h;_mm_monitor;(void const*)NULL,(unsigned)1,(unsigned)1 +movbe;immintrin.h;_loadbe_i16;(void const*)NULL +movdir64b;immintrin.h;_movdir64b;(void*)NULL,(const void*)NULL +movdiri;immintrin.h;_directstoreu_u32;(void*)NULL,(unsigned int)1 +mpx;immintrin.h;_bnd_chk_ptr_lbounds;(const void*)NULL +pclmulqdq;wmmintrin.h;_mm_clmulepi64_si128;__m128i(),__m128i(),(const int)0 +pconfig;immintrin.h;_pconfig_u32;(const int)1,new size_t[1] +popcnt;immintrin.h;_popcnt32;(int)1 +prefetchwt1;xmmintrin.h;_mm_prefetch;(char const*)NULL,(int)1 +rdpid;immintrin.h;_rdpid_u32; +rdrand;immintrin.h;_rdrand16_step;(unsigned
short*)new unsigned short[1] +rdseed;immintrin.h;_rdseed16_step;(unsigned short*)new unsigned short[1] +rdtscp;immintrin.h;__rdtscp;(unsigned int*)NULL +rtm;immintrin.h;_xend; +serialize;immintrin.h;_serialize; +sha;immintrin.h;_mm_sha1msg1_epu32;__m128i(),__m128i() +tsc;immintrin.h;_rdtsc; +tsxldtrk;immintrin.h;_xresldtrk; +uintr;immintrin.h;_clui; +vaes,avx512vl;immintrin.h;_mm256_aesdec_epi128;__m256i(),__m256i() +vpclmulqdq,avx512vl;immintrin.h;_mm256_clmulepi64_epi128;__m256i(),__m256i(),(const int)1 +waitpkg;immintrin.h;_umonitor;(void*)NULL +wbnoinvd;immintrin.h;_wbnoinvd; +xsave;immintrin.h;_xgetbv;(unsigned int)1 +xsavec,xsave;immintrin.h;_xsavec;(void*)NULL,(unsigned __int64)0 +xsaveopt,xsave;immintrin.h;_xsaveopt;(void*)NULL,(unsigned __int64)0 +xss,xsave;immintrin.h;_xrstors;(const void*)NULL,(unsigned __int64)0 From b3a476f2b24904e05508b0d7f799c6dcd271ee86 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Mon, 17 Jan 2022 22:13:37 +0100 Subject: [PATCH 140/174] Update OFA --- cmake/OptimizeForArchitecture.cmake | 16 +- cmake/ofa/CheckCXXCompilerFlag.cmake | 2 +- cmake/ofa/HandleX86Options.cmake | 1249 ++++++++++++-------------- 3 files changed, 564 insertions(+), 703 deletions(-) diff --git a/cmake/OptimizeForArchitecture.cmake b/cmake/OptimizeForArchitecture.cmake index a6bb8e9460..0196db4fad 100644 --- a/cmake/OptimizeForArchitecture.cmake +++ b/cmake/OptimizeForArchitecture.cmake @@ -61,14 +61,6 @@ # - Added Support for ARM (Clang, GCC, ARM Clang, Cray, Fujitsu) #============================================================================= -include(ofa/AutodetectX86) -include(ofa/AutodetectArm) -include(ofa/AutodetectPpc) - -include(ofa/HandleX86Options) -include(ofa/HandleArmOptions) -include(ofa/HandlePpcOptions) - macro(_my_find _list _value _ret) list(FIND ${_list} "${_value}" _found) if(_found EQUAL -1) @@ -82,6 +74,10 @@ endmacro(_my_find) # Autodetection of CPU #=============================================================================
+include(ofa/AutodetectX86) +include(ofa/AutodetectArm) +include(ofa/AutodetectPpc) + macro(OFA_AutodetectHostArchitecture) set(TARGET_ARCHITECTURE "generic") set(OFA_ARCHITECTURE_FLAGS) @@ -101,6 +97,10 @@ endmacro(OFA_AutodetectHostArchitecture) # Handling of CPU options #============================================================================= +include(ofa/HandleX86Options) +include(ofa/HandleArmOptions) +include(ofa/HandlePpcOptions) + macro(OptimizeForArchitecture) message(STATUS "Optimizing for target architecture") if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "i686.*|i386.*|x86.*|amd64.*|x86_64.*|AMD64.*") diff --git a/cmake/ofa/CheckCXXCompilerFlag.cmake b/cmake/ofa/CheckCXXCompilerFlag.cmake index e3b0188a44..5628e984f0 100644 --- a/cmake/ofa/CheckCXXCompilerFlag.cmake +++ b/cmake/ofa/CheckCXXCompilerFlag.cmake @@ -47,7 +47,7 @@ MACRO (CHECK_CXX_COMPILER_FLAG _FLAG _RESULT) if(${ARGC} GREATER 2) SET(TEST_SOURCE "${ARGV2}") else() - SET(TEST_SOURCE "int main() { return 0;}") + SET(TEST_SOURCE "int main() { return 0; }") endif() CHECK_CXX_SOURCE_COMPILES("${TEST_SOURCE}" ${_RESULT} # Some compilers do not fail with a bad flag diff --git a/cmake/ofa/HandleX86Options.cmake b/cmake/ofa/HandleX86Options.cmake index 2a6ece54f1..02892987a7 100644 --- a/cmake/ofa/HandleX86Options.cmake +++ b/cmake/ofa/HandleX86Options.cmake @@ -48,737 +48,596 @@ macro(OFA_HandleX86Options) AddCompilerFlag("-march=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) endif() endif() - return() - endif() - - # Step 1: Generate a list of compiler flags for the specific CPU - set(_march_flag_list) - set(_available_extension_list) - - # Define macros for Intel - macro(_nehalem) - list(APPEND _march_flag_list "nehalem") - list(APPEND _march_flag_list "corei7") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "popcnt") - endmacro() - macro(_westmere) - list(APPEND _march_flag_list "westmere") - _nehalem() - list(APPEND 
_available_extension_list "aes" "pclmul") - endmacro() - macro(_sandybridge) - list(APPEND _march_flag_list "sandybridge") - list(APPEND _march_flag_list "corei7-avx") - _westmere() - list(APPEND _available_extension_list "avx") - endmacro() - macro(_ivybridge) - list(APPEND _march_flag_list "ivybridge") - list(APPEND _march_flag_list "core-avx-i") - _sandybridge() - list(APPEND _available_extension_list "rdrnd" "f16c" "fsgsbase") - endmacro() - macro(_haswell) - list(APPEND _march_flag_list "haswell") - list(APPEND _march_flag_list "core-avx2") - _ivybridge() - list(APPEND _available_extension_list "avx2" "fma" "bmi" "bmi2") - endmacro() - macro(_broadwell) - list(APPEND _march_flag_list "broadwell") - _haswell() - list(APPEND _available_extension_list "rdseed" "adcx" "prfchw") - endmacro() - macro(_skylake) - list(APPEND _march_flag_list "skylake") - _broadwell() - list(APPEND _available_extension_list "clflushopt" "xsavec" "xsaves") - endmacro() - macro(_skylake_avx512) - list(APPEND _march_flag_list "skylake-avx512") - _skylake() - list(APPEND _available_extension_list "avx512bw" "avx512cd" "avx512dq" "avx512f" "avx512vl" "clwb" "pku") - endmacro() - macro(_cascadelake) - list(APPEND _march_flag_list "cascadelake") - _skylake_avx512() - list(APPEND _available_extension_list "avx512vnni") - endmacro() - macro(_cooperlake) - list(APPEND _march_flag_list "cooperlake") - _cascadelake() - list(APPEND _available_extension_list "avx512bf16") - endmacro() - macro(_cannonlake) - list(APPEND _march_flag_list "cannonlake") - _skylake() - list(APPEND _available_extension_list "avx512bw" "avx512cd" "avx512dq" "avx512f" "avx512vl" "clwb" "pku" "avx512ifma" "avx512vbmi" "sha" "umip") - endmacro() - macro(_icelake) - list(APPEND _march_flag_list "icelake-client") - _cannonlake() - list(APPEND _available_extension_list "avx512bitalg" "avx512vbmi2" "avx512vnni" "avx512vpopcntdq" "clwb" "gfni" "rdpid" "vaes" "vpclmulqdq") - endmacro() - macro(_icelake_avx512) - list(APPEND 
_march_flag_list "icelake-server") - _icelake() - list(APPEND _available_extension_list "pconfig" "wbnoinvd") - endmacro() - macro(_tigerlake) - list(APPEND _march_flag_list "tigerlake") - _icelake() - list(APPEND _available_extension_list "avx512vp2intersect" "keylocker" "movdir64b" "movdiri" "pconfig" "wbnoinvd") - endmacro() - macro(_alderlake) - list(APPEND _march_flag_list "alderlake") - _broadwell() - list(APPEND _available_extension_list "avxvnni" "cldemote" "clwb" "gfni" "hreset" "kl" "lzcnt" "movdir64b" "movdiri" "pconfig" "pku" "ptwrite" "rdpid" "serialize" "sgx" "umip" "vaes" "vpclmulqdq" "waitpkg" "widekl" "xsave" "xsavec" "xsaveopt" "xsaves") - endmacro() - macro(_sapphirerapids) - list(APPEND _march_flag_list "sapphirerapids") - _skylake_avx512() - list(APPEND _available_extension_list "amx-bf16" "amx-int8" "amx-tile" "avxvnni" "avx512bf16" "avx512vnni" "avx512vp2intersect" "cldemote" "enqcmd" "movdir64b" "movdiri" "ptwrite" "serialize" "tsxldtrk" "uintr" "waitpkg") - endmacro() - macro(_rocketlake) - list(APPEND _march_flag_list "rocketlake") - _skylake_avx512() - list(APPEND _available_extension_list "avx512bitalg" "avx512ifma" "avx512vbmi" "avx512vbmi2" "avx512vnni" "avx512vpopcntdq" "gfni" "rdpid" "sha" "umip" "vaes" "vpclmulqdq") - endmacro() - macro(_knightslanding) - list(APPEND _march_flag_list "knl") - _broadwell() - list(APPEND _available_extension_list "avx512f" "avx512pf" "avx512er" "avx512cd") - endmacro() - macro(_knightsmill) - list(APPEND _march_flag_list "knm") - _broadwell() - list(APPEND _available_extension_list "avx512f" "avx512pf" "avx512er" "avx512cd" "avx5124fmaps" "avx5124vnni" "avx512vpopcntdq") - endmacro() - macro(_silvermont) - list(APPEND _march_flag_list "silvermont") - _westmere() - list(APPEND _available_extension_list "rdrnd") - endmacro() - macro(_goldmont) - list(APPEND _march_flag_list "goldmont") - _silvermont() - list(APPEND _available_extension_list "rdseed") - endmacro() - macro(_goldmont_plus) - list(APPEND 
_march_flag_list "goldmont-plus") - _goldmont() - list(APPEND _available_extension_list "rdpid") - endmacro() - macro(_tremont) - list(APPEND _march_flag_list "tremont") - _goldmont_plus() - endmacro() - - # TODO: Define similar macros for AMD - - # Intel - if(TARGET_ARCHITECTURE STREQUAL "core") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3") - elseif(TARGET_ARCHITECTURE STREQUAL "merom") - list(APPEND _march_flag_list "merom") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3") - elseif(TARGET_ARCHITECTURE STREQUAL "penryn") - list(APPEND _march_flag_list "penryn") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3") - message(STATUS "[OFA] Sadly the Penryn architecture exists in variants with SSE4.1 and without SSE4.1.") - if(_cpu_flags MATCHES "sse4_1") - message(STATUS "[OFA] SSE4.1: enabled (auto-detected from this computer's CPU flags)") - list(APPEND _available_extension_list "sse4.1") - else() - message(STATUS "[OFA] SSE4.1: disabled (auto-detected from this computer's CPU flags)") - endif() - elseif(TARGET_ARCHITECTURE STREQUAL "knm") - _knightsmill() - elseif(TARGET_ARCHITECTURE STREQUAL "knl") - _knightslanding() - elseif(TARGET_ARCHITECTURE STREQUAL "rocketlake") - _rocketlake() - elseif(TARGET_ARCHITECTURE STREQUAL "sapphirerapids") - _sapphirerapids() - elseif(TARGET_ARCHITECTURE STREQUAL "alderlake") - _alderlake() - elseif(TARGET_ARCHITECTURE STREQUAL "tigerlake") - _tigerlake() - elseif(TARGET_ARCHITECTURE STREQUAL "icelake") - _icelake() - elseif(TARGET_ARCHITECTURE STREQUAL "icelake-xeon" OR TARGET_ARCHITECTURE STREQUAL "icelake-avx512") - _icelake_avx512() - elseif(TARGET_ARCHITECTURE STREQUAL "cannonlake") - _cannonlake() - elseif(TARGET_ARCHITECTURE STREQUAL "cooperlake") - _cooperlake() - elseif(TARGET_ARCHITECTURE STREQUAL "cascadelake") - _cascadelake() - 
elseif(TARGET_ARCHITECTURE STREQUAL "kabylake") - _skylake() - elseif(TARGET_ARCHITECTURE STREQUAL "skylake-xeon" OR TARGET_ARCHITECTURE STREQUAL "skylake-avx512") - _skylake_avx512() - elseif(TARGET_ARCHITECTURE STREQUAL "skylake") - _skylake() - elseif(TARGET_ARCHITECTURE STREQUAL "broadwell") - _broadwell() - elseif(TARGET_ARCHITECTURE STREQUAL "haswell") - _haswell() - elseif(TARGET_ARCHITECTURE STREQUAL "ivybridge") - _ivybridge() - elseif(TARGET_ARCHITECTURE STREQUAL "sandybridge") - _sandybridge() - elseif(TARGET_ARCHITECTURE STREQUAL "westmere") - _westmere() - elseif(TARGET_ARCHITECTURE STREQUAL "nehalem") - _nehalem() - elseif(TARGET_ARCHITECTURE STREQUAL "tremont") - _tremont() - elseif(TARGET_ARCHITECTURE STREQUAL "goldmont-plus") - _goldmont_plus() - elseif(TARGET_ARCHITECTURE STREQUAL "goldmont") - _goldmont() - elseif(TARGET_ARCHITECTURE STREQUAL "silvermont") - _silvermont() - elseif(TARGET_ARCHITECTURE STREQUAL "bonnell") - list(APPEND _march_flag_list "bonnell") - list(APPEND _march_flag_list "atom") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3") - elseif(TARGET_ARCHITECTURE STREQUAL "atom") - list(APPEND _march_flag_list "atom") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3") - - # AMD - elseif(TARGET_ARCHITECTURE STREQUAL "k8") - list(APPEND _march_flag_list "k8") - list(APPEND _available_extension_list "sse" "sse2") - elseif(TARGET_ARCHITECTURE STREQUAL "k8-sse3") - list(APPEND _march_flag_list "k8-sse3") - list(APPEND _march_flag_list "k8") - list(APPEND _available_extension_list "sse" "sse2" "sse3") - elseif(TARGET_ARCHITECTURE STREQUAL "amd16h") - list(APPEND _march_flag_list "btver2") - list(APPEND _march_flag_list "btver1") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "f16c") - elseif(TARGET_ARCHITECTURE STREQUAL "amd14h") - list(APPEND _march_flag_list "btver1") - 
list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "zen3") - list(APPEND _march_flag_list "znver2") - list(APPEND _march_flag_list "znver1") - _skylake() - list(APPEND _available_extension_list "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "zen2") - list(APPEND _march_flag_list "znver2") - list(APPEND _march_flag_list "znver1") - _skylake() - list(APPEND _available_extension_list "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "zen") - list(APPEND _march_flag_list "znver1") - _skylake() - list(APPEND _available_extension_list "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "excavator") - list(APPEND _march_flag_list "bdver4") - list(APPEND _march_flag_list "bdver3") - list(APPEND _march_flag_list "bdver2") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "avx2" "xop" "fma4" "fma" "f16c" "bmi" "bmi2" "rdrnd") - elseif(TARGET_ARCHITECTURE STREQUAL "steamroller") - list(APPEND _march_flag_list "bdver3") - list(APPEND _march_flag_list "bdver2") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c" "bmi") - elseif(TARGET_ARCHITECTURE STREQUAL "piledriver") - list(APPEND _march_flag_list "bdver2") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c" "bmi") - elseif(TARGET_ARCHITECTURE STREQUAL "interlagos") - list(APPEND 
_march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") - elseif(TARGET_ARCHITECTURE STREQUAL "bulldozer") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") - elseif(TARGET_ARCHITECTURE STREQUAL "barcelona") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "istanbul") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "magny-cours") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "sse4a") - - # Others - elseif(TARGET_ARCHITECTURE STREQUAL "generic") - list(APPEND _march_flag_list "generic") - elseif(TARGET_ARCHITECTURE STREQUAL "none") - # add this clause to remove it from the else clause - - else() - message(FATAL_ERROR "[OFA] Unknown target architecture: \"${TARGET_ARCHITECTURE}\". 
Please set TARGET_ARCHITECTURE to a supported value.") - endif() + + elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") + + # Step 1: Generate a list of compiler flags for the specific CPU + set(_march_flag_list) + set(_available_extension_list) + + # Define macros for Intel + macro(_nehalem) + list(APPEND _march_flag_list "nehalem") + list(APPEND _march_flag_list "corei7") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "popcnt") + endmacro() + macro(_westmere) + list(APPEND _march_flag_list "westmere") + _nehalem() + list(APPEND _available_extension_list "aes" "pclmul") + endmacro() + macro(_sandybridge) + list(APPEND _march_flag_list "sandybridge") + list(APPEND _march_flag_list "corei7-avx") + _westmere() + list(APPEND _available_extension_list "avx") + endmacro() + macro(_ivybridge) + list(APPEND _march_flag_list "ivybridge") + list(APPEND _march_flag_list "core-avx-i") + _sandybridge() + list(APPEND _available_extension_list "rdrnd" "f16c" "fsgsbase") + endmacro() + macro(_haswell) + list(APPEND _march_flag_list "haswell") + list(APPEND _march_flag_list "core-avx2") + _ivybridge() + list(APPEND _available_extension_list "avx2" "fma" "bmi" "bmi2") + endmacro() + macro(_broadwell) + list(APPEND _march_flag_list "broadwell") + _haswell() + list(APPEND _available_extension_list "rdseed" "adcx" "prfchw") + endmacro() + macro(_skylake) + list(APPEND _march_flag_list "skylake") + _broadwell() + list(APPEND _available_extension_list "clflushopt" "xsavec" "xsaves") + endmacro() + macro(_skylake_avx512) + list(APPEND _march_flag_list "skylake-avx512") + _skylake() + list(APPEND _available_extension_list "avx512bw" "avx512cd" "avx512dq" "avx512f" "avx512vl" "clwb" "pku") + endmacro() + macro(_cascadelake) + list(APPEND _march_flag_list "cascadelake") + _skylake_avx512() + list(APPEND _available_extension_list "avx512vnni") + endmacro() + macro(_cooperlake) + list(APPEND _march_flag_list 
"cooperlake") + _cascadelake() + list(APPEND _available_extension_list "avx512bf16") + endmacro() + macro(_cannonlake) + list(APPEND _march_flag_list "cannonlake") + _skylake() + list(APPEND _available_extension_list "avx512bw" "avx512cd" "avx512dq" "avx512f" "avx512vl" "clwb" "pku" "avx512ifma" "avx512vbmi" "sha" "umip") + endmacro() + macro(_icelake) + list(APPEND _march_flag_list "icelake-client") + _cannonlake() + list(APPEND _available_extension_list "avx512bitalg" "avx512vbmi2" "avx512vnni" "avx512vpopcntdq" "clwb" "gfni" "rdpid" "vaes" "vpclmulqdq") + endmacro() + macro(_icelake_avx512) + list(APPEND _march_flag_list "icelake-server") + _icelake() + list(APPEND _available_extension_list "pconfig" "wbnoinvd") + endmacro() + macro(_tigerlake) + list(APPEND _march_flag_list "tigerlake") + _icelake() + list(APPEND _available_extension_list "avx512vp2intersect" "keylocker" "movdir64b" "movdiri" "pconfig" "wbnoinvd") + endmacro() + macro(_alderlake) + list(APPEND _march_flag_list "alderlake") + _broadwell() + list(APPEND _available_extension_list "avxvnni" "cldemote" "clwb" "gfni" "hreset" "kl" "lzcnt" "movdir64b" "movdiri" "pconfig" "pku" "ptwrite" "rdpid" "serialize" "sgx" "umip" "vaes" "vpclmulqdq" "waitpkg" "widekl" "xsave" "xsavec" "xsaveopt" "xsaves") + endmacro() + macro(_sapphirerapids) + list(APPEND _march_flag_list "sapphirerapids") + _skylake_avx512() + list(APPEND _available_extension_list "amx-bf16" "amx-int8" "amx-tile" "avxvnni" "avx512bf16" "avx512vnni" "avx512vp2intersect" "cldemote" "enqcmd" "movdir64b" "movdiri" "ptwrite" "serialize" "tsxldtrk" "uintr" "waitpkg") + endmacro() + macro(_rocketlake) + list(APPEND _march_flag_list "rocketlake") + _skylake_avx512() + list(APPEND _available_extension_list "avx512bitalg" "avx512ifma" "avx512vbmi" "avx512vbmi2" "avx512vnni" "avx512vpopcntdq" "gfni" "rdpid" "sha" "umip" "vaes" "vpclmulqdq") + endmacro() + macro(_knightslanding) + list(APPEND _march_flag_list "knl") + _broadwell() + list(APPEND 
_available_extension_list "avx512f" "avx512pf" "avx512er" "avx512cd") + endmacro() + macro(_knightsmill) + list(APPEND _march_flag_list "knm") + _broadwell() + list(APPEND _available_extension_list "avx512f" "avx512pf" "avx512er" "avx512cd" "avx5124fmaps" "avx5124vnni" "avx512vpopcntdq") + endmacro() + macro(_silvermont) + list(APPEND _march_flag_list "silvermont") + _westmere() + list(APPEND _available_extension_list "rdrnd") + endmacro() + macro(_goldmont) + list(APPEND _march_flag_list "goldmont") + _silvermont() + list(APPEND _available_extension_list "rdseed") + endmacro() + macro(_goldmont_plus) + list(APPEND _march_flag_list "goldmont-plus") + _goldmont() + list(APPEND _available_extension_list "rdpid") + endmacro() + macro(_tremont) + list(APPEND _march_flag_list "tremont") + _goldmont_plus() + endmacro() - list(SORT _available_extension_list) - list(REMOVE_DUPLICATES _available_extension_list) - - if(OFA_VERBOSE) - string(REPLACE ";" ", " _str "${_march_flag_list}") - string(TOUPPER ${_str} _str) - message(STATUS "[OFA] CPU architectures: " ${_str}) - string(REPLACE ";" ", " _str "${_available_extension_list}") - string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (available): ${_str}") - endif() - - # Step 2: Apply compiler flags - if(NOT TARGET_ARCHITECTURE STREQUAL "none") - set(_check_extension_list) - set(_disable_extension_list) - set(_enable_extension_list) + # TODO: Define similar macros for AMD - # Step 2.1: Enable/disable features based on compiler capabilities - file(READ ${CMAKE_SOURCE_DIR}/cmake/ofa/IntrinsicsX86.txt _intrinsics) - string(REPLACE ";" "|" _intrinsics "${_intrinsics}") - string(REPLACE "\n" ";" _intrinsics "${_intrinsics}") - foreach (_intrinsic ${_intrinsics}) - if ("${_intrinsic}" MATCHES "^#" ) # Skip comment - continue() + # Intel + if(TARGET_ARCHITECTURE STREQUAL "core" OR TARGET_ARCHITECTURE STREQUAL "core2") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "mmx" "sse" "sse2" 
"sse3") + elseif(TARGET_ARCHITECTURE STREQUAL "merom") + list(APPEND _march_flag_list "merom") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3") + elseif(TARGET_ARCHITECTURE STREQUAL "penryn") + list(APPEND _march_flag_list "penryn") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3") + message(STATUS "[OFA] Sadly the Penryn architecture exists in variants with SSE4.1 and without SSE4.1.") + if(_cpu_flags MATCHES "sse4_1") + message(STATUS "[OFA] SSE4.1: enabled (auto-detected from this computer's CPU flags)") + list(APPEND _available_extension_list "sse4.1") + else() + message(STATUS "[OFA] SSE4.1: disabled (auto-detected from this computer's CPU flags)") endif() - string(REPLACE "|" ";" _intrinsic "${_intrinsic}") - list(GET _intrinsic 0 _intrinsic_flags) - list(GET _intrinsic 1 _intrinsic_header) - list(GET _intrinsic 2 _intrinsic_name) - list(GET _intrinsic 3 _intrinsic_params) + elseif(TARGET_ARCHITECTURE STREQUAL "knm") + _knightsmill() + elseif(TARGET_ARCHITECTURE STREQUAL "knl") + _knightslanding() + elseif(TARGET_ARCHITECTURE STREQUAL "rocketlake") + _rocketlake() + elseif(TARGET_ARCHITECTURE STREQUAL "sapphirerapids") + _sapphirerapids() + elseif(TARGET_ARCHITECTURE STREQUAL "alderlake") + _alderlake() + elseif(TARGET_ARCHITECTURE STREQUAL "tigerlake") + _tigerlake() + elseif(TARGET_ARCHITECTURE STREQUAL "icelake") + _icelake() + elseif(TARGET_ARCHITECTURE STREQUAL "icelake-xeon" OR TARGET_ARCHITECTURE STREQUAL "icelake-avx512") + _icelake_avx512() + elseif(TARGET_ARCHITECTURE STREQUAL "cannonlake") + _cannonlake() + elseif(TARGET_ARCHITECTURE STREQUAL "cooperlake") + _cooperlake() + elseif(TARGET_ARCHITECTURE STREQUAL "cascadelake") + _cascadelake() + elseif(TARGET_ARCHITECTURE STREQUAL "kabylake") + _skylake() + elseif(TARGET_ARCHITECTURE STREQUAL "skylake-xeon" OR TARGET_ARCHITECTURE STREQUAL "skylake-avx512") + _skylake_avx512() + 
elseif(TARGET_ARCHITECTURE STREQUAL "skylake") + _skylake() + elseif(TARGET_ARCHITECTURE STREQUAL "broadwell") + _broadwell() + elseif(TARGET_ARCHITECTURE STREQUAL "haswell") + _haswell() + elseif(TARGET_ARCHITECTURE STREQUAL "ivybridge") + _ivybridge() + elseif(TARGET_ARCHITECTURE STREQUAL "sandybridge") + _sandybridge() + elseif(TARGET_ARCHITECTURE STREQUAL "westmere") + _westmere() + elseif(TARGET_ARCHITECTURE STREQUAL "nehalem") + _nehalem() + elseif(TARGET_ARCHITECTURE STREQUAL "tremont") + _tremont() + elseif(TARGET_ARCHITECTURE STREQUAL "goldmont-plus") + _goldmont_plus() + elseif(TARGET_ARCHITECTURE STREQUAL "goldmont") + _goldmont() + elseif(TARGET_ARCHITECTURE STREQUAL "silvermont") + _silvermont() + elseif(TARGET_ARCHITECTURE STREQUAL "bonnell") + list(APPEND _march_flag_list "bonnell") + list(APPEND _march_flag_list "atom") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3") + elseif(TARGET_ARCHITECTURE STREQUAL "atom") + list(APPEND _march_flag_list "atom") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3") - string(REPLACE "," ";" _intrinsic_flags "${_intrinsic_flags}") - list(GET _intrinsic_flags 0 _flag) - string(REPLACE ";" " -m" _intrinsic_flags "-m${_intrinsic_flags}") - list(APPEND _check_extension_list "${_flag}") - - # Check if include file is available - set(_resultVar "HAVE_${_intrinsic_header}") - string(REPLACE "." "_" _resultVar "${_resultVar}") - check_include_file_cxx( - ${_intrinsic_header} - ${_resultVar} - ${_intrinsic_flags}) - if(NOT ${_resultVar}) - set(_useVar "USE_${_flag}") - string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "." "_" _useVar "${_useVar}") - message(STATUS "[OFA] Disabling ${_useVar} because ${_intrinsic_header} is missing") - set(${_useVar} false CACHE BOOL "Use ${_flag} extension." 
FORCE) - mark_as_advanced(${_useVar}) - continue() - endif() + # AMD + elseif(TARGET_ARCHITECTURE STREQUAL "k8") + list(APPEND _march_flag_list "k8") + list(APPEND _available_extension_list "sse" "sse2") + elseif(TARGET_ARCHITECTURE STREQUAL "k8-sse3") + list(APPEND _march_flag_list "k8-sse3") + list(APPEND _march_flag_list "k8") + list(APPEND _available_extension_list "sse" "sse2" "sse3") + elseif(TARGET_ARCHITECTURE STREQUAL "amd16h") + list(APPEND _march_flag_list "btver2") + list(APPEND _march_flag_list "btver1") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "f16c") + elseif(TARGET_ARCHITECTURE STREQUAL "amd14h") + list(APPEND _march_flag_list "btver1") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a") + elseif(TARGET_ARCHITECTURE STREQUAL "zen3") + list(APPEND _march_flag_list "znver2") + list(APPEND _march_flag_list "znver1") + _skylake() + list(APPEND _available_extension_list "sse4a") + elseif(TARGET_ARCHITECTURE STREQUAL "zen2") + list(APPEND _march_flag_list "znver2") + list(APPEND _march_flag_list "znver1") + _skylake() + list(APPEND _available_extension_list "sse4a") + elseif(TARGET_ARCHITECTURE STREQUAL "zen") + list(APPEND _march_flag_list "znver1") + _skylake() + list(APPEND _available_extension_list "sse4a") + elseif(TARGET_ARCHITECTURE STREQUAL "excavator") + list(APPEND _march_flag_list "bdver4") + list(APPEND _march_flag_list "bdver3") + list(APPEND _march_flag_list "bdver2") + list(APPEND _march_flag_list "bdver1") + list(APPEND _march_flag_list "bulldozer") + list(APPEND _march_flag_list "barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "avx2" "xop" "fma4" "fma" "f16c" "bmi" "bmi2" "rdrnd") + elseif(TARGET_ARCHITECTURE STREQUAL "steamroller") + list(APPEND _march_flag_list "bdver3") + list(APPEND _march_flag_list "bdver2") + list(APPEND _march_flag_list "bdver1") 
+ list(APPEND _march_flag_list "bulldozer") + list(APPEND _march_flag_list "barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c" "bmi") + elseif(TARGET_ARCHITECTURE STREQUAL "piledriver") + list(APPEND _march_flag_list "bdver2") + list(APPEND _march_flag_list "bdver1") + list(APPEND _march_flag_list "bulldozer") + list(APPEND _march_flag_list "barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c" "bmi") + elseif(TARGET_ARCHITECTURE STREQUAL "interlagos") + list(APPEND _march_flag_list "bdver1") + list(APPEND _march_flag_list "bulldozer") + list(APPEND _march_flag_list "barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") + elseif(TARGET_ARCHITECTURE STREQUAL "bulldozer") + list(APPEND _march_flag_list "bdver1") + list(APPEND _march_flag_list "bulldozer") + list(APPEND _march_flag_list "barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") + elseif(TARGET_ARCHITECTURE STREQUAL "barcelona") + list(APPEND _march_flag_list "barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "sse4a") + elseif(TARGET_ARCHITECTURE STREQUAL "istanbul") + list(APPEND _march_flag_list "barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "sse4a") + elseif(TARGET_ARCHITECTURE STREQUAL "magny-cours") + list(APPEND _march_flag_list "barcelona") + list(APPEND _march_flag_list "core2") + list(APPEND _available_extension_list "sse" "sse2" "sse3" "sse4a") - # Check if compiler supports flag and can compile code - 
set(_resultVar "HAVE_${_flag}_${_intrinsic_name}") - string(REPLACE "." "_" _resultVar "${_resultVar}") - set(CMAKE_REQUIRED_FLAGS "${_intrinsic_flags}") - check_cxx_source_compiles( - "#include<${_intrinsic_header}> - int main() { - ${_intrinsic_name}(${_intrinsic_params}); - return 0; - }" - ${_resultVar}) - set(_useVar "USE_${_flag}") - string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "." "_" _useVar "${_useVar}") - if (NOT ${_resultVar}) - message(STATUS "[OFA] Disabling ${_useVar} because -m${_flag} is not supported by compiler and/or ${_intrinsic_name} intrinsics fails to compile") - set(${_useVar} false CACHE BOOL "Use ${_flag} extension." FORCE) - mark_as_advanced(${_useVar}) - else() - set(${_useVar} true CACHE BOOL "Use ${_flag} extension.") - mark_as_advanced(${_useVar}) - endif() - endforeach() + # Others + elseif(TARGET_ARCHITECTURE STREQUAL "generic") + list(APPEND _march_flag_list "generic") + elseif(TARGET_ARCHITECTURE STREQUAL "none") + # add this clause to remove it from the else clause - # Generate lists of enabled/disabled flags - list(REMOVE_DUPLICATES _check_extension_list) - foreach(_flag ${_check_extension_list}) - _my_find(_available_extension_list "${_flag}" _found) - set(_useVar "USE_${_flag}") - string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "." "_" _useVar "${_useVar}") - if(_found AND ${_useVar}) - list(APPEND _enable_extension_list "${_flag}") - else() - list(APPEND _disable_extension_list "${_flag}") - endif() - endforeach() + else() + message(FATAL_ERROR "[OFA] Unknown target architecture: \"${TARGET_ARCHITECTURE}\". 
Please set TARGET_ARCHITECTURE to a supported value.") + endif() + + list(SORT _available_extension_list) + list(REMOVE_DUPLICATES _available_extension_list) if(OFA_VERBOSE) - string(REPLACE ";" ", " _str "${_enable_extension_list}") + string(REPLACE ";" ", " _str "${_march_flag_list}") string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (enabled): ${_str}") - string(REPLACE ";" ", " _str "${_disable_extension_list}") + message(STATUS "[OFA] CPU architectures: " ${_str}) + string(REPLACE ";" ", " _str "${_available_extension_list}") string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (disabled): ${_str}") + message(STATUS "[OFA] Extensions (available): ${_str}") endif() + + # Step 2: Apply compiler flags + if(NOT TARGET_ARCHITECTURE STREQUAL "none") + set(_check_extension_list) + set(_disable_extension_list) + set(_enable_extension_list) -if(FALSE) - set(_generic_broken false) - set(_amx_broken false) - set(_avxvnni_broken false) + # Step 2.1: Enable/disable features based on compiler capabilities + file(READ ${CMAKE_SOURCE_DIR}/cmake/ofa/IntrinsicsX86.txt _intrinsics) + string(REPLACE ";" "|" _intrinsics "${_intrinsics}") + string(REPLACE "\n" ";" _intrinsics "${_intrinsics}") + foreach (_intrinsic ${_intrinsics}) + if ("${_intrinsic}" MATCHES "^#" ) # Skip comment + continue() + endif() + string(REPLACE "|" ";" _intrinsic "${_intrinsic}") + list(GET _intrinsic 0 _intrinsic_flags) + list(GET _intrinsic 1 _intrinsic_header) + list(GET _intrinsic 2 _intrinsic_name) + list(GET _intrinsic 3 _intrinsic_params) - - if(DEFINED OFA_SSE_INTRINSICS_BROKEN AND OFA_SSE_INTRINSICS_BROKEN) - message(STATUS "SSE disabled because of old/broken toolchain") - set(_sse_broken true) - set(_avx_broken true) - set(_avx2_broken true) - set(_avx512_broken true) - set(_fma4_broken true) - set(_xop_broken true) - elseif(DEFINED OFA_AVX_INTRINSICS_BROKEN AND OFA_AVX_INTRINSICS_BROKEN) - message(STATUS "AVX disabled because of old/broken toolchain") - set(_sse_broken 
false) - set(_avx_broken true) - set(_avx2_broken true) - set(_avx512_broken true) - set(_fma4_broken true) - set(_xop_broken true) - else() - set(_sse_broken false) - set(_avx_broken false) - if(DEFINED OFA_FMA4_INTRINSICS_BROKEN AND OFA_FMA4_INTRINSICS_BROKEN) - message(STATUS "FMA4 disabled because of old/broken toolchain") - set(_fma4_broken true) - else() - set(_fma4_broken false) - endif() - if(DEFINED OFA_XOP_INTRINSICS_BROKEN AND OFA_XOP_INTRINSICS_BROKEN) - message(STATUS "XOP disabled because of old/broken toolchain") - set(_xop_broken true) - else() - set(_xop_broken false) - endif() - if(DEFINED OFA_AVX2_INTRINSICS_BROKEN AND OFA_AVX2_INTRINSICS_BROKEN) - message(STATUS "AVX2 disabled because of old/broken toolchain") - set(_avx2_broken true) - else() - set(_avx2_broken false) - endif() - if(DEFINED OFA_AVX512_INTRINSICS_BROKEN AND OFA_AVX512_INTRINSICS_BROKEN) - message(STATUS "AVX512 disabled because of old/broken toolchain") - set(_avx512_broken true) - else() - set(_avx512_broken false) - endif() - endif() + string(REPLACE "," ";" _intrinsic_flags "${_intrinsic_flags}") + list(GET _intrinsic_flags 0 _flag) + string(REPLACE ";" " -m" _intrinsic_flags "-m${_intrinsic_flags}") + list(APPEND _check_extension_list "${_flag}") + + # Check if include file is available + set(_resultVar "HAVE_${_intrinsic_header}") + string(REPLACE "." "_" _resultVar "${_resultVar}") + check_include_file_cxx( + ${_intrinsic_header} + ${_resultVar} + ${_intrinsic_flags}) + if(NOT ${_resultVar}) + set(_useVar "USE_${_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "." "_" _useVar "${_useVar}") + message(STATUS "[OFA] Disabling ${_useVar} because ${_intrinsic_header} is missing") + set(${_useVar} false CACHE BOOL "Use ${_flag} extension." 
FORCE) + mark_as_advanced(${_useVar}) + continue() + endif() - # Enable/disable macro - macro(_enable_or_disable _name _flag _documentation _broken) - if(_broken) - set(_found false) - else() + # Check if compiler supports flag and can compile code + set(_resultVar "HAVE_${_flag}_${_intrinsic_name}") + string(REPLACE "." "_" _resultVar "${_resultVar}") + set(CMAKE_REQUIRED_FLAGS "${_intrinsic_flags}") + check_cxx_source_compiles( + "#include<${_intrinsic_header}> + int main() { + ${_intrinsic_name}(${_intrinsic_params}); + return 0; + }" + ${_resultVar}) + unset(CMAKE_REQUIRED_FLAGS) + set(_useVar "USE_${_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "." "_" _useVar "${_useVar}") + if (NOT ${_resultVar}) + message(STATUS "[OFA] Disabling ${_useVar} because -m${_flag} is not supported by compiler and/or ${_intrinsic_name} intrinsics fails to compile") + set(${_useVar} false CACHE BOOL "Use ${_flag} extension." FORCE) + mark_as_advanced(${_useVar}) + else() + set(${_useVar} true CACHE BOOL "Use ${_flag} extension.") + mark_as_advanced(${_useVar}) + endif() + endforeach() + + # Generate lists of enabled/disabled flags + list(REMOVE_DUPLICATES _check_extension_list) + foreach(_flag ${_check_extension_list}) _my_find(_available_extension_list "${_flag}" _found) - endif() - set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) - mark_as_advanced(USE_${_name}) - if(USE_${_name}) - list(APPEND _enable_extension_list "${_flag}") - else() - list(APPEND _disable_extension_list "${_flag}") - endif() - endmacro() - - # Enable/disable features - _enable_or_disable(AES "aes" "Use AES." _generic_broken) - _enable_or_disable(AMX-BF16 "amx-bf16" "Use AMX-BF16." _amx_broken) - _enable_or_disable(AMX-INT8 "amx-int8" "Use AMX-INT8." _amx_broken) - _enable_or_disable(AMX-TILE "amx-tile" "Use AMX-TILE." _amx_broken) - _enable_or_disable(AVX "avx" "Use AVX. This will all floating-point vector sizes relative to SSE." 
_avx_broken) - _enable_or_disable(AVX-VNNI "avx-vnni" "Use AVX-VNNI." _avxvnni_broken) - _enable_or_disable(AVX2 "avx2" "Use AVX2. This will double all of the vector sizes relative to SSE." _avx2_broken) - _enable_or_disable(AVX5124FMAPS "avx5124fmaps" "Use AVX5124FMAPS." _avx512_broken) - _enable_or_disable(AVX5124VNNIW "avx5124vnniw" "Use AVX5124VNNIW." _avx512_broken) - _enable_or_disable(AVX512BF16 "avx512bf16" "Use AVX512BF16." _avx512_broken) - _enable_or_disable(AVX512BITALG "avx512bitalg" "Use AVX512BITALG." _avx512_broken) - _enable_or_disable(AVX512BW "avx512bw" "Use AVX512BW." _avx512_broken) - _enable_or_disable(AVX512CD "avx512cd" "Use AVX512CD." _avx512_broken) - _enable_or_disable(AVX512DQ "avx512dq" "Use AVX512DQ." _avx512_broken) - _enable_or_disable(AVX512ER "avx512er" "Use AVX512ER. This enables exponential and reciprocal instructions." _avx512_broken) - _enable_or_disable(AVX512F "avx512f" "Use AVX512F. This will double all floating-point vector sizes relative to AVX2." _avx512_broken) - _enable_or_disable(AVX512IFMA "avx512ifma" "Use AVX512IFMA." _avx512_broken) - _enable_or_disable(AVX512PF "avx512pf" "Use AVX512PF. This enables prefetch instructions for gathers and scatters." _avx512_broken) - _enable_or_disable(AVX512VBMI "avx512vbmi" "Use AVX512VBMI." _avx512_broken) - _enable_or_disable(AVX512VBMI2 "avx512vbmi2" "Use AVX512VBMI2." _avx512_broken) - _enable_or_disable(AVX512VL "avx512vl" "Use AVX512VL. This enables 128- and 256-bit vector length instructions with EVEX coding (improved write-masking & more vector registers)." _avx2_broken) - _enable_or_disable(AVX512VNNI "avx512vnni" "Use AVX512VNNI." _avx512_broken) - _enable_or_disable(AVX512VP2INTERSECT "avx512vp2intersect" "Use AVX512VP2INTERSECT." _avx512_broken) - _enable_or_disable(AVX512VPOPCNTDQ "avx512vpopcntdq " "Use AVX512VPOPCNTDQ ." _avx512_broken) - _enable_or_disable(AVX512VPOPCNTDQ "avx512vpopcntdq" "Use AVX512VPOPCNTDQ." 
_avx512_broken) - _enable_or_disable(BMI "bmi2" "Use BMI." _avx_broken) - _enable_or_disable(BMI2 "bmi2" "Use BMI2." _avx_broken) - _enable_or_disable(CLDEMOTE "cldemote" "Use CLDEMOTE." _generic_broken) - _enable_or_disable(CLFLUSHOPT "clflushopt" "Use CLFLUSHOPT." _generic_broken) - _enable_or_disable(CLWB "clwb" "Use CLWB." _generic_broken) - _enable_or_disable(ENQCMD "enqcmd" "Use ENQCMD." _generic_broken) - _enable_or_disable(F16C "f16c" "Use F16C." _xop_broken) - _enable_or_disable(FMA "fma" "Use FMA." _avx_broken) - _enable_or_disable(FMA4 "fma4" "Use FMA4." _fma4_broken) - _enable_or_disable(FSGSBASE "fsgsbase" "Use FSGSBASE." _generic_broken) - _enable_or_disable(GFNI "gfni" "Use GFNI." _avx512_broken) - _enable_or_disable(HRESET "hreset" "Use ." _avx512_broken) - _enable_or_disable(LZCNT "lzcnt" "Use LZCNT." _sse_broken) - _enable_or_disable(MOVDIR64B "movdir64b" "Use MOVDIR64B." _generic_broken) - _enable_or_disable(MOVDIRI "movdiri" "Use MOVDIRI." _generic_broken) - _enable_or_disable(PCLMUL "pclmul" "Use PCLMUL." _generic_broken) - _enable_or_disable(PCONFIG "pconfig" "Use PCONFIG." _generic_broken) - _enable_or_disable(PKU "pku" "Use PKU." _generic_broken) - _enable_or_disable(POPCNT "popcnt" "Use POPCNT." _sse_broken) - _enable_or_disable(PREFETCHWT1 "prefetchwt1" "Use PREFETCHWT1." _generic_broken) - _enable_or_disable(PRFCHW "prfchw" "Use PRFCHW." _generic_broken) - _enable_or_disable(PTWRITE "ptwrite" "Use PTWRITE." _generic_broken) - _enable_or_disable(RDPID "rdpid " "Use RDPID ." _generic_broken) - _enable_or_disable(RDRND "rdrnd" "Use RDRND." _generic_broken) - _enable_or_disable(RDSEED "rdseed" "Use RDSEED." _generic_broken) - _enable_or_disable(SERIALIZE "serialize" "Use SERIALIZE." _generic_broken) - _enable_or_disable(SGX "sgx" "Use SGX." _generic_broken) - _enable_or_disable(SHA "sha" "Use SHA." _generic_broken) - _enable_or_disable(SSE2 "sse2" "Use SSE2. If SSE2 instructions are not enabled the SSE implementation will be disabled." 
_sse_broken) - _enable_or_disable(SSE3 "sse3" "Use SSE3. If SSE3 instructions are not enabled they will be emulated." _sse_broken) - _enable_or_disable(SSE4_1 "sse4.1" "Use SSE4.1. If SSE4.1 instructions are not enabled they will be emulated." _sse_broken) - _enable_or_disable(SSE4_2 "sse4.2" "Use SSE4.2. If SSE4.2 instructions are not enabled they will be emulated." _sse_broken) - _enable_or_disable(SSE4a "sse4a" "Use SSE4a. If SSE4a instructions are not enabled they will be emulated." _sse_broken) - _enable_or_disable(SSSE3 "ssse3" "Use SSSE3. If SSSE3 instructions are not enabled they will be emulated." _sse_broken) - _enable_or_disable(TSXLDTRK "tsxldtrk" "Use TSXLDTRK." _generic_broken) - _enable_or_disable(VAES "vaes" "Use VAES." _avx512_broken) - _enable_or_disable(VPCLMULQDQ "vpclmulqdq" "Use VPCLMULQDQ." _avx512_broken) - _enable_or_disable(WAITPKG "waitpkg" "Use WAITPKG." _generic_broken) - _enable_or_disable(WBNOINVD "wbnoinvd" "Use WBNOINVD." _generic_broken) - _enable_or_disable(XOP "xop" "Use XOP." _xop_broken) - _enable_or_disable(XSAVE "xsave" "Use XSAVE." _generic_broken) - _enable_or_disable(XSAVEC "xsavec" "Use XSAVEC." _generic_broken) - _enable_or_disable(XSAVEOPT "xsaveopt" "Use XSAVEOPT." _generic_broken) - _enable_or_disable(XSAVES "xsaves" "Use XSAVES." _generic_broken) -endif(FALSE) - - # Add compiler flags - if(MSVC AND MSVC_VERSION GREATER 1700) - _my_find(_enable_extension_list "avx512f" _found) - if(_found) - AddCompilerFlag("/arch:AVX512" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) - endif() - if(NOT _found) - _my_find(_enable_extension_list "avx2" _found) - if(_found) - AddCompilerFlag("/arch:AVX2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + set(_useVar "USE_${_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "." 
"_" _useVar "${_useVar}") + if(_found AND ${_useVar}) + list(APPEND _enable_extension_list "${_flag}") + else() + list(APPEND _disable_extension_list "${_flag}") endif() + endforeach() + + if(OFA_VERBOSE) + string(REPLACE ";" ", " _str "${_enable_extension_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (enabled): ${_str}") + string(REPLACE ";" ", " _str "${_disable_extension_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (disabled): ${_str}") endif() - if(NOT _found) - _my_find(_enable_extension_list "avx" _found) + + # Add compiler flags + if(MSVC AND MSVC_VERSION GREATER 1700) + _my_find(_enable_extension_list "avx512f" _found) if(_found) - AddCompilerFlag("/arch:AVX" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + AddCompilerFlag("/arch:AVX512" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) endif() - endif() - if(NOT _found) - _my_find(_enable_extension_list "sse2" _found) - if(_found) - AddCompilerFlag("/arch:SSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + if(NOT _found) + _my_find(_enable_extension_list "avx2" _found) + if(_found) + AddCompilerFlag("/arch:AVX2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() endif() - endif() - if(NOT _found) - _my_find(_enable_extension_list "sse" _found) - if(_found) - AddCompilerFlag("/arch:SSE" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + if(NOT _found) + _my_find(_enable_extension_list "avx" _found) + if(_found) + AddCompilerFlag("/arch:AVX" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() endif() - endif() - foreach(_flag ${_enable_extension_list}) - string(TOUPPER "${_flag}" _flag) - string(REPLACE "." 
"_" _flag "__${_flag}__") - add_definitions("-D${_flag}") - endforeach(_flag) + if(NOT _found) + _my_find(_enable_extension_list "sse2" _found) + if(_found) + AddCompilerFlag("/arch:SSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + endif() + if(NOT _found) + _my_find(_enable_extension_list "sse" _found) + if(_found) + AddCompilerFlag("/arch:SSE" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + endif() + foreach(_flag ${_enable_extension_list}) + string(TOUPPER "${_flag}" _flag) + string(REPLACE "." "_" _flag "__${_flag}__") + add_definitions("-D${_flag}") + endforeach(_flag) - elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel" - OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") - if(WIN32) - # Intel (on Windows) - set(OFA_map_knl "-QxKNL;-QxMIC-AVX512") - set(OFA_map_knm "-QxKNM;-QxMIC-AVX512") - set(OFA_map_rocketlake "-QxROCKETLAKE;-QxCORE-AVX512") - set(OFA_map_sapphirerapids "-QxSAPPHIRERAPIDS;-QxCORE-AVX512") - set(OFA_map_alderlake "-QxALDERLAKE;-QxCORE-AVX512") - set(OFA_map_tigerlake "-QxTIGERLAKE;-QxCORE-AVX512") - set(OFA_map_icelake-server "-QxICELAKE-SERVER;-QxCORE-AVX512") - set(OFA_map_icelake-avx512 "-QxICELAKE-SERVER;-QxCORE-AVX512") - set(OFA_map_icelake-client "-QxICELAKE-CLIENT;-QxCORE-AVX512") - set(OFA_map_icelake "-QxICELAKE-CLIENT;-QxCORE-AVX512") - set(OFA_map_cannonlake "-QxCANNONLAKE;-QxCORE-AVX512") - set(OFA_map_cooperlake "-QxCOOPERLAKE;-QxCORE-AVX512") - set(OFA_map_cascadelake "-QxCASCADELAKE;-QxCORE-AVX512") - set(OFA_map_skylake-avx512 "-QxSKYLAKE-AVX512;-QxCORE-AVX512") - set(OFA_map_skylake "-QxSKYLAKE;-QxCORE-AVX2") - set(OFA_map_broadwell "-QxBROADWELL;-QxCORE-AVX2") - set(OFA_map_haswell "-QxHASWELL;-QxCORE-AVX2") - set(OFA_map_ivybridge "-QxIVYBRIDGE;-QxCORE-AVX-I") - set(OFA_map_sandybridge "-QxSANDYBRIDGE;-QxAVX") - set(OFA_map_westmere "-QxSSE4.2") - set(OFA_map_nehalem "-QxSSE4.2") - set(OFA_map_penryn "-QxSSSE3") - set(OFA_map_merom "-QxSSSE3") - set(OFA_map_core2 "-QxSSE3") - set(_ok FALSE) - else() - # Intel (in Linux) - 
set(OFA_map_knl "-xKNL;-xMIC-AVX512") - set(OFA_map_knm "-xKNM;-xMIC-AVX512") - set(OFA_map_rocketlake "-xROCKETLAKE;-xCORE-AVX512") - set(OFA_map_sapphirerapids "-xSAPPHIRERAPIDS;-xCORE-AVX512") - set(OFA_map_alderlake "-xALDERLAKE;-xCORE-AVX512") - set(OFA_map_tigerlake "-xTIGERLAKE;-xCORE-AVX512") - set(OFA_map_icelake-server "-xICELAKE-SERVER;-xCORE-AVX512") - set(OFA_map_icelake-avx512 "-xICELAKE-SERVER;-xCORE-AVX512") - set(OFA_map_icelake-client "-xICELAKE-CLIENT;-xCORE-AVX512") - set(OFA_map_icelake "-xICELAKE-CLIENT;-xCORE-AVX512") - set(OFA_map_cannonlake "-xCANNONLAKE;-xCORE-AVX512") - set(OFA_map_cooperlake "-xCOOPERLAKE;-xCORE-AVX512") - set(OFA_map_cascadelake "-xCASCADELAKE;-xCORE-AVX512") - set(OFA_map_skylake-avx512 "-xSKYLAKE-AVX512;-xCORE-AVX512") - set(OFA_map_skylake "-xSKYLAKE;-xCORE-AVX2") - set(OFA_map_broadwell "-xBROADWELL;-xCORE-AVX2") - set(OFA_map_haswell "-xHASWELL;-xCORE-AVX2") - set(OFA_map_ivybridge "-xIVYBRIDGE;-xCORE-AVX-I") - set(OFA_map_sandybridge "-xSANDYBRIDGE;-xAVX") - set(OFA_map_westmere "-xSSE4.2") - set(OFA_map_nehalem "-xSSE4.2") - set(OFA_map_penryn "-xSSSE3") - set(OFA_map_merom "-xSSSE3") - set(OFA_map_core2 "-xSSE3") - set(_ok FALSE) - endif() + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel" + OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") + if(WIN32) + # Intel (on Windows) + set(OFA_map_knl "-QxKNL;-QxMIC-AVX512") + set(OFA_map_knm "-QxKNM;-QxMIC-AVX512") + set(OFA_map_rocketlake "-QxROCKETLAKE;-QxCORE-AVX512") + set(OFA_map_sapphirerapids "-QxSAPPHIRERAPIDS;-QxCORE-AVX512") + set(OFA_map_alderlake "-QxALDERLAKE;-QxCORE-AVX512") + set(OFA_map_tigerlake "-QxTIGERLAKE;-QxCORE-AVX512") + set(OFA_map_icelake-server "-QxICELAKE-SERVER;-QxCORE-AVX512") + set(OFA_map_icelake-avx512 "-QxICELAKE-SERVER;-QxCORE-AVX512") + set(OFA_map_icelake-client "-QxICELAKE-CLIENT;-QxCORE-AVX512") + set(OFA_map_icelake "-QxICELAKE-CLIENT;-QxCORE-AVX512") + set(OFA_map_cannonlake "-QxCANNONLAKE;-QxCORE-AVX512") + set(OFA_map_cooperlake 
"-QxCOOPERLAKE;-QxCORE-AVX512") + set(OFA_map_cascadelake "-QxCASCADELAKE;-QxCORE-AVX512") + set(OFA_map_skylake-avx512 "-QxSKYLAKE-AVX512;-QxCORE-AVX512") + set(OFA_map_skylake "-QxSKYLAKE;-QxCORE-AVX2") + set(OFA_map_broadwell "-QxBROADWELL;-QxCORE-AVX2") + set(OFA_map_haswell "-QxHASWELL;-QxCORE-AVX2") + set(OFA_map_ivybridge "-QxIVYBRIDGE;-QxCORE-AVX-I") + set(OFA_map_sandybridge "-QxSANDYBRIDGE;-QxAVX") + set(OFA_map_westmere "-QxSSE4.2") + set(OFA_map_nehalem "-QxSSE4.2") + set(OFA_map_penryn "-QxSSSE3") + set(OFA_map_merom "-QxSSSE3") + set(OFA_map_core2 "-QxSSE3") + set(_ok FALSE) + else() + # Intel (in Linux) + set(OFA_map_knl "-xKNL;-xMIC-AVX512") + set(OFA_map_knm "-xKNM;-xMIC-AVX512") + set(OFA_map_rocketlake "-xROCKETLAKE;-xCORE-AVX512") + set(OFA_map_sapphirerapids "-xSAPPHIRERAPIDS;-xCORE-AVX512") + set(OFA_map_alderlake "-xALDERLAKE;-xCORE-AVX512") + set(OFA_map_tigerlake "-xTIGERLAKE;-xCORE-AVX512") + set(OFA_map_icelake-server "-xICELAKE-SERVER;-xCORE-AVX512") + set(OFA_map_icelake-avx512 "-xICELAKE-SERVER;-xCORE-AVX512") + set(OFA_map_icelake-client "-xICELAKE-CLIENT;-xCORE-AVX512") + set(OFA_map_icelake "-xICELAKE-CLIENT;-xCORE-AVX512") + set(OFA_map_cannonlake "-xCANNONLAKE;-xCORE-AVX512") + set(OFA_map_cooperlake "-xCOOPERLAKE;-xCORE-AVX512") + set(OFA_map_cascadelake "-xCASCADELAKE;-xCORE-AVX512") + set(OFA_map_skylake-avx512 "-xSKYLAKE-AVX512;-xCORE-AVX512") + set(OFA_map_skylake "-xSKYLAKE;-xCORE-AVX2") + set(OFA_map_broadwell "-xBROADWELL;-xCORE-AVX2") + set(OFA_map_haswell "-xHASWELL;-xCORE-AVX2") + set(OFA_map_ivybridge "-xIVYBRIDGE;-xCORE-AVX-I") + set(OFA_map_sandybridge "-xSANDYBRIDGE;-xAVX") + set(OFA_map_westmere "-xSSE4.2") + set(OFA_map_nehalem "-xSSE4.2") + set(OFA_map_penryn "-xSSSE3") + set(OFA_map_merom "-xSSSE3") + set(OFA_map_core2 "-xSSE3") + set(_ok FALSE) + endif() - message(${_march_flag_list}) - foreach(_arch ${_march_flag_list}) - message("arch ${_arch}") - if(DEFINED OFA_map_${_arch}) - message("${OFA_map_${_arch}}") 
- foreach(_flag ${OFA_map_${_arch}}) - message("flag ${_flag}") - AddCompilerFlag(${_flag} CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _ok) + foreach(_arch ${_march_flag_list}) + if(DEFINED OFA_map_${_arch}) + foreach(_flag ${OFA_map_${_arch}}) + AddCompilerFlag(${_flag} CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _ok) + if(_ok) + break() + endif() + endforeach() if(_ok) - message("OKAY") break() endif() - endforeach() - if(_ok) - break() + endif() + endforeach() + if(NOT _ok) + # This is the Intel compiler, so SSE2 is a very reasonable baseline. + message(STATUS "[OFA] Did not recognize the requested architecture flag ${_arch}, falling back to SSE2") + if(WIN32) + AddCompilerFlag("-QxSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + else() + AddCompilerFlag("-xSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) endif() endif() - endforeach() - if(NOT _ok) - # This is the Intel compiler, so SSE2 is a very reasonable baseline. - message(STATUS "[OFA] Did not recognize the requested architecture flag ${arch}, falling back to SSE2") - if(WIN32) - AddCompilerFlag("-QxSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - else() - AddCompilerFlag("-xSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endif() - endif() - # TODO PGI/Cray/SunPro ... + # TODO PGI/Cray/SunPro ... 
- else() - # Others: GNU, Clang and variants + else() + # Others: GNU, Clang and variants - # Set -march flag - foreach(_flag ${_march_flag_list}) - AddCompilerFlag("-march=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - if(_good) - break() - endif(_good) - endforeach(_flag) + # Set -march flag + foreach(_flag ${_march_flag_list}) + AddCompilerFlag("-march=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + if(_good) + break() + endif(_good) + endforeach(_flag) - # Set -m flag for enabled features - foreach(_flag ${_enable_extension_list}) - AddCompilerFlag("-m${_flag}" CXX_RESULT _result) - if(_result) - set(_header FALSE) - if(_flag STREQUAL "sse3") - set(_header "pmmintrin.h") - elseif(_flag STREQUAL "ssse3") - set(_header "tmmintrin.h") - elseif(_flag STREQUAL "sse4.1") - set(_header "smmintrin.h") - elseif(_flag STREQUAL "sse4.2") - set(_header "nmmintrin.h") - elseif(_flag STREQUAL "sse4a") - set(_header "ammintrin.h") - elseif(_flag STREQUAL "avx") - set(_header "immintrin.h") - elseif(_flag STREQUAL "avx2") - set(_header "immintrin.h") - elseif(_flag STREQUAL "avx512*") - set(_header "immintrin.h") - elseif(_flag STREQUAL "fma4") - set(_header "x86intrin.h") - elseif(_flag STREQUAL "xop") - set(_header "x86intrin.h") - elseif(_flag STREQUAL "bmi") - set(_header "ammintrin.h") - elseif(_flag STREQUAL "bmi2") - set(_header "ammintrin.h") - elseif(_flag STREQUAL "rdrnd") - set(_header "immintrin.h") - elseif(_flag STREQUAL "rdpid") - set(_header "immintrin.h") - elseif(_flag STREQUAL "rdseed") - set(_header "immintrin.h") - endif() - set(_resultVar "HAVE_${_header}") - string(REPLACE "." "_" _resultVar "${_resultVar}") - if(_header) - CHECK_INCLUDE_FILE_CXX("${_header}" ${_resultVar} "-m${_flag}") - if(NOT ${_resultVar}) - set(_useVar "USE_${_flag}") - string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "." 
"_" _useVar "${_useVar}") - message(STATUS "disabling ${_useVar} because ${_header} is missing") - set(${_useVar} FALSE) - list(APPEND _disable_extension_list "${_flag}") + # Set -m flag for enabled features + foreach(_flag ${_enable_extension_list}) + AddCompilerFlag("-m${_flag}" CXX_RESULT _result) + if(_result) + set(_header FALSE) + if(_flag STREQUAL "sse3") + set(_header "pmmintrin.h") + elseif(_flag STREQUAL "ssse3") + set(_header "tmmintrin.h") + elseif(_flag STREQUAL "sse4.1") + set(_header "smmintrin.h") + elseif(_flag STREQUAL "sse4.2") + set(_header "nmmintrin.h") + elseif(_flag STREQUAL "sse4a") + set(_header "ammintrin.h") + elseif(_flag STREQUAL "avx") + set(_header "immintrin.h") + elseif(_flag STREQUAL "avx2") + set(_header "immintrin.h") + elseif(_flag STREQUAL "avx512*") + set(_header "immintrin.h") + elseif(_flag STREQUAL "fma4") + set(_header "x86intrin.h") + elseif(_flag STREQUAL "xop") + set(_header "x86intrin.h") + elseif(_flag STREQUAL "bmi") + set(_header "ammintrin.h") + elseif(_flag STREQUAL "bmi2") + set(_header "ammintrin.h") + elseif(_flag STREQUAL "rdrnd") + set(_header "immintrin.h") + elseif(_flag STREQUAL "rdpid") + set(_header "immintrin.h") + elseif(_flag STREQUAL "rdseed") + set(_header "immintrin.h") + endif() + set(_resultVar "HAVE_${_header}") + string(REPLACE "." "_" _resultVar "${_resultVar}") + if(_header) + CHECK_INCLUDE_FILE_CXX("${_header}" ${_resultVar} "-m${_flag}") + if(NOT ${_resultVar}) + set(_useVar "USE_${_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "." 
"_" _useVar "${_useVar}") + message(STATUS "disabling ${_useVar} because ${_header} is missing") + set(${_useVar} FALSE) + list(APPEND _disable_extension_list "${_flag}") + endif() + endif() + if(NOT _header OR ${_resultVar}) + list(APPEND OFA_ARCHITECTURE_FLAGS "-m${_flag}") endif() endif() - if(NOT _header OR ${_resultVar}) - list(APPEND OFA_ARCHITECTURE_FLAGS "-m${_flag}") - endif() - endif() - endforeach(_flag) + endforeach(_flag) - # Set -mno-feature flag for disabled features - foreach(_flag ${_disable_extension_list}) - AddCompilerFlag("-mno-${_flag}" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - endforeach(_flag) + # Set -mno-feature flag for disabled features + foreach(_flag ${_disable_extension_list}) + AddCompilerFlag("-mno-${_flag}" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endforeach(_flag) + endif() endif() - endif() + endif() + # Compile code with profiling instrumentation if(TARGET_PROFILER STREQUAL "gprof") AddCompilerFlag("-pg" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) @@ -795,6 +654,8 @@ endif(FALSE) endif() endif() + list(REMOVE_DUPLICATES OFA_ARCHITECTURE_FLAGS) + if(OFA_VERBOSE) string(REPLACE ";" ", " _str "${OFA_ARCHITECTURE_FLAGS}") message(STATUS "OFA_ARCHITECTURE_FLAGS: " ${_str}) From ee1e2871b50fc50577ab9a2fb1f02fea0381dcb5 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Wed, 19 Jan 2022 14:53:19 +0100 Subject: [PATCH 141/174] Update OFA - x86 completed --- cmake/OptimizeForArchitecture.cmake | 13 +- cmake/ofa/AddCXXCompilerFlag.cmake | 199 +++++++++++++++++++++++ cmake/ofa/CheckCXXCompilerFlag.cmake | 2 +- cmake/ofa/CommonMacros.cmake | 8 + cmake/ofa/HandleArmOptions.cmake | 7 +- cmake/ofa/HandlePpcOptions.cmake | 3 +- cmake/ofa/HandleX86Options.cmake | 227 +++++++++++---------------- 7 files changed, 306 insertions(+), 153 deletions(-) create mode 100644 cmake/ofa/AddCXXCompilerFlag.cmake create mode 100644 cmake/ofa/CommonMacros.cmake diff --git a/cmake/OptimizeForArchitecture.cmake b/cmake/OptimizeForArchitecture.cmake index 0196db4fad..6c070fae14 
100644 --- a/cmake/OptimizeForArchitecture.cmake +++ b/cmake/OptimizeForArchitecture.cmake @@ -59,17 +59,9 @@ # - Update of CPUIDs for latest Intel and AMD processors # - Added support for PPC64 (Clang, GCC, IBM XLC) # - Added Support for ARM (Clang, GCC, ARM Clang, Cray, Fujitsu) +# - Restructuring and splitting into multiple files #============================================================================= -macro(_my_find _list _value _ret) - list(FIND ${_list} "${_value}" _found) - if(_found EQUAL -1) - set(${_ret} FALSE) - else() - set(${_ret} TRUE) - endif() -endmacro(_my_find) - #============================================================================= # Autodetection of CPU #============================================================================= @@ -102,7 +94,6 @@ include(ofa/HandleArmOptions) include(ofa/HandlePpcOptions) macro(OptimizeForArchitecture) - message(STATUS "Optimizing for target architecture") if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "i686.*|i386.*|x86.*|amd64.*|x86_64.*|AMD64.*") set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used. Setting the value to \"auto\" will try to optimize for the architecture where cmake is called. Setting the value to \"native\" bypasses all checks and uses \"-march=native\" or the compiler equivalent flag. 
Other supported values are: \"none\", \"generic\", \"core\", \"core2\", \"merom\" (65nm Core2), \"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandybridge\", \"ivybridge\", \"haswell\", \"broadwell\", \"skylake\", \"skylake-xeon\", \"kabylake\", \"cannonlake\", \"cascadelake\", \"cooperlake\", \"icelake\", \"icelake-xeon\", \"tigerlake\", \"alderlake\", \"sapphirerapids\", \"bonnell\", \"silvermont\", \"goldmont\", \"goldmont-plus\", \"tremont\", \"knl\" (Knights Landing), \"knm\" (Knights Mill), \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\", \"piledriver\", \"steamroller\", \"excavator\", \"amd14h\", \"amd16h\", \"zen\", \"zen2\", \"zen3\"." ) elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(arm.*|ARM.*|aarch64.*|AARCH64.*)") @@ -138,6 +129,4 @@ macro(OptimizeForArchitecture) elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(powerpc|ppc)64.*") OFA_HandlePpcOptions() endif() - - message(STATUS "Optimizing for target architecture - done") endmacro(OptimizeForArchitecture) diff --git a/cmake/ofa/AddCXXCompilerFlag.cmake b/cmake/ofa/AddCXXCompilerFlag.cmake new file mode 100644 index 0000000000..74417f3e4c --- /dev/null +++ b/cmake/ofa/AddCXXCompilerFlag.cmake @@ -0,0 +1,199 @@ +# Add a given compiler flag to flag variables. 
+# +# Usage: +# AddCXXCompilerFlag( +# [CODE ] +# [EXTRA_FLAGS ] +# [FLAGS ] +# [HEADERS ] +# [RESULT ] +# [TESTS ]) +# +# Input argument: +# flag to be added after successful completion of all tests +# +# Optional input arguments: +# CODE variable holding the test code; this overrides the +# automatic generation of the test code +# EXTRA_FLAGS variable holding the list of extra compiler flags that +# are used without checks +# FLAGS variable holding the list of flags to which the flag is +# added after successful completion of all tests +# HEADERS variable holding the list of header files prepended to +# the C++ test code's main function +# TESTS variable holding the list of tests to be included in +# the C++ test code's main function body +# +# Output argument: +# RESULT variable holding the result of all tests + +#============================================================================= +# This code is largely inspired by +# +# AddCompilerFlag.cmake +# Copyright 2010-2015 Matthias Kretz +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the names of contributing organizations nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# and +# +# CheckCXXCompilerFlag.cmake +# Copyright 2006-2009 Kitware, Inc. 
+# Copyright 2006 Alexander Neundorf +# Copyright 2011-2013 Matthias Kretz +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * The names of Kitware, Inc., the Insight Consortium, or the names of +# any consortium members, or of any contributors, may not be used to +# endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ============================================================================= + +macro(AddCXXCompilerFlag _flag) + set(state 0) + unset(_code) + unset(_extra_flags) + unset(_flags) + unset(_headers) + unset(_result) + unset(_tests) + + foreach(_arg ${ARGN}) + if("x${_arg}" STREQUAL "xCODE") + set(state 1) + elseif("x${_arg}" STREQUAL "xEXTRA_FLAGS") + set(state 2) + elseif("x${_arg}" STREQUAL "xFLAGS") + set(state 3) + elseif("x${_arg}" STREQUAL "xHEADERS") + set(state 4) + elseif("x${_arg}" STREQUAL "xRESULT") + set(state 5) + elseif("x${_arg}" STREQUAL "xTESTS") + set(state 6) + + elseif(state EQUAL 1) + set(_code ${_arg}) + elseif(state EQUAL 2) + set(_extra_flags ${_arg}) + elseif(state EQUAL 3) + set(_flags ${_arg}) + elseif(state EQUAL 4) + set(_headers ${_arg}) + elseif(state EQUAL 5) + set(_result ${_arg}) + elseif(state EQUAL 6) + set(_tests ${_arg}) + else() + message(FATAL_ERROR "The argument ${_arg} is not supported by AddCXXCompilerFlag") + endif() + endforeach() + + set(_check_include_file_cxx true) + set(_check_cxx_source_compiles true) + + # Check availability of header file(s) + foreach(_header ${_headers}) + set(_resultVar "check_include_file_cxx_${_header}") + string(REGEX REPLACE "[-.+/:= ]" "_" _resultVar "${_resultVar}") + check_include_file_cxx(${_header} ${_resultVar} "${_flag} ${_extra_flags}") + + if(NOT ${_resultVar}) + set(_check_include_file_cxx false) + endif() + endforeach() + + # Check if compiler supports flag and can compile code + set(_cxx_code) + foreach(_header ${_headers}) + set(_cxx_code "${_cxx_code}\n#include<${_header}>") + endforeach() + + if(_code) + set(_cxx_code "${_cxx_code}\n${_code}") + elseif(_tests) + set(_cxx_code "${_cxx_code}\nint main() {") + foreach(_test ${_tests}) + set(_cxx_code "${_cxx_code}\n${_test}") + endforeach() + set(_cxx_code "${_cxx_code}\nreturn 0; }") + else() + set(_cxx_code "${_cxx_code}\nint main() { return 0; }") + endif() + string(MD5 _hash "${_cxx_code}") + + set(_CMAKE_REQUIRED_FLAGS 
"${CMAKE_REQUIRED_FLAGS}") + set(CMAKE_REQUIRED_FLAGS "${_flag} ${_extra_flags}") + set(_resultVar "check_cxx_source_compiles_${_flag}_${_hash}") + string(REGEX REPLACE "[-.+/:= ]" "_" _resultVar "${_resultVar}") + check_cxx_source_compiles("${_cxx_code}" ${_resultVar} + # Some compilers do not fail with a bad flag + FAIL_REGEX "error: bad value (.*) for .* switch" # GNU + FAIL_REGEX "argument unused during compilation" # clang + FAIL_REGEX "is valid for .* but not for C\\\\+\\\\+" # GNU + FAIL_REGEX "unrecognized .*option" # GNU + FAIL_REGEX "ignored for target" # GNU + FAIL_REGEX "ignoring unknown option" # MSVC + FAIL_REGEX "warning D9002" # MSVC + FAIL_REGEX "[Uu]nknown option" # HP + FAIL_REGEX "[Ww]arning: [Oo]ption" # SunPro + FAIL_REGEX "command option .* is not recognized" # XL + FAIL_REGEX "WARNING: unknown flag:" # Open64 + FAIL_REGEX "command line error" # ICC + FAIL_REGEX "command line warning" # ICC + FAIL_REGEX "#10236:" # ICC: File not found + FAIL_REGEX " #10159: " # ICC + FAIL_REGEX " #10353: " # ICC: option '-mfma' ignored, suggest using '-march=core-avx2' + ) + set(CMAKE_REQUIRED_FLAGS "${_CMAKE_REQUIRED_FLAGS}") + + if(NOT ${_resultVar}) + set(_check_cxx_source_compiles false) + endif() + + if (DEFINED _result) + if (${_check_include_file_cxx} AND ${_check_cxx_source_compiles}) + set(${_result} true) + else() + set(${_result} false) + endif() + endif() + + if(DEFINED _flags AND ${_check_include_file_cxx} AND ${_check_cxx_source_compiles}) + list(APPEND ${_flags} "${_flag}") + endif() +endmacro(AddCXXCompilerFlag) diff --git a/cmake/ofa/CheckCXXCompilerFlag.cmake b/cmake/ofa/CheckCXXCompilerFlag.cmake index 5628e984f0..1df1559700 100644 --- a/cmake/ofa/CheckCXXCompilerFlag.cmake +++ b/cmake/ofa/CheckCXXCompilerFlag.cmake @@ -48,7 +48,7 @@ MACRO (CHECK_CXX_COMPILER_FLAG _FLAG _RESULT) SET(TEST_SOURCE "${ARGV2}") else() SET(TEST_SOURCE "int main() { return 0; }") - endif() + endif() CHECK_CXX_SOURCE_COMPILES("${TEST_SOURCE}" ${_RESULT} # Some 
compilers do not fail with a bad flag FAIL_REGEX "error: bad value (.*) for .* switch" # GNU diff --git a/cmake/ofa/CommonMacros.cmake b/cmake/ofa/CommonMacros.cmake new file mode 100644 index 0000000000..72259c73e1 --- /dev/null +++ b/cmake/ofa/CommonMacros.cmake @@ -0,0 +1,8 @@ +macro(_ofa_find _list _value _ret) + list(FIND ${_list} "${_value}" _found) + if(_found EQUAL -1) + set(${_ret} FALSE) + else() + set(${_ret} TRUE) + endif() +endmacro(_ofa_find) diff --git a/cmake/ofa/HandleArmOptions.cmake b/cmake/ofa/HandleArmOptions.cmake index a29280c476..6daab85fc9 100644 --- a/cmake/ofa/HandleArmOptions.cmake +++ b/cmake/ofa/HandleArmOptions.cmake @@ -13,6 +13,7 @@ #============================================================================= include(ofa/AddCompilerFlag) +include(ofa/CommonMacros) include(CheckIncludeFileCXX) macro(OFA_HandleArmOptions) @@ -790,7 +791,7 @@ macro(OFA_HandleArmOptions) if(_broken) set(_found false) else() - _my_find(_available_vector_units_list "${_flag}" _found) + _ofa_find(_available_vector_units_list "${_flag}" _found) endif() set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) mark_as_advanced(USE_${_name}) @@ -842,12 +843,12 @@ macro(OFA_HandleArmOptions) # Add compiler flags if(MSVC AND MSVC_VERSION GREATER 1900) - _my_find(_enable_vector_unit_list "vfpv4" _found) + _ofa_find(_enable_vector_unit_list "vfpv4" _found) if(_found) AddCompilerFlag("/arch:VFPv4" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) endif() if(NOT _found) - _my_find(_enable_vector_unit_list "simd" _found) + _ofa_find(_enable_vector_unit_list "simd" _found) if(_found) AddCompilerFlag("/arch:ARMv7VE" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) endif() diff --git a/cmake/ofa/HandlePpcOptions.cmake b/cmake/ofa/HandlePpcOptions.cmake index eb5b2d9c2c..19b46f3b38 100644 --- a/cmake/ofa/HandlePpcOptions.cmake +++ b/cmake/ofa/HandlePpcOptions.cmake @@ -13,6 +13,7 @@ 
#============================================================================= include(ofa/AddCompilerFlag) +include(ofa/CommonMacros) include(CheckIncludeFileCXX) macro(OFA_HandlePpcOptions) @@ -112,7 +113,7 @@ macro(OFA_HandlePpcOptions) if(_broken) set(_found false) else() - _my_find(_available_vector_units_list "${_flag}" _found) + _ofa_find(_available_vector_units_list "${_flag}" _found) endif() set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) mark_as_advanced(USE_${_name}) diff --git a/cmake/ofa/HandleX86Options.cmake b/cmake/ofa/HandleX86Options.cmake index 02892987a7..69820e4fe3 100644 --- a/cmake/ofa/HandleX86Options.cmake +++ b/cmake/ofa/HandleX86Options.cmake @@ -1,18 +1,17 @@ #============================================================================= # Handling of X86 / X86_64 options # -# This is a two-step process: +# This is a three-step process: # -# 1. Generate a list of compiler flags for the specific CPU +# 1. Generate a list of available compiler flags for the specific CPU # -# 2. Apply compiler flags +# 2. Enable/disable features based on compiler/linker capabilities # -# 2.1 Enable/disable features based on compiler capabilities -# -# 4. Set compiler-specific flags +# 3. 
Set compiler-specific flags (-m/-mno-) #============================================================================= -include(ofa/AddCompilerFlag) +include(ofa/AddCXXCompilerFlag) +include(ofa/CommonMacros) include(CheckIncludeFileCXX) macro(OFA_HandleX86Options) @@ -26,26 +25,26 @@ macro(OFA_HandleX86Options) OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") if(WIN32) # Intel (on Windows) - AddCompilerFlag("/QxHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("/QxHOST" FLAGS OFA_ARCHITECTURE_FLAGS) else() # Intel (on Linux) - AddCompilerFlag("-xHOST" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-xHOST" FLAGS OFA_ARCHITECTURE_FLAGS) endif() elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC" OR CMAKE_CXX_COMPILER_ID MATCHES "PGI") # NVidia HPC / PGI (on Linux/Windows - AddCompilerFlag("-tp=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-tp=native" FLAGS OFA_ARCHITECTURE_FLAGS) elseif(CMAKE_CXX_COMPILER_ID MATCHES "SunPro") # Sun/Oracle Studio (on Linux/Sun OS) - AddCompilerFlag("-native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-native" FLAGS OFA_ARCHITECTURE_FLAGS) elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") # Cray (on Linux) message(FATAL_ERROR, "[OFA] Cray compiler does not support \"native\" flag.") else() # Others: GNU, Clang and variants - _my_find(OFA_ARCHITECTURE_FLAGS "-march=native" _found) + _ofa_find(OFA_ARCHITECTURE_FLAGS "-march=native" _found) if(NOT _found) - AddCompilerFlag("-march=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-march=native" FLAGS OFA_ARCHITECTURE_FLAGS) endif() endif() @@ -60,7 +59,7 @@ macro(OFA_HandleX86Options) list(APPEND _march_flag_list "nehalem") list(APPEND _march_flag_list "corei7") list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "popcnt") + list(APPEND _available_extension_list "mmx" "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "popcnt") endmacro() macro(_westmere) list(APPEND 
_march_flag_list "westmere") @@ -184,11 +183,11 @@ macro(OFA_HandleX86Options) elseif(TARGET_ARCHITECTURE STREQUAL "merom") list(APPEND _march_flag_list "merom") list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3") + list(APPEND _available_extension_list "mmx" "sse" "sse2" "sse3" "ssse3") elseif(TARGET_ARCHITECTURE STREQUAL "penryn") list(APPEND _march_flag_list "penryn") list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3") + list(APPEND _available_extension_list "mmx" "sse" "sse2" "sse3" "ssse3") message(STATUS "[OFA] Sadly the Penryn architecture exists in variants with SSE4.1 and without SSE4.1.") if(_cpu_flags MATCHES "sse4_1") message(STATUS "[OFA] SSE4.1: enabled (auto-detected from this computer's CPU flags)") @@ -335,6 +334,7 @@ macro(OFA_HandleX86Options) # Others elseif(TARGET_ARCHITECTURE STREQUAL "generic") list(APPEND _march_flag_list "generic") + list(APPEND _available_extension_list "sse") elseif(TARGET_ARCHITECTURE STREQUAL "none") # add this clause to remove it from the else clause @@ -346,12 +346,16 @@ macro(OFA_HandleX86Options) list(REMOVE_DUPLICATES _available_extension_list) if(OFA_VERBOSE) - string(REPLACE ";" ", " _str "${_march_flag_list}") - string(TOUPPER ${_str} _str) - message(STATUS "[OFA] CPU architectures: " ${_str}) - string(REPLACE ";" ", " _str "${_available_extension_list}") - string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (available): ${_str}") + if(_march_flag_list) + string(REPLACE ";" ", " _str "${_march_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] CPU architectures: " ${_str}) + endif() + if(_available_extension_list) + string(REPLACE ";" ", " _str "${_available_extension_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (available): ${_str}") + endif() endif() # Step 2: Apply compiler flags @@ -364,6 +368,7 @@ macro(OFA_HandleX86Options) file(READ 
${CMAKE_SOURCE_DIR}/cmake/ofa/IntrinsicsX86.txt _intrinsics) string(REPLACE ";" "|" _intrinsics "${_intrinsics}") string(REPLACE "\n" ";" _intrinsics "${_intrinsics}") + foreach (_intrinsic ${_intrinsics}) if ("${_intrinsic}" MATCHES "^#" ) # Skip comment continue() @@ -378,45 +383,29 @@ macro(OFA_HandleX86Options) list(GET _intrinsic_flags 0 _flag) string(REPLACE ";" " -m" _intrinsic_flags "-m${_intrinsic_flags}") list(APPEND _check_extension_list "${_flag}") - - # Check if include file is available - set(_resultVar "HAVE_${_intrinsic_header}") - string(REPLACE "." "_" _resultVar "${_resultVar}") - check_include_file_cxx( - ${_intrinsic_header} - ${_resultVar} - ${_intrinsic_flags}) - if(NOT ${_resultVar}) - set(_useVar "USE_${_flag}") - string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "." "_" _useVar "${_useVar}") - message(STATUS "[OFA] Disabling ${_useVar} because ${_intrinsic_header} is missing") - set(${_useVar} false CACHE BOOL "Use ${_flag} extension." FORCE) - mark_as_advanced(${_useVar}) - continue() + + if(CMAKE_CXX_COMPILER_ID MATCHES "GNU") + set(_code "#define __int32 long\n#define __int64 long long\nint main() { ${_intrinsic_name}(${_intrinsic_params})\; return 0\; }") + else() + set(_code "int main() { ${_intrinsic_name}(${_intrinsic_params})\; return 0\; }") endif() + + AddCXXCompilerFlag("-m${_flag}" + EXTRA_FLAGS ${_intrinsic_flags} + HEADERS ${_intrinsic_header} + CODE "${_code}" + RESULT _ok) - # Check if compiler supports flag and can compile code - set(_resultVar "HAVE_${_flag}_${_intrinsic_name}") - string(REPLACE "." "_" _resultVar "${_resultVar}") - set(CMAKE_REQUIRED_FLAGS "${_intrinsic_flags}") - check_cxx_source_compiles( - "#include<${_intrinsic_header}> - int main() { - ${_intrinsic_name}(${_intrinsic_params}); - return 0; - }" - ${_resultVar}) - unset(CMAKE_REQUIRED_FLAGS) set(_useVar "USE_${_flag}") string(TOUPPER "${_useVar}" _useVar) string(REPLACE "." 
"_" _useVar "${_useVar}") - if (NOT ${_resultVar}) - message(STATUS "[OFA] Disabling ${_useVar} because -m${_flag} is not supported by compiler and/or ${_intrinsic_name} intrinsics fails to compile") - set(${_useVar} false CACHE BOOL "Use ${_flag} extension." FORCE) + + if(NOT ${_ok}) + set(${_useVar} false CACHE BOOL "Use ${_flag} extension.") mark_as_advanced(${_useVar}) else() - set(${_useVar} true CACHE BOOL "Use ${_flag} extension.") + _ofa_find(_available_extension_list "${_flag}" _found) + set(${_useVar} ${_found} CACHE BOOL "Use ${_flag} extension.") mark_as_advanced(${_useVar}) endif() endforeach() @@ -424,11 +413,11 @@ macro(OFA_HandleX86Options) # Generate lists of enabled/disabled flags list(REMOVE_DUPLICATES _check_extension_list) foreach(_flag ${_check_extension_list}) - _my_find(_available_extension_list "${_flag}" _found) + _ofa_find(_available_extension_list "${_flag}" _found) set(_useVar "USE_${_flag}") string(TOUPPER "${_useVar}" _useVar) string(REPLACE "." "_" _useVar "${_useVar}") - if(_found AND ${_useVar}) + if(${_useVar}) list(APPEND _enable_extension_list "${_flag}") else() list(APPEND _disable_extension_list "${_flag}") @@ -436,42 +425,48 @@ macro(OFA_HandleX86Options) endforeach() if(OFA_VERBOSE) - string(REPLACE ";" ", " _str "${_enable_extension_list}") - string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (enabled): ${_str}") - string(REPLACE ";" ", " _str "${_disable_extension_list}") - string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (disabled): ${_str}") + if(_enable_extension_list) + list(SORT _enable_extension_list) + string(REPLACE ";" ", " _str "${_enable_extension_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (enabled): ${_str}") + endif() + if(_disable_extension_list) + list(SORT _disable_extension_list) + string(REPLACE ";" ", " _str "${_disable_extension_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (disabled): ${_str}") + endif() endif() # 
Add compiler flags if(MSVC AND MSVC_VERSION GREATER 1700) - _my_find(_enable_extension_list "avx512f" _found) + _ofa_find(_enable_extension_list "avx512f" _found) if(_found) - AddCompilerFlag("/arch:AVX512" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + AddCXXCompilerFlag("/arch:AVX512" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _found) endif() if(NOT _found) - _my_find(_enable_extension_list "avx2" _found) + _ofa_find(_enable_extension_list "avx2" _found) if(_found) - AddCompilerFlag("/arch:AVX2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + AddCXXCompilerFlag("/arch:AVX2" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _found) endif() endif() if(NOT _found) - _my_find(_enable_extension_list "avx" _found) + _ofa_find(_enable_extension_list "avx" _found) if(_found) - AddCompilerFlag("/arch:AVX" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + AddCXXCompilerFlag("/arch:AVX" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _found) endif() endif() if(NOT _found) - _my_find(_enable_extension_list "sse2" _found) + _ofa_find(_enable_extension_list "sse2" _found) if(_found) - AddCompilerFlag("/arch:SSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("/arch:SSE2" FLAGS OFA_ARCHITECTURE_FLAGS) endif() endif() if(NOT _found) - _my_find(_enable_extension_list "sse" _found) + _ofa_find(_enable_extension_list "sse" _found) if(_found) - AddCompilerFlag("/arch:SSE" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("/arch:SSE" FLAGS OFA_ARCHITECTURE_FLAGS) endif() endif() foreach(_flag ${_enable_extension_list}) @@ -541,7 +536,7 @@ macro(OFA_HandleX86Options) foreach(_arch ${_march_flag_list}) if(DEFINED OFA_map_${_arch}) foreach(_flag ${OFA_map_${_arch}}) - AddCompilerFlag(${_flag} CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _ok) + AddCXXCompilerFlag(${_flag} FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) if(_ok) break() endif() @@ -555,12 +550,22 @@ macro(OFA_HandleX86Options) # This is the Intel compiler, so SSE2 is a very reasonable baseline. 
message(STATUS "[OFA] Did not recognize the requested architecture flag ${_arch}, falling back to SSE2") if(WIN32) - AddCompilerFlag("-QxSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-QxSSE2" FLAGS OFA_ARCHITECTURE_FLAGS) else() - AddCompilerFlag("-xSSE2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-xSSE2" FLAGS OFA_ARCHITECTURE_FLAGS) endif() endif() + # Set -m flag for enabled features + foreach(_flag ${_enable_extension_list}) + AddCXXCompilerFlag("-m${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) + endforeach(_flag) + + # Set -mno-feature flag for disabled features + foreach(_flag ${_disable_extension_list}) + AddCXXCompilerFlag("-mno-${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) + endforeach(_flag) + # TODO PGI/Cray/SunPro ... else() @@ -568,7 +573,7 @@ macro(OFA_HandleX86Options) # Set -march flag foreach(_flag ${_march_flag_list}) - AddCompilerFlag("-march=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-march=${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _good) if(_good) break() endif(_good) @@ -576,62 +581,12 @@ macro(OFA_HandleX86Options) # Set -m flag for enabled features foreach(_flag ${_enable_extension_list}) - AddCompilerFlag("-m${_flag}" CXX_RESULT _result) - if(_result) - set(_header FALSE) - if(_flag STREQUAL "sse3") - set(_header "pmmintrin.h") - elseif(_flag STREQUAL "ssse3") - set(_header "tmmintrin.h") - elseif(_flag STREQUAL "sse4.1") - set(_header "smmintrin.h") - elseif(_flag STREQUAL "sse4.2") - set(_header "nmmintrin.h") - elseif(_flag STREQUAL "sse4a") - set(_header "ammintrin.h") - elseif(_flag STREQUAL "avx") - set(_header "immintrin.h") - elseif(_flag STREQUAL "avx2") - set(_header "immintrin.h") - elseif(_flag STREQUAL "avx512*") - set(_header "immintrin.h") - elseif(_flag STREQUAL "fma4") - set(_header "x86intrin.h") - elseif(_flag STREQUAL "xop") - set(_header "x86intrin.h") - elseif(_flag STREQUAL "bmi") - set(_header "ammintrin.h") - elseif(_flag STREQUAL "bmi2") - set(_header 
"ammintrin.h") - elseif(_flag STREQUAL "rdrnd") - set(_header "immintrin.h") - elseif(_flag STREQUAL "rdpid") - set(_header "immintrin.h") - elseif(_flag STREQUAL "rdseed") - set(_header "immintrin.h") - endif() - set(_resultVar "HAVE_${_header}") - string(REPLACE "." "_" _resultVar "${_resultVar}") - if(_header) - CHECK_INCLUDE_FILE_CXX("${_header}" ${_resultVar} "-m${_flag}") - if(NOT ${_resultVar}) - set(_useVar "USE_${_flag}") - string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "." "_" _useVar "${_useVar}") - message(STATUS "disabling ${_useVar} because ${_header} is missing") - set(${_useVar} FALSE) - list(APPEND _disable_extension_list "${_flag}") - endif() - endif() - if(NOT _header OR ${_resultVar}) - list(APPEND OFA_ARCHITECTURE_FLAGS "-m${_flag}") - endif() - endif() + AddCXXCompilerFlag("-m${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) endforeach(_flag) # Set -mno-feature flag for disabled features foreach(_flag ${_disable_extension_list}) - AddCompilerFlag("-mno-${_flag}" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-mno-${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) endforeach(_flag) endif() endif() @@ -640,17 +595,17 @@ macro(OFA_HandleX86Options) # Compile code with profiling instrumentation if(TARGET_PROFILER STREQUAL "gprof") - AddCompilerFlag("-pg" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-pg" FLAGS OFA_ARCHITECTURE_FLAGS) elseif(TARGET_PROFILER STREQUAL "vtune") if (CMAKE_CXX_COMPILER_ID MATCHES "Intel") # Need to check if this also works on Windows - AddCompilerFlag("-g" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-debug inline-debug-info" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-D TBB_USE_THREADING_TOOLS" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-parallel-source-info=2" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-gline-tables-only" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-fdebug-info-for-profiling" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - AddCompilerFlag("-Xsprofile" CXX_FLAGS 
OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-g" FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-debug inline-debug-info" FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-D TBB_USE_THREADING_TOOLS" FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-parallel-source-info=2" FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-gline-tables-only" FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-fdebug-info-for-profiling" FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-Xsprofile" FLAGS OFA_ARCHITECTURE_FLAGS) endif() endif() From 5c5259d796278d0ae96b71795bfc7bac7f2a2912 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Wed, 19 Jan 2022 19:29:30 +0100 Subject: [PATCH 142/174] Update OFA - x86 completed --- cmake/OptimizeForArchitecture.cmake | 6 + cmake/ofa/AddCXXCompilerFlag.cmake | 17 +- .../ofa/{IntrinsicsX86.txt => ChecksX86.txt} | 18 +- cmake/ofa/HandleX86Options.cmake | 164 +++++++++++------- 4 files changed, 130 insertions(+), 75 deletions(-) rename cmake/ofa/{IntrinsicsX86.txt => ChecksX86.txt} (93%) diff --git a/cmake/OptimizeForArchitecture.cmake b/cmake/OptimizeForArchitecture.cmake index 6c070fae14..c0b7941d8f 100644 --- a/cmake/OptimizeForArchitecture.cmake +++ b/cmake/OptimizeForArchitecture.cmake @@ -16,6 +16,7 @@ # Optional inputs: # TARGET_ARCHITECTURE= specifies the target architecture (default=auto) # TARGET_PROFILER= specifies the target profiler (default=none) +# OFA_VERBOSE= prints verbose output (default=off) # # If any of the _broken flags are defined and set to true, # the OptimizeForArchitecture macro will consequently disable the @@ -23,6 +24,10 @@ # # Output: # OFA_ARCHITECTURE_FLAGS compiler flags optimized for the target architecture +# +# Internal variables: +# USE_ boolean variable holding the status of +# HAVE_ boolean variable holding the compiler's capability #============================================================================= # Copyright 2010-2016 Matthias Kretz @@ -122,6 +127,7 @@ 
macro(OptimizeForArchitecture) message(STATUS "Detected Host CPU: ${TARGET_ARCHITECTURE}") endif() + message(STATUS "Checking Host CPU features. This can take some time ...") if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "i686.*|i386.*|x86.*|amd64.*|x86_64.*|AMD64.*") OFA_HandleX86Options() elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(arm.*|ARM.*|aarch64.*|AARCH64.*)") diff --git a/cmake/ofa/AddCXXCompilerFlag.cmake b/cmake/ofa/AddCXXCompilerFlag.cmake index 74417f3e4c..25b674bf68 100644 --- a/cmake/ofa/AddCXXCompilerFlag.cmake +++ b/cmake/ofa/AddCXXCompilerFlag.cmake @@ -123,17 +123,17 @@ macro(AddCXXCompilerFlag _flag) endif() endforeach() - set(_check_include_file_cxx true) - set(_check_cxx_source_compiles true) + set(_check_include_file_cxx TRUE) + set(_check_cxx_source_compiles TRUE) # Check availability of header file(s) foreach(_header ${_headers}) - set(_resultVar "check_include_file_cxx_${_header}") + set(_resultVar "HAVE_${_header}") string(REGEX REPLACE "[-.+/:= ]" "_" _resultVar "${_resultVar}") check_include_file_cxx(${_header} ${_resultVar} "${_flag} ${_extra_flags}") if(NOT ${_resultVar}) - set(_check_include_file_cxx false) + set(_check_include_file_cxx FALSE) endif() endforeach() @@ -154,11 +154,10 @@ macro(AddCXXCompilerFlag _flag) else() set(_cxx_code "${_cxx_code}\nint main() { return 0; }") endif() - string(MD5 _hash "${_cxx_code}") set(_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") set(CMAKE_REQUIRED_FLAGS "${_flag} ${_extra_flags}") - set(_resultVar "check_cxx_source_compiles_${_flag}_${_hash}") + set(_resultVar "HAVE_${_flag}") string(REGEX REPLACE "[-.+/:= ]" "_" _resultVar "${_resultVar}") check_cxx_source_compiles("${_cxx_code}" ${_resultVar} # Some compilers do not fail with a bad flag @@ -182,14 +181,14 @@ macro(AddCXXCompilerFlag _flag) set(CMAKE_REQUIRED_FLAGS "${_CMAKE_REQUIRED_FLAGS}") if(NOT ${_resultVar}) - set(_check_cxx_source_compiles false) + set(_check_cxx_source_compiles FALSE) endif() if (DEFINED _result) if 
(${_check_include_file_cxx} AND ${_check_cxx_source_compiles}) - set(${_result} true) + set(${_result} TRUE) else() - set(${_result} false) + set(${_result} FALSE) endif() endif() diff --git a/cmake/ofa/IntrinsicsX86.txt b/cmake/ofa/ChecksX86.txt similarity index 93% rename from cmake/ofa/IntrinsicsX86.txt rename to cmake/ofa/ChecksX86.txt index fa3ee90d0b..5a9f35bd09 100644 --- a/cmake/ofa/IntrinsicsX86.txt +++ b/cmake/ofa/ChecksX86.txt @@ -1,7 +1,23 @@ -# List of x86 intrisics to check +# List of x86 checks # Format: [,];
;; # line starting with # are comments +# For each line of this file, OFA_HandleX86Options generates the code +# +# #include
+# int main() { +# name(list of parameters); +# return 0; +# } +# +# and compiles it with +# +# CXX -m -m +# +# or +# +# CXX -mno- -mno- + # MMX mmx;mmintrin.h;_mm_add_pi16;__m64(),__m64() diff --git a/cmake/ofa/HandleX86Options.cmake b/cmake/ofa/HandleX86Options.cmake index 69820e4fe3..c2daae408a 100644 --- a/cmake/ofa/HandleX86Options.cmake +++ b/cmake/ofa/HandleX86Options.cmake @@ -47,13 +47,13 @@ macro(OFA_HandleX86Options) AddCXXCompilerFlag("-march=native" FLAGS OFA_ARCHITECTURE_FLAGS) endif() endif() - + elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") - + # Step 1: Generate a list of compiler flags for the specific CPU set(_march_flag_list) set(_available_extension_list) - + # Define macros for Intel macro(_nehalem) list(APPEND _march_flag_list "nehalem") @@ -344,7 +344,7 @@ macro(OFA_HandleX86Options) list(SORT _available_extension_list) list(REMOVE_DUPLICATES _available_extension_list) - + if(OFA_VERBOSE) if(_march_flag_list) string(REPLACE ";" ", " _str "${_march_flag_list}") @@ -357,73 +357,106 @@ macro(OFA_HandleX86Options) message(STATUS "[OFA] Extensions (available): ${_str}") endif() endif() - - # Step 2: Apply compiler flags + if(NOT TARGET_ARCHITECTURE STREQUAL "none") set(_check_extension_list) set(_disable_extension_list) set(_enable_extension_list) - # Step 2.1: Enable/disable features based on compiler capabilities - file(READ ${CMAKE_SOURCE_DIR}/cmake/ofa/IntrinsicsX86.txt _intrinsics) - string(REPLACE ";" "|" _intrinsics "${_intrinsics}") - string(REPLACE "\n" ";" _intrinsics "${_intrinsics}") - - foreach (_intrinsic ${_intrinsics}) - if ("${_intrinsic}" MATCHES "^#" ) # Skip comment + # Step 2: Enable/disable feature flags based on available CPU + # features, user-defined USE_ variables and + # the capabilities of the host system's compiler and linker + file(READ ${CMAKE_SOURCE_DIR}/cmake/ofa/ChecksX86.txt _checks) + string(REPLACE ";" "|" _checks "${_checks}") + string(REPLACE "\n" ";" _checks "${_checks}") + + # Iterate over the list of 
checks line by line + foreach (_check ${_checks}) + if ("${_check}" MATCHES "^#" ) # Skip comment continue() endif() - string(REPLACE "|" ";" _intrinsic "${_intrinsic}") - list(GET _intrinsic 0 _intrinsic_flags) - list(GET _intrinsic 1 _intrinsic_header) - list(GET _intrinsic 2 _intrinsic_name) - list(GET _intrinsic 3 _intrinsic_params) - string(REPLACE "," ";" _intrinsic_flags "${_intrinsic_flags}") - list(GET _intrinsic_flags 0 _flag) - string(REPLACE ";" " -m" _intrinsic_flags "-m${_intrinsic_flags}") - list(APPEND _check_extension_list "${_flag}") + # Extract extra CPU extensions, header files, function name, and parameters + string(REPLACE "|" ";" _check "${_check}") + list(GET _check 0 _check_extensions) + list(GET _check 1 _check_headers) + list(GET _check 2 _check_function) + list(GET _check 3 _check_params) + + # Convert list of extensions into compiler flags + string(REPLACE "," ";" _check_extensions "${_check_extensions}") + list(GET _check_extensions 0 _extension) + string(REPLACE ";" " -m" _check_flags "-m${_check_extensions}") + list(APPEND _check_extension_list "${_extension}") + + # Define USE_<_extension> variable + set(_useVar "USE_${_extension}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "." 
"_" _useVar "${_useVar}") + + # Set USE_<_extension> if not set externally + if(NOT DEFINED ${_useVar}) + _ofa_find(_available_extension_list "${_extension}" _found) + set(${_useVar} ${_found}) + endif() + # Apply compiler-specific fixes if(CMAKE_CXX_COMPILER_ID MATCHES "GNU") - set(_code "#define __int32 long\n#define __int64 long long\nint main() { ${_intrinsic_name}(${_intrinsic_params})\; return 0\; }") + # GNU GCC does not provide definitions for __int32 and __in64 + set(_code "#define __int32 long\n#define __int64 long long\nint main() { ${_check_function}(${_check_params})\; return 0\; }") else() - set(_code "int main() { ${_intrinsic_name}(${_intrinsic_params})\; return 0\; }") + set(_code "int main() { ${_check_function}(${_check_params})\; return 0\; }") endif() - - AddCXXCompilerFlag("-m${_flag}" - EXTRA_FLAGS ${_intrinsic_flags} - HEADERS ${_intrinsic_header} - CODE "${_code}" - RESULT _ok) - - set(_useVar "USE_${_flag}") - string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "." 
"_" _useVar "${_useVar}") - - if(NOT ${_ok}) - set(${_useVar} false CACHE BOOL "Use ${_flag} extension.") - mark_as_advanced(${_useVar}) + + if(${_useVar}) + # Check if the compiler supports the -m<_extension> flag and + # can compile the provided test code with it + AddCXXCompilerFlag("-m${_extension}" + EXTRA_FLAGS ${_check_flags} + HEADERS ${_check_headers} + CODE "${_code}" + RESULT _ok) + if(NOT ${_ok}) + set(${_useVar} FALSE CACHE BOOL "Use ${_extension} extension.") + else() + set(${_useVar} ${${_useVar}} CACHE BOOL "Use ${_extension} extension.") + endif() else() - _ofa_find(_available_extension_list "${_flag}" _found) - set(${_useVar} ${_found} CACHE BOOL "Use ${_flag} extension.") - mark_as_advanced(${_useVar}) + set(${_useVar} ${${_useVar}} CACHE BOOL "Use ${_extension} extension.") endif() + mark_as_advanced(${_useVar}) endforeach() # Generate lists of enabled/disabled flags list(REMOVE_DUPLICATES _check_extension_list) - foreach(_flag ${_check_extension_list}) - _ofa_find(_available_extension_list "${_flag}" _found) - set(_useVar "USE_${_flag}") + foreach(_extension ${_check_extension_list}) + _ofa_find(_available_extension_list "${_extension}" _found) + set(_useVar "USE_${_extension}") string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "." "_" _useVar "${_useVar}") + string(REPLACE "." "_" _useVar "${_useVar}") if(${_useVar}) - list(APPEND _enable_extension_list "${_flag}") + set(_haveVar "HAVE__m${_extension}") + string(REPLACE "." "_" _haveVar "${_haveVar}") + if(NOT ${_haveVar}) + if(OFA_VERBOSE) + message(STATUS "[OFA] Ignoring -m${_extension} extension because checks failed") + endif() + continue() + endif() + list(APPEND _enable_extension_list "${_extension}") else() - list(APPEND _disable_extension_list "${_flag}") + set(_haveVar "HAVE__mno_${_extension}") + string(REPLACE "." 
"_" _haveVar "${_haveVar}") + if(NOT ${_haveVar}) + if(OFA_VERBOSE) + message(STATUS "[OFA] Ignoring -mno-${_extension} extension because checks failed") + endif() + continue() + endif() + list(APPEND _disable_extension_list "${_extension}") endif() endforeach() - + if(OFA_VERBOSE) if(_enable_extension_list) list(SORT _enable_extension_list) @@ -438,7 +471,7 @@ macro(OFA_HandleX86Options) message(STATUS "[OFA] Extensions (disabled): ${_str}") endif() endif() - + # Add compiler flags if(MSVC AND MSVC_VERSION GREATER 1700) _ofa_find(_enable_extension_list "avx512f" _found) @@ -469,14 +502,15 @@ macro(OFA_HandleX86Options) AddCXXCompilerFlag("/arch:SSE" FLAGS OFA_ARCHITECTURE_FLAGS) endif() endif() - foreach(_flag ${_enable_extension_list}) - string(TOUPPER "${_flag}" _flag) - string(REPLACE "." "_" _flag "__${_flag}__") - add_definitions("-D${_flag}") - endforeach(_flag) + foreach(_extension ${_enable_extension_list}) + string(TOUPPER "${_extension}" _extension) + string(REPLACE "." "_" _extension "__${_extension}__") + add_definitions("-D${_extension}") + endforeach(_extension) elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel" OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") + if(WIN32) # Intel (on Windows) set(OFA_map_knl "-QxKNL;-QxMIC-AVX512") @@ -556,16 +590,16 @@ macro(OFA_HandleX86Options) endif() endif() - # Set -m flag for enabled features - foreach(_flag ${_enable_extension_list}) - AddCXXCompilerFlag("-m${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) - endforeach(_flag) + # Set -m<_extension> flag for enabled features + foreach(_extension ${_enable_extension_list}) + AddCXXCompilerFlag("-m${_extension}" FLAGS OFA_ARCHITECTURE_FLAGS) + endforeach(_extension) + + # Set -mno-<_extension> flag for disabled features + foreach(_extension ${_disable_extension_list}) + AddCXXCompilerFlag("-mno-${_extension}" FLAGS OFA_ARCHITECTURE_FLAGS) + endforeach(_extension) - # Set -mno-feature flag for disabled features - foreach(_flag ${_disable_extension_list}) - 
AddCXXCompilerFlag("-mno-${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) - endforeach(_flag) - # TODO PGI/Cray/SunPro ... else() @@ -592,7 +626,7 @@ macro(OFA_HandleX86Options) endif() endif() - + # Compile code with profiling instrumentation if(TARGET_PROFILER STREQUAL "gprof") AddCXXCompilerFlag("-pg" FLAGS OFA_ARCHITECTURE_FLAGS) @@ -610,7 +644,7 @@ macro(OFA_HandleX86Options) endif() list(REMOVE_DUPLICATES OFA_ARCHITECTURE_FLAGS) - + if(OFA_VERBOSE) string(REPLACE ";" ", " _str "${OFA_ARCHITECTURE_FLAGS}") message(STATUS "OFA_ARCHITECTURE_FLAGS: " ${_str}) From 1cc29b0a6379d2506639467e45c16700016d55db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Wed, 19 Jan 2022 20:38:50 +0100 Subject: [PATCH 143/174] Updated Gitlab CI --- .gitlab-ci.yml | 241 +++++++++++++++++++++++++++---------------------- 1 file changed, 132 insertions(+), 109 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a2ac6682ea..db85b11aef 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,6 +15,11 @@ # 3. Building and extensive testing of commits to branch 'ci_test' [to be added] # # 4. Coverity scan of commits to branch 'coverity_scan' [to be added] +# +# 5. Test installation and deployment +# +# 6. 
Building docker images for commits to 'stable' branch and upload +# to https://hub.docker.com/r/mmoelle1/gismo ################################################################################ ################################################################################ @@ -22,135 +27,148 @@ # 'coverity_scan' and sending of the ctest results to the CDASH server ################################################################################ -#------------------------------------------------------------------------------- -# Clang 7-11, C++11,14,17,20 -#------------------------------------------------------------------------------- - -# Clang 8, C++11, Release -linux_x86_64_clang8_cxx11_release_double_int32t: +.test:linux:base: tags: - linux stage: test - image: silkeh/clang:8 + image: $IMAGE + variables: + BUILD_TYPE: "Release" + GENERATOR: "Ninja" + GISMO_SUBMODULES: "''" + LABELS_FOR_SUBPROJECTS: "'gismo;examples;unittests;doc-snippets'" script: - apt-get update -y - apt-get install cmake ninja-build -y - - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=ON -D CTEST_CMAKE_GENERATOR=Ninja -D CNAME=/usr/local/bin/clang -D CXXNAME=/usr/local/bin/clang++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=11;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=double;-DGISMO_INDEX_TYPE=int32_t;-DGISMO_WITH_ONURBS=ON' -Q + - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=ON -D CTEST_CMAKE_GENERATOR=Ninja -D CNAME=/usr/local/bin/clang -D 
CXXNAME=/usr/local/bin/clang++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES="$GISMO_SUBMODULES" -D LABELS_FOR_SUBPROJECTS="$LABELS_FOR_SUBPROJECTS" -D CMAKE_ARGS="$CMAKE_ARGS" -Q except: - external_pull_requests - ci_test - coverity_scan -# Clang 9, C++14, Release -linux_x86_64_clang9_cxx14_release_longdouble_int64t: - tags: - - linux - stage: test - image: silkeh/clang:9 - script: - - apt-get update -y - - apt-get install cmake -y - - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR="Unix Makefiles" -D CNAME=/usr/local/bin/clang -D CXXNAME=/usr/local/bin/clang++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=14;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=long double;-DGISMO_INDEX_TYPE=int64_t;-DGISMO_WITH_ONURBS=ON' -Q - except: - - external_pull_requests - - ci_test - - coverity_scan +#------------------------------------------------------------------------------- +# Clang 7-11, C++11,14,17,20 +#------------------------------------------------------------------------------- + +# Clang 8, C++11, Release +linux_x86_64_clang8_cxx11_release_double_int32t: + extends: .test:linux:base + variables: + IMAGE: "silkeh/clang:8" + CNAME: "/usr/local/bin/clang" + CXXNAME: "/usr/local/bin/clang++" + CMAKE_ARGS: "'-DCMAKE_CXX_STANDARD=11;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=double;-DGISMO_INDEX_TYPE=int32_t;-DGISMO_WITH_ONURBS=ON'" -# Clang 10, C++17, Release -linux_x86_64_clang10_cxx17_release_mpreal_long: - tags: - - linux - stage: test - image: silkeh/clang:10 - script: - - apt-get update -y - - apt-get install cmake libmpfr-dev ninja-build -y - - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D 
CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR=Ninja -D CNAME=/usr/local/bin/clang -D CXXNAME=/usr/local/bin/clang++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=17;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=mpfr::mpreal;-DGISMO_INDEX_TYPE=long;-DGISMO_WITH_ONURBS=ON' -Q - except: - - external_pull_requests - - ci_test - - coverity_scan +# # Clang 9, C++14, Release +# linux_x86_64_clang9_cxx14_release_longdouble_int64t: +# tags: +# - linux +# stage: test +# image: silkeh/clang:9 +# script: +# - apt-get update -y +# - apt-get install cmake -y +# - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR="Unix Makefiles" -D CNAME=/usr/local/bin/clang -D CXXNAME=/usr/local/bin/clang++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=14;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=long double;-DGISMO_INDEX_TYPE=int64_t;-DGISMO_WITH_ONURBS=ON' -Q +# except: +# - external_pull_requests +# - ci_test +# - coverity_scan -# Clang 11, C++20, Release -linux_x86_64_clang11_cxx20_release_mpq_long: - tags: - - linux - stage: test - image: silkeh/clang:11 - script: - - apt-get update -y - - apt-get install cmake libgmp-dev -y - - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release 
-D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR="Unix Makefiles" -D CNAME=/usr/local/bin/clang -D CXXNAME=/usr/local/bin/clang++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=20;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=mpq_class;-DGISMO_INDEX_TYPE=long;-DGISMO_WITH_ONURBS=ON' -Q - except: - - external_pull_requests - - ci_test - - coverity_scan +# # Clang 10, C++17, Release +# linux_x86_64_clang10_cxx17_release_mpreal_long: +# tags: +# - linux +# stage: test +# image: silkeh/clang:10 +# script: +# - apt-get update -y +# - apt-get install cmake libmpfr-dev ninja-build -y +# - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR=Ninja -D CNAME=/usr/local/bin/clang -D CXXNAME=/usr/local/bin/clang++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=17;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=mpfr::mpreal;-DGISMO_INDEX_TYPE=long;-DGISMO_WITH_ONURBS=ON' -Q +# except: +# - external_pull_requests +# - ci_test +# - coverity_scan +# # Clang 11, C++20, Release +# linux_x86_64_clang11_cxx20_release_mpq_long: +# tags: +# - linux +# stage: test +# image: silkeh/clang:11 +# script: +# - apt-get update -y +# - apt-get install cmake libgmp-dev -y +# - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR="Unix Makefiles" -D CNAME=/usr/local/bin/clang -D CXXNAME=/usr/local/bin/clang++ -D CTEST_TEST_TIMEOUT=150 -D 
GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=20;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=mpq_class;-DGISMO_INDEX_TYPE=long;-DGISMO_WITH_ONURBS=ON' -Q +# except: +# - external_pull_requests +# - ci_test +# - coverity_scan -#------------------------------------------------------------------------------- -# GCC 6-10, C++11,14,17,20 -#------------------------------------------------------------------------------- + +# #------------------------------------------------------------------------------- +# # GCC 6-10, C++11,14,17,20 +# #------------------------------------------------------------------------------- -# GCC 7, C++11, Release -linux_x86_64_gcc7_cxx11_release_mpreal_long: - tags: - - linux - stage: test - image: gcc:7 - script: - - apt-get update -y - - apt-get install cmake libmpfr-dev -y - - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR="Unix Makefiles" -D CNAME=/usr/local/bin/gcc -D CXXNAME=/usr/local/bin/g++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=11;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=mpfr::mpreal;-DGISMO_INDEX_TYPE=long;-DGISMO_WITH_ONURBS=ON' -Q - except: - - external_pull_requests - - ci_test - - coverity_scan +# # GCC 7, C++11, Release +# linux_x86_64_gcc7_cxx11_release_mpreal_long: +# tags: +# - linux +# stage: test +# image: gcc:7 +# script: +# - apt-get update -y +# - apt-get install cmake libmpfr-dev -y +# - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo 
-D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR="Unix Makefiles" -D CNAME=/usr/local/bin/gcc -D CXXNAME=/usr/local/bin/g++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=11;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=mpfr::mpreal;-DGISMO_INDEX_TYPE=long;-DGISMO_WITH_ONURBS=ON' -Q +# except: +# - external_pull_requests +# - ci_test +# - coverity_scan -# GCC 8, C++14, Release -linux_x86_64_gcc8_cxx14_release_longdouble_int64t: - tags: - - linux - stage: test - image: gcc:8 - script: - - apt-get update -y - - apt-get install cmake ninja-build -y - - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR=Ninja -D CNAME=/usr/local/bin/gcc -D CXXNAME=/usr/local/bin/g++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=14;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=long double;-DGISMO_INDEX_TYPE=int64_t;-DGISMO_WITH_ONURBS=ON' -Q - except: - - external_pull_requests - - ci_test - - coverity_scan +# # GCC 8, C++14, Release +# linux_x86_64_gcc8_cxx14_release_longdouble_int64t: +# tags: +# - linux +# stage: test +# image: gcc:8 +# script: +# - apt-get update -y +# - apt-get install cmake ninja-build -y +# - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR=Ninja -D CNAME=/usr/local/bin/gcc -D CXXNAME=/usr/local/bin/g++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D 
LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=14;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=long double;-DGISMO_INDEX_TYPE=int64_t;-DGISMO_WITH_ONURBS=ON' -Q +# except: +# - external_pull_requests +# - ci_test +# - coverity_scan -# GCC 9, C++17, Release -linux_x86_64_gcc9_cxx17_release_double_int32t: +# # GCC 9, C++17, Release +# linux_x86_64_gcc9_cxx17_release_double_int32t: - tags: - - linux - stage: test - image: gcc:9 - script: - - apt-get update -y - - apt-get install cmake -y - - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR="Unix Makefiles" -D CNAME=/usr/local/bin/gcc -D CXXNAME=/usr/local/bin/g++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=17;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=double;-DGISMO_INDEX_TYPE=int32_t;-DGISMO_WITH_OCC=ON;-DGISMO_WITH_ONURBS=ON' -Q - except: - - external_pull_requests - - ci_test - - coverity_scan +# tags: +# - linux +# stage: test +# image: gcc:9 +# script: +# - apt-get update -y +# - apt-get install cmake -y +# - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR="Unix Makefiles" -D CNAME=/usr/local/bin/gcc -D CXXNAME=/usr/local/bin/g++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D 
CMAKE_ARGS='-DCMAKE_CXX_STANDARD=17;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=double;-DGISMO_INDEX_TYPE=int32_t;-DGISMO_WITH_OCC=ON;-DGISMO_WITH_ONURBS=ON' -Q +# except: +# - external_pull_requests +# - ci_test +# - coverity_scan -# GCC 10, C++20, Release -linux_x86_64_gcc10_cxx20_release_float_int: - tags: - - linux - stage: test - image: gcc:10 - script: - - apt-get update -y - - apt-get install cmake ninja-build -y - - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR=Ninja -D CNAME=/usr/local/bin/gcc -D CXXNAME=/usr/local/bin/g++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=20;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=float;-DGISMO_INDEX_TYPE=int;-DGISMO_WITH_ONURBS=ON' -Q - except: - - external_pull_requests - - ci_test - - coverity_scan +# # GCC 10, C++20, Release +# linux_x86_64_gcc10_cxx20_release_float_int: +# tags: +# - linux +# stage: test +# image: gcc:10 +# script: +# - apt-get update -y +# - apt-get install cmake ninja-build -y +# - ctest -S /builds/gismo-ci/gismo/cmake/ctest_script.cmake -D CTEST_BUILD_NAME="$CI_JOB_NAME" -D CTEST_SITE="$CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA [gitlab-ci]" -D CTEST_SOURCE_DIRECTORY=/builds/gismo-ci/gismo -D CTEST_CONFIGURATION_TYPE=Release -D UPDATE_REPO=OFF -D CTEST_CMAKE_GENERATOR=Ninja -D CNAME=/usr/local/bin/gcc -D CXXNAME=/usr/local/bin/g++ -D CTEST_TEST_TIMEOUT=150 -D GISMO_SUBMODULES='' -D LABELS_FOR_SUBPROJECTS='gismo;examples;unittests;doc-snippets' -D CMAKE_ARGS='-DCMAKE_CXX_STANDARD=20;-DGISMO_BUILD_UNITTESTS=ON;-DGISMO_COEFF_TYPE=float;-DGISMO_INDEX_TYPE=int;-DGISMO_WITH_ONURBS=ON' -Q +# except: +# - external_pull_requests +# - ci_test +# - coverity_scan 
################################################################################ # 2. Building and testing of external pull requests (PRs) @@ -205,3 +223,8 @@ install_and_deploy_linux: only: - stable - external_pull_requests + +################################################################################ +# 6. Building docker images for commits to 'stable' branch and upload +# to https://hub.docker.com/r/mmoelle1/gismo +################################################################################ From 40ce2c9880c1ca8723d0728c13e57de9dafeb05a Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Thu, 20 Jan 2022 16:58:04 +0100 Subject: [PATCH 144/174] Update OFA - x86 completed --- cmake/ofa/AddCXXCompilerFlag.cmake | 6 +- cmake/ofa/ChecksX86.txt | 193 +++++++++++++++++-------- cmake/ofa/HandleX86Options.cmake | 219 ++++++++++++++++++++--------- 3 files changed, 296 insertions(+), 122 deletions(-) diff --git a/cmake/ofa/AddCXXCompilerFlag.cmake b/cmake/ofa/AddCXXCompilerFlag.cmake index 25b674bf68..1ac2974e5e 100644 --- a/cmake/ofa/AddCXXCompilerFlag.cmake +++ b/cmake/ofa/AddCXXCompilerFlag.cmake @@ -170,6 +170,7 @@ macro(AddCXXCompilerFlag _flag) FAIL_REGEX "warning D9002" # MSVC FAIL_REGEX "[Uu]nknown option" # HP FAIL_REGEX "[Ww]arning: [Oo]ption" # SunPro + FAIL_REGEX "[Ww]arning: illegal use of -xarch option" # SunPro FAIL_REGEX "command option .* is not recognized" # XL FAIL_REGEX "WARNING: unknown flag:" # Open64 FAIL_REGEX "command line error" # ICC @@ -177,9 +178,10 @@ macro(AddCXXCompilerFlag _flag) FAIL_REGEX "#10236:" # ICC: File not found FAIL_REGEX " #10159: " # ICC FAIL_REGEX " #10353: " # ICC: option '-mfma' ignored, suggest using '-march=core-avx2' - ) + FAIL_REGEX " #10006: " # ICC: ignoring unknown option '-mavx512fp16' + ) set(CMAKE_REQUIRED_FLAGS "${_CMAKE_REQUIRED_FLAGS}") - + if(NOT ${_resultVar}) set(_check_cxx_source_compiles FALSE) endif() diff --git a/cmake/ofa/ChecksX86.txt b/cmake/ofa/ChecksX86.txt index 5a9f35bd09..9edc1e9a4c 
100644 --- a/cmake/ofa/ChecksX86.txt +++ b/cmake/ofa/ChecksX86.txt @@ -1,82 +1,126 @@ # List of x86 checks -# Format: [,];
;; -# line starting with # are comments -# For each line of this file, OFA_HandleX86Options generates the code +# FORMAT: +# <extension>[,<extra extensions>];<headers>;<name>;<parameters>;[<extension alias>] # -# #include
+# lines starting with # are comments +# lines starting with push_enable: start a block of tests enabled for the given compilers only +# lines starting with pop_enable: end a block of tests enabled for the given compilers only +# lines starting with push_disable: start a block of tests disabled for the given compilers +# lines starting with pop_disable: end a block of tests disabled for the given compilers + +# DESCRIPTION: +# For each line of this file, HandleX86Options generates the code snippet +# +# #include <header0> +# #include <header1> +# ... # int main { -# name(list of parameters); +# name(parameter0, parameter1, ...); # return 0; # } # -# and compiles it with +# and compiles it with, e.g. +# +# gcc -m<extension> -m<extra extension> +# +# if the extension should be enabled and +# +# gcc -mno-<extension> -mno-<extra extension> # -# CXX -m -m +# if the extension should be disabled. In the above example, the +# compiler name 'gcc' and the flag prefixes '-m' and '-mno-' will be +# set properly by HandleX86Options. # -# or +# EXTENSION ALIAS: +# By default, it is assumed that the name of the extension, e.g., +# avx512f, coincides with the name of the compiler flag to be used to +# enable/disable it, e.g., -mno-avx512f. Some compilers like Oracle's +# SunPro have non-canonical naming conventions, +# cf. https://docs.oracle.com/cd/E77782_01/html/E77792/gqexw.html. # -# CXX -m-no -m-no +# In this case, the optional parameter <extension alias> can be used +# to specify the name of the extension as reported by the system, +# whereas the compiler-specific extension flag(s) are given in +# <extension> and [<extra extensions>], respectively. +# +# ENABLING/DISABLING OF CHECKS: +# Checks can be explicitly disabled for particular compilers by placing +# them inside a push_disable/pop_disable block, e.g. +# +# push_disable:SunPro,IntelLLVM +# +# pop_disable:SunPro +# +# Similarly, checks can be explicitly enabled for particular compilers +# by placing them inside a push_enable/pop_enable block, e.g.
+# +# push_enable:SunPro +# +# pop_enable:SunPro + +# Oracle's SunPro compiler fails these checks +push_disable:SunPro # MMX mmx;mmintrin.h;_mm_add_pi16;__m64(),__m64() -# SSE/SSE2/SSE3/SSE4.1/SSE4.2/SSE4A/FMA -sse;xmmintrin.h;_mm_add_ps;__m128(),__m128() -sse2;emmintrin.h;_mm_add_epi16;__m128i(),__m128i() -sse3;pmmintrin.h;_mm_addsub_pd;__m128d(),__m128d() -ssse3;tmmintrin.h;_mm_hadd_epi16;__m128i(),__m128i() -sse4.1;smmintrin.h;_mm_max_epi32;__m128i(),__m128i() -sse4.2;nmmintrin.h;_mm_cmpgt_epi64;__m128i(),__m128i() -sse4a;ammintrin.h;_mm_extract_si64;__m128i(),__m128i() -avx;immintrin.h;_mm256_add_pd;__m256d(),__m256d() -avx2;immintrin.h;_mm256_hadd_epi16;__m256i(),__m256i() -fma;immintrin.h;_mm_fmadd_pd;__m128d(),__m128d(),__m128d() +# SSE/SSE2/SSE3/SSE4.1/SSE4.2/SSE4A/AVX/AVX2/FMA +avx;immintrin.h;_mm256_add_pd;_mm256_setzero_pd(),_mm256_setzero_pd() +avx2;immintrin.h;_mm256_hadd_epi16;_mm256_setzero_si256(),_mm256_setzero_si256() +fma;immintrin.h;_mm_fmadd_pd;_mm_setzero_pd(),_mm_setzero_pd(),_mm_setzero_pd() +sse2;emmintrin.h;_mm_add_epi16;_mm_setzero_si128(),_mm_setzero_si128() +sse3;pmmintrin.h;_mm_addsub_pd;_mm_setzero_pd(),_mm_setzero_pd() +sse4.1;smmintrin.h;_mm_max_epi32;_mm_setzero_si128(),_mm_setzero_si128() +sse4.2;nmmintrin.h;_mm_cmpgt_epi64;_mm_setzero_si128(),_mm_setzero_si128() +sse4a;ammintrin.h;_mm_extract_si64;_mm_setzero_si128(),_mm_setzero_si128() +sse;xmmintrin.h;_mm_add_ps;_mm_setzero_ps(),_mm_setzero_ps() +ssse3;tmmintrin.h;_mm_hadd_epi16;_mm_setzero_si128(),_mm_setzero_si128() # AVX-VNNI -avxvnni;immintrin.h;_mm_dpbusd_avx_epi32;__m128i(),__m128i(),__m128i() +avxvnni;immintrin.h;_mm_dpbusd_avx_epi32;_mm_setzero_si128(),_mm_setzero_si128(),_mm_setzero_si128() # AVX-512 -avx512f;immintrin.h;_mm512_abs_epi32;__m512i() -avx512bw;immintrin.h;_mm512_abs_epi16;__m512i() +avx5124fmaps;immintrin.h;_mm_4fmadd_ss;_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),_mm_setzero_ps(),new __m128[1] 
+avx5124vnniw;immintrin.h;_mm512_4dpwssd_epi32;_mm512_setzero_si512(),_mm512_setzero_si512(),_mm512_setzero_si512(),_mm512_setzero_si512(),_mm512_setzero_si512(),new __m128i[1] +avx512bf16,avx512vl;immintrin.h;_mm_cvtne2ps_pbh;_mm_setzero_ps(),_mm_setzero_ps() +avx512bitalg,avx512vl;immintrin.h;_mm_popcnt_epi16;_mm_setzero_si128() +avx512bw;immintrin.h;_mm512_abs_epi16;_mm512_setzero_si512() avx512cd;immintrin.h;_mm512_broadcastmb_epi64;__mmask8() -avx512dq;immintrin.h;_mm512_and_pd;__m512d(),__m512d() -avx512er;immintrin.h;_mm512_exp2a23_pd;__m512d() -avx512ifma;immintrin.h;_mm512_maskz_madd52hi_epu64;__mmask8(),__m512i(),__m512i(),__m512i() -avx512pf;immintrin.h;_mm512_prefetch_i32scatter_pd;NULL,__m256i(),(int)1,_MM_HINT_T0 -avx512vl,avx512f;immintrin.h;_mm_abs_epi64;__m128i() -avx512vpopcntdq,avx512vl;immintrin.h;_mm_popcnt_epi64;__m128i() -avx5124fmaps;immintrin.h;_mm_4fmadd_ss;__m128(),__m128(),__m128(),__m128(),__m128(),new __m128[1] -avx5124vnniw;immintrin.h;_mm512_4dpwssd_epi32;__m512i(),__m512i(),__m512i(),__m512i(),__m512i(),new __m128i[1] -avx512bf16,avx512vl;immintrin.h;_mm_cvtne2ps_pbh;__m128(),__m128() -avx512bitalg,avx512vl;immintrin.h;_mm_popcnt_epi16;__m128i() -avx512vbmi;immintrin.h;_mm512_permutex2var_epi8;__m512i(),__m512i(),__m512i() -avx512vbmi2,avx512vl;immintrin.h;_mm_mask_compress_epi16;__m128i(),__mmask8(),__m128i() -avx512vnni,avx512vl;immintrin.h;_mm_dpbusd_epi32;__m128i(),__m128i(),__m128i() -avx512vp2intersect,avx512vl;immintrin.h;_mm_2intersect_epi32;__m128i(),__m128i(),new __mmask8[1],new __mmask8[1] -avx512fp16,avx512vl;immintrin.h;_mm_add_ph;__m128h(),__m128h() +avx512dq;immintrin.h;_mm512_and_pd;_mm512_setzero_pd(),_mm512_setzero_pd() +avx512er;immintrin.h;_mm512_exp2a23_pd;_mm512_setzero_pd() +avx512f;immintrin.h;_mm512_abs_epi32;_mm512_setzero_si512() +avx512fp16,avx512vl;immintrin.h;_mm_add_ph;_mm_setzero_ph(),_mm_setzero_ph() 
+avx512ifma;immintrin.h;_mm512_maskz_madd52hi_epu64;__mmask8(),_mm512_setzero_si512(),_mm512_setzero_si512(),_mm512_setzero_si512() +avx512pf;immintrin.h;_mm512_prefetch_i32scatter_pd;NULL,_mm256_setzero_si256(),(int)1,_MM_HINT_T0 +avx512vbmi2,avx512vl;immintrin.h;_mm_mask_compress_epi16;_mm_setzero_si128(),__mmask8(),_mm_setzero_si128() +avx512vbmi;immintrin.h;_mm512_permutex2var_epi8;_mm512_setzero_si512(),_mm512_setzero_si512(),_mm512_setzero_si512() +avx512vl,avx512f;immintrin.h;_mm_abs_epi64;_mm_setzero_si128() +avx512vnni,avx512vl;immintrin.h;_mm_dpbusd_epi32;_mm_setzero_si128(),_mm_setzero_si128(),_mm_setzero_si128() +avx512vp2intersect,avx512vl;immintrin.h;_mm_2intersect_epi32;_mm_setzero_si128(),_mm_setzero_si128(),new __mmask8[1],new __mmask8[1] +avx512vpopcntdq,avx512vl;immintrin.h;_mm_popcnt_epi64;_mm_setzero_si128() # AMX -amx-bf16;immintrin.h;_tile_dpbf16ps;__tile(),__tile(),__tile() -amx-int8;immintrin.h;_tile_dpbssd;__tile(),__tile(),__tile() -amx-tile;immintrin.h;_tile_zero;__tile() +amx-bf16;immintrin.h;_tile_dpbf16ps;0,1,2 +amx-int8;immintrin.h;_tile_dpbssd;0,1,2 +amx-tile;immintrin.h;_tile_zero;0 # Other -adx;immintrin.h;_addcarryx_u32;(unsigned char)0,(unsigned int)1,(unsigned int)1,new unsigned int[1] -aes;wmmintrin.h;_mm_aesdec_si128;__m128i(),__m128i() -bmi;immintrin.h;_andn_u32;(unsigned int)1,(unsigned int)1 +adx;immintrin.h;_addcarryx_u32;(unsigned char)0,(unsigned int)1,(unsigned int)1,new unsigned int[1];adcx +aes;wmmintrin.h;_mm_aesdec_si128;_mm_setzero_si128(),_mm_setzero_si128() bmi2;immintrin.h;_bzhi_u32;(unsigned int)1,(unsigned int)1 +bmi;immintrin.h;_andn_u32;(unsigned int)1,(unsigned int)1 cldemote;immintrin.h;_mm_cldemote;(void const*)NULL clflushopt;immintrin.h;_mm_clflushopt;(void const*)NULL clwb;immintrin.h;_mm_clwb;(void const*)NULL enqcmd;immintrin.h;_enqcmd;(void*)NULL,(void const*)NULL -fp16c;emmintrin.h;_mm_cvtph_ps;__m128i() +f16c;emmintrin.h;_mm_cvtph_ps;_mm_setzero_si128() fsgsbase;immintrin.h;_readfsbase_u32; 
fxsr;immintrin.h;_fxrstor;(void*)NULL -gfni,avx512vl;immintrin.h;_mm_gf2p8mul_epi8;__m128i(),__m128i() +gfni,avx512vl;immintrin.h;_mm_gf2p8mul_epi8;_mm_setzero_si128(),_mm_setzero_si128() hreset;immintrin.h;_hreset;1 invpcid;immintrin.h;_invpcid;(unsigned int)1,(void*)NULL -keylocker;immintrin.h;_mm_aesdec128kl_u8;new __m128i[1],__m128i(),(const void*)NULL +keylocker;immintrin.h;_mm_aesdec128kl_u8;new __m128i[1],_mm_setzero_si128(),(const void*)NULL keylocker_wide;immintrin.h;_mm_aesdecwide128kl_u8;new __m128i[1],(const __m128i*)new __m128i[1], (const void*)NULL lzcnt;immintrin.h;_lzcnt_u32;(unsigned int)1 monitor;pmmintrin.h;_mm_monitor;(void const*)NULL,(unsigned)1,(unsigned)1 @@ -84,25 +128,60 @@ movbe;immintrin.h;_loadbe_i16;(void const*)NULL movdir64b;immintrin.h;_movdir64b;(void*)NULL,(const void*)NULL movdiri;immintrin.h;_directstoreu_u32;(void*)NULL,(unsigned int)1 mpx;immintrin.h;_bnd_chk_ptr_lbounds;(const void*)NULL -pclmulqdq;wmmintrin.h;_mm_clmulepi64_si128;__m128i(),__m128i(),(const int)0 +pclmulqdq;wmmintrin.h;_mm_clmulepi64_si128;_mm_setzero_si128(),_mm_setzero_si128(),(const int)0;pclmul pconfig;immintrin.h;_pconfig_u32;(const int)1,new size_t[1] -popcnt;immintrin.h;_popcnt32;(int)1 +pku;;exit;0 +popcnt;immintrin.h;_mm_popcnt_u32;(unsigned int)1 +prefetchw;;_m_prefetchw;(void*)NULL;prfchw prefetchwt1;xmmintrin.h;_mm_prefetch;(char const*)NULL,(int)1 +ptwrite;immintrin.h;_ptwrite32;(unsigned int)0 rdpid;immintrin.h;_rdpid_u32; -rdrand;immintrin.h;_rdrand16_step;(unsigned short*)new unsigned short[1] +rdrnd;immintrin.h;_rdrand16_step;(unsigned short*)new unsigned short[1] rdseed;immintrin.h;_rdseed16_step;(unsigned short*)new unsigned short[1] rdtscp;immintrin.h;__rdtscp;(unsigned int*)NULL rtm;immintrin.h;_xend; serialize;immintrin.h;_serialize; -sha;immintrin.h;_mm_sha1msg1_epu32;__m128i(),__m128i() +sha;immintrin.h;_mm_sha1msg1_epu32;_mm_setzero_si128(),_mm_setzero_si128() tsc;immintrin.h;_rdtsc; tsxldtrk;immintrin.h;_xresldtrk; 
uintr;immintrin.h;_clui; -vaesavx512vl;immintrin.h;_mm256_aesdec_epi128;__m256i(),__m256i() -vpclmulqdq,avx512vl;immintrin.h;_mm256_clmulepi64_epi128;__m256i(),__m256i(),(const int)1 +vaes,avx512vl;immintrin.h;_mm256_aesdec_epi128;_mm256_setzero_si256(),_mm256_setzero_si256() +vpclmulqdq,avx512vl;immintrin.h;_mm256_clmulepi64_epi128;_mm256_setzero_si256(),_mm256_setzero_si256(),(const int)1 waitpkg;immintrin.h;_umonitor;(void*)NULL wbnoinvd;immintrin.h;_wbnoinvd; -xsave;immintrin.h;_xgetbv;(unsigned int)1 -xsavec,xsave;immintrin.h;_xsavec;(void*)NULL,(unsigned __int64)0 -xsaveopt,xsave;immintrin.h;xsaveopt;(void*)NULL,(unsigned __int64)0 -xss,xsave;immintrin.h;_xrstors;(const void*)NULL,(unsigned __int64)0 +xsavec,xsave;immintrin.h;_xsavec;(void*)NULL,(unsigned __m256i)0 +xsaveopt,xsave;immintrin.h;xsaveopt;(void*)NULL,(unsigned __m256i)0 +xsaves;immintrin.h;_xgetbv;(unsigned int)1 +xss,xsave;immintrin.h;_xrstors;(const void*)NULL,(unsigned __m256i)0 + +pop_disable:SunPro + +# Special checks for Oracle's SunPro compiler +# https://docs.oracle.com/cd/E77782_01/html/E77792/gqexw.html +push_enable:SunPro + +# SSE/SSE2/SSE3/SSE4.1/SSE4.2/SSE4A/AVX/AVX2/FMA +avx;immintrin.h;_mm256_add_pd;_mm256_setzero_pd(),_mm256_setzero_pd() +avx2;immintrin.h;_mm256_hadd_epi16;_mm256_setzero_si256(),_mm256_setzero_si256() +sse2;emmintrin.h;_mm_add_epi16;_mm_setzero_si128(),_mm_setzero_si128() +sse3;pmmintrin.h;_mm_addsub_pd;_mm_setzero_pd(),_mm_setzero_pd() +sse4_1;smmintrin.h;_mm_max_epi32;_mm_setzero_si128(),_mm_setzero_si128();sse4.1 +sse4_2;nmmintrin.h;_mm_cmpgt_epi64;_mm_setzero_si128(),_mm_setzero_si128();sse4.2 +sse;xmmintrin.h;_mm_add_ps;_mm_setzero_ps(),_mm_setzero_ps() +ssse3;tmmintrin.h;_mm_hadd_epi16;_mm_setzero_si128(),_mm_setzero_si128() + +# AVX-512 +avx512;immintrin.h;_mm512_abs_epi32;_mm512_setzero_si512();avx512f +avx512;xmmintrin.h;_mm_prefetch;(char const*)NULL,(int)1;prefetchwt1 + +# Other +avx_i;emmintrin.h;_mm_cvtph_ps;_mm_setzero_si128();f16c 
+aes;wmmintrin.h;_mm_aesdec_si128;_mm_setzero_si128(),_mm_setzero_si128();aes +aes;wmmintrin.h;_mm_clmulepi64_si128;_mm_setzero_si128(),_mm_setzero_si128(),(const int)0;pclmul +avx2;immintrin.h;_lzcnt_u32;(unsigned int)1;lzcnt +sse4_2;immintrin.h;_mm_popcnt_u32;(unsigned int)1;popcnt +avx_i;immintrin.h;_andn_u32;(unsigned int)1,(unsigned int)1;bmi +avx_i;immintrin.h;_bzhi_u32;(unsigned int)1,(unsigned int)1;bmi2 +avx_i;immintrin.h;_readfsbase_u32;;fsgsbase +avx_i;immintrin.h;_rdrand16_step;(unsigned short*)new unsigned short[1];rdrnd +pop_enable:SunPro diff --git a/cmake/ofa/HandleX86Options.cmake b/cmake/ofa/HandleX86Options.cmake index c2daae408a..dd75b1df03 100644 --- a/cmake/ofa/HandleX86Options.cmake +++ b/cmake/ofa/HandleX86Options.cmake @@ -5,9 +5,11 @@ # # 1. Generate a list of available compiler flags for the specific CPU # -# 2. Enable/disable features based on compiler/linker capabilities +# 2. Enable/disable feature flags based on available CPU features, +# used-defined USE_ variables and the capabilities of the +# host system's compiler and linker # -# 3. Set compiler-specific flags (-m/-mno-) +# 3. 
Set compiler-specific flags (e.g., -m/-mno-) #============================================================================= include(ofa/AddCXXCompilerFlag) @@ -360,149 +362,225 @@ macro(OFA_HandleX86Options) if(NOT TARGET_ARCHITECTURE STREQUAL "none") set(_check_extension_list) - set(_disable_extension_list) - set(_enable_extension_list) - + set(_check_extension_flag_list) + set(_disable_extension_flag_list) + set(_enable_extension_flag_list) + + # Set enable/disable compiler flag prefixes + if(CMAKE_CXX_COMPILER_ID MATCHES "SunPro") + set(_enable "-xarch=") + set(_disable "-xarch=no-") + else() + set(_enable "-m") + set(_disable "-mno-") + endif() + # Step 2: Enable/disable feature flags based on available CPU # features, used-defined USE_ variables and # the capabilities of the host system's compiler and linker file(READ ${CMAKE_SOURCE_DIR}/cmake/ofa/ChecksX86.txt _checks) - string(REPLACE ";" "|" _checks "${_checks}") + string(REGEX REPLACE "[:;]" "|" _checks "${_checks}") string(REPLACE "\n" ";" _checks "${_checks}") + set(_skip_check FALSE) + # Iterate over the list of checks line by line foreach (_check ${_checks}) + string(REPLACE "|" ";" _check "${_check}") + + # Parse for special lines if ("${_check}" MATCHES "^#" ) # Skip comment continue() + + elseif ("${_check}" MATCHES "^push_enable" ) # Start enable block + list(GET _check 1 _push_enable_list) + _ofa_find(_push_enable_list "${CMAKE_CXX_COMPILER_ID}" _found) + if(_found) + list(PREPEND _skip_check FALSE) + else() + list(PREPEND _skip_check TRUE) + endif() + continue() + + elseif ("${_check}" MATCHES "^pop_enable" ) # End enable block + list(POP_FRONT _skip_check) + continue() + + elseif ("${_check}" MATCHES "^push_disable" ) # Start disable block + list(GET _check 1 _push_disable_list) + _ofa_find(_push_disable_list "${CMAKE_CXX_COMPILER_ID}" _found) + if(_found) + list(PREPEND _skip_check TRUE) + else() + list(PREPEND _skip_check FALSE) + endif() + continue() + + elseif ("${_check}" MATCHES 
"^pop_disable" ) # End disable block + list(POP_FRONT _skip_check) + continue() endif() - # Extract extra CPU extensions, header files, function name, and parameters - string(REPLACE "|" ";" _check "${_check}") - list(GET _check 0 _check_extensions) + # Skip test? + list(GET _skip_check 0 _skip) + if(_skip) + continue() + endif() + + # Extract extra CPU extensions, header files, function name, and parameters + list(GET _check 0 _check_extension_flags) list(GET _check 1 _check_headers) list(GET _check 2 _check_function) list(GET _check 3 _check_params) - + # Convert list of extensions into compiler flags - string(REPLACE "," ";" _check_extensions "${_check_extensions}") - list(GET _check_extensions 0 _extension) - string(REPLACE ";" " -m" _check_flags "-m${_check_extensions}") - list(APPEND _check_extension_list "${_extension}") + string(REPLACE "," ";" _check_extension_flags "${_check_extension_flags}") + list(GET _check_extension_flags 0 _extension_flag) + string(REPLACE ";" " ${_enable}" _check_flags "${_enable}${_check_extension_flags}") + list(APPEND _check_extension_flag_list "${_extension_flag}") + + # Extract optional extension alias + list(LENGTH _check _len) + if(${_len} EQUAL 5) + list(GET _check 4 _extension) + else() + set(_extension "${_extension_flag}") + endif() - # Define USE_<_extension> variable - set(_useVar "USE_${_extension}") + list(APPEND _check_extension_list "${_extension}") + + # Define USE_<_extension_flag> variable + set(_useVar "USE_${_extension_flag}") string(TOUPPER "${_useVar}" _useVar) string(REPLACE "." 
"_" _useVar "${_useVar}") - # Set USE_<_extension> if not set externally + # If not specified externally, set the value of the + # USE_<_extension_flag> variable to TRUE if it is found in the list + # of available extensions and FALSE otherwise if(NOT DEFINED ${_useVar}) _ofa_find(_available_extension_list "${_extension}" _found) set(${_useVar} ${_found}) endif() - - # Apply compiler-specific fixes - if(CMAKE_CXX_COMPILER_ID MATCHES "GNU") - # GNU GCC does not provide definitions for __int32 and __in64 - set(_code "#define __int32 long\n#define __int64 long long\nint main() { ${_check_function}(${_check_params})\; return 0\; }") - else() - set(_code "int main() { ${_check_function}(${_check_params})\; return 0\; }") - endif() - + if(${_useVar}) - # Check if the compiler supports the -m<_extension> flag and - # can compile the provided test code with it - AddCXXCompilerFlag("-m${_extension}" + # Check if the compiler supports the -m<_extension_flag> + # flag and can compile the provided test code with it + set(_code "\nint main() { ${_check_function}(${_check_params})\; return 0\; }") + AddCXXCompilerFlag("${_enable}${_extension_flag}" EXTRA_FLAGS ${_check_flags} HEADERS ${_check_headers} CODE "${_code}" RESULT _ok) if(NOT ${_ok}) + # Test failed set(${_useVar} FALSE CACHE BOOL "Use ${_extension} extension.") else() - set(${_useVar} ${${_useVar}} CACHE BOOL "Use ${_extension} extension.") + # Test succeeded + set(${_useVar} TRUE CACHE BOOL "Use ${_extension} extension.") endif() else() - set(${_useVar} ${${_useVar}} CACHE BOOL "Use ${_extension} extension.") + # Disable extension without running tests + set(${_useVar} FALSE CACHE BOOL "Use ${_extension} extension.") endif() mark_as_advanced(${_useVar}) endforeach() # Generate lists of enabled/disabled flags - list(REMOVE_DUPLICATES _check_extension_list) - foreach(_extension ${_check_extension_list}) - _ofa_find(_available_extension_list "${_extension}" _found) - set(_useVar "USE_${_extension}") + 
list(REMOVE_DUPLICATES _check_extension_flag_list) + foreach(_extension_flag ${_check_extension_flag_list}) + _ofa_find(_available_extension_list "${_extension_flag}" _found) + set(_useVar "USE_${_extension_flag}") string(TOUPPER "${_useVar}" _useVar) string(REPLACE "." "_" _useVar "${_useVar}") + if(${_useVar}) - set(_haveVar "HAVE__m${_extension}") - string(REPLACE "." "_" _haveVar "${_haveVar}") + # Add <_extension_flag> to list of enabled extensions (if supported) + set(_haveVar "HAVE_${_enable}${_extension_flag}") + string(REGEX REPLACE "[-.+/:= ]" "_" _haveVar "${_haveVar}") if(NOT ${_haveVar}) if(OFA_VERBOSE) - message(STATUS "[OFA] Ignoring -m${_extension} extension because checks failed") + message(STATUS "[OFA] Ignoring flag ${_enable}${_extension_flag} because checks failed") endif() continue() endif() - list(APPEND _enable_extension_list "${_extension}") + list(APPEND _enable_extension_flag_list "${_extension_flag}") else() - set(_haveVar "HAVE__mno_${_extension}") - string(REPLACE "." 
"_" _haveVar "${_haveVar}") + # Add <_extension_flag> to list of disabled extensions (if supported) + AddCXXCompilerFlag("${_disable}${_extension_flag}") + set(_haveVar "HAVE_${_disable}${_extension_flag}") + string(REGEX REPLACE "[-.+/:= ]" "_" _haveVar "${_haveVar}") if(NOT ${_haveVar}) if(OFA_VERBOSE) - message(STATUS "[OFA] Ignoring -mno-${_extension} extension because checks failed") + message(STATUS "[OFA] Ignoring flag ${_disable}${_extension_flag} because checks failed") endif() continue() endif() - list(APPEND _disable_extension_list "${_extension}") + list(APPEND _disable_extension_flag_list "${_extension_flag}") endif() endforeach() if(OFA_VERBOSE) - if(_enable_extension_list) - list(SORT _enable_extension_list) - string(REPLACE ";" ", " _str "${_enable_extension_list}") + # Print enabled extension flags + if(_enable_extension_flag_list) + list(SORT _enable_extension_flag_list) + string(REPLACE ";" ", " _str "${_enable_extension_flag_list}") string(TOUPPER ${_str} _str) message(STATUS "[OFA] Extensions (enabled): ${_str}") endif() - if(_disable_extension_list) - list(SORT _disable_extension_list) - string(REPLACE ";" ", " _str "${_disable_extension_list}") + # Print disabled extension flags + if(_disable_extension_flag_list) + list(SORT _disable_extension_flag_list) + string(REPLACE ";" ", " _str "${_disable_extension_flag_list}") string(TOUPPER ${_str} _str) message(STATUS "[OFA] Extensions (disabled): ${_str}") endif() + # Print unhandled extension flags + set(_unhandled_extension_list) + foreach(_extension ${_available_extension_list}) + _ofa_find(_check_extension_list "${_extension}" _found) + if(NOT _found) + list(APPEND _unhandled_extension_list ${_extension}) + endif() + endforeach() + if(_unhandled_extension_list) + list(SORT _unhandled_extension_list) + string(REPLACE ";" ", " _str "${_unhandled_extension_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (unhandled): ${_str}") + endif() endif() - # Add compiler flags + # 
Step 3: Set compiler-specific flags (e.g., -m/-mno-) if(MSVC AND MSVC_VERSION GREATER 1700) - _ofa_find(_enable_extension_list "avx512f" _found) + _ofa_find(_enable_extension_flag_list "avx512f" _found) if(_found) AddCXXCompilerFlag("/arch:AVX512" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _found) endif() if(NOT _found) - _ofa_find(_enable_extension_list "avx2" _found) + _ofa_find(_enable_extension_flag_list "avx2" _found) if(_found) AddCXXCompilerFlag("/arch:AVX2" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _found) endif() endif() if(NOT _found) - _ofa_find(_enable_extension_list "avx" _found) + _ofa_find(_enable_extension_flag_list "avx" _found) if(_found) AddCXXCompilerFlag("/arch:AVX" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _found) endif() endif() if(NOT _found) - _ofa_find(_enable_extension_list "sse2" _found) + _ofa_find(_enable_extension_flag_list "sse2" _found) if(_found) AddCXXCompilerFlag("/arch:SSE2" FLAGS OFA_ARCHITECTURE_FLAGS) endif() endif() if(NOT _found) - _ofa_find(_enable_extension_list "sse" _found) + _ofa_find(_enable_extension_flag_list "sse" _found) if(_found) AddCXXCompilerFlag("/arch:SSE" FLAGS OFA_ARCHITECTURE_FLAGS) endif() endif() - foreach(_extension ${_enable_extension_list}) + foreach(_extension ${_enable_extension_flag_list}) string(TOUPPER "${_extension}" _extension) string(REPLACE "." 
"_" _extension "__${_extension}__") add_definitions("-D${_extension}") @@ -591,17 +669,32 @@ macro(OFA_HandleX86Options) endif() # Set -m<_extension> flag for enabled features - foreach(_extension ${_enable_extension_list}) - AddCXXCompilerFlag("-m${_extension}" FLAGS OFA_ARCHITECTURE_FLAGS) + foreach(_extension ${_enable_extension_flag_list}) + AddCXXCompilerFlag("${_enable}${_extension}" FLAGS OFA_ARCHITECTURE_FLAGS) endforeach(_extension) # Set -mno-<_extension> flag for disabled features - foreach(_extension ${_disable_extension_list}) - AddCXXCompilerFlag("-mno-${_extension}" FLAGS OFA_ARCHITECTURE_FLAGS) + foreach(_extension ${_disable_extension_flag_list}) + AddCXXCompilerFlag("${_disable}${_extension}" FLAGS OFA_ARCHITECTURE_FLAGS) endforeach(_extension) - # TODO PGI/Cray/SunPro ... + elseif(CMAKE_CXX_COMPILER_ID MATCHES "SunPro") + # Set -xtarget flag + foreach(_flag ${_march_flag_list}) + AddCXXCompilerFlag("-xtarget=${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _good) + if(_good) + break() + endif(_good) + endforeach(_flag) + + # Set -xarch= flag for enabled features + foreach(_flag ${_enable_extension_flag_list}) + AddCXXCompilerFlag("-xarch=${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) + endforeach(_flag) + + # TODO PGI/Cray .. 
+ else() # Others: GNU, Clang and variants @@ -614,12 +707,12 @@ macro(OFA_HandleX86Options) endforeach(_flag) # Set -m flag for enabled features - foreach(_flag ${_enable_extension_list}) + foreach(_flag ${_enable_extension_flag_list}) AddCXXCompilerFlag("-m${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) endforeach(_flag) # Set -mno-feature flag for disabled features - foreach(_flag ${_disable_extension_list}) + foreach(_flag ${_disable_extension_flag_list}) AddCXXCompilerFlag("-mno-${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) endforeach(_flag) endif() From 075da697327e6ef1102475d7bb2a6e811790a6d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Fri, 21 Jan 2022 19:15:47 +0100 Subject: [PATCH 145/174] Updated OFA --- cmake/ofa/ChecksArm.txt | 99 ++ cmake/ofa/ChecksX86.txt | 26 +- cmake/ofa/HandleArmOptions.cmake | 1857 ++++++++++++++++-------------- cmake/ofa/HandleX86Options.cmake | 667 +++++------ 4 files changed, 1432 insertions(+), 1217 deletions(-) create mode 100644 cmake/ofa/ChecksArm.txt diff --git a/cmake/ofa/ChecksArm.txt b/cmake/ofa/ChecksArm.txt new file mode 100644 index 0000000000..85745097cf --- /dev/null +++ b/cmake/ofa/ChecksArm.txt @@ -0,0 +1,99 @@ +# List of arm/arm64 checks + +# FORMAT: +# [,];;;;[] +# +# lines starting with # are comments +# lines starting with push_enable: start a block of tests enabled for the given compilers only +# lines starting with pop_enable: ends a block of tests enabled for the given compilers only +# lines starting with push_disable: start a block of tests disabled for the given compilers +# lines starting with pop_disable: ends a block of tests disabled for the given compilers + +# DESCRIPTION: +# For each line of this file, HandleArmOptions generates the code snippet +# +# #include +# #include +# ... +# int main() { +# name(parameter0, parameter1, ...); +# return 0; +# } +# +# and compiles it with, e.g. 
+# +# gcc -m -m +# +# if the extension should be enabled and +# +# gcc -mno- -mno- +# +# if the extension should be disabled. In the above example, the +# compiler name 'gcc' and the flag prefixes '-m' and '-mno-' will be +# set properly by HandleArmOptions. +# +# EXTENSION ALIAS: +# By default, it is assumed that the name of the extension, e.g., +# avx512f coincides with the name of the compiler flag to be used to +# enable/disable it, e.g., -mno-avx512f. Some compilers like Oracle's +# SunPro have non-canonical naming conventions, +# cf. https://docs.oracle.com/cd/E77782_01/html/E77792/gqexw.html. +# +# In this case, the optional parameter can be used +# to specify the name of the extension as reported by the system, +# whereas the compiler-specific extension flag(s) are given in +# and [], respectively. +# +# ENABLING/DISABLING OF CHECKS: +# Checks can be explicitly disabled for particular compilers by placing +# them inside a push_disable/pop_disable block, e.g. +# +# push_disable:SunPro,IntelLLVM +# +# pop_disable:SunPro +# +# Similarly, checks can be explicitly enabled for particular compilers +# by placing them inside a push_enable/pop_enable block, e.g. 
+# +# push_enable:SunPro +# +# pop_enable:SunPro + +aes;cstdlib;exit;0 +bf16;cstdlib;exit;0 +crc;cstdlib;exit;0 +crypto;cstdlib;exit;0 +dotprod;cstdlib;exit;0 +dsp;cstdlib;exit;0 +fp;cstdlib;exit;0 +fp16;cstdlib;exit;0 +fp16fml;cstdlib;exit;0 +fd_dp;cstdlib;exit;0 +fp_sp;cstdlib;exit;0 +i8mm;cstdlib;exit;0 +idiv;cstdlib;exit;0 +lse;cstdlib;exit;0 +mve;cstdlib;exit;0 +mve_fp;cstdlib;exit;0 +neon;cstdlib;exit;0 +neon_fp16;cstdlib;exit;0 +neon_vfpv4;cstdlib;exit;0 +ras;cstdlib;exit;0 +rcpc;cstdlib;exit;0 +rdm;cstdlib;exit;0 +rdma;cstdlib;exit;0 +sec;cstdlib;exit;0 +sha2;cstdlib;exit;0 +sha3;cstdlib;exit;0 +simd;cstdlib;exit;0 +sm4;cstdlib;exit;0 +sve;cstdlib;exit;0 +sve2;cstdlib;exit;0 +vfpv3;cstdlib;exit;0 +vfpv3_d16;cstdlib;exit;0 +vfpv3_d16_fp16;cstdlib;exit;0 +vfpv3_fp16;cstdlib;exit;0 +vfpv4;cstdlib;exit;0 +vfpv4_d16;cstdlib;exit;0 +zcm;cstdlib;exit;0 +zcz;cstdlib;exit;0 diff --git a/cmake/ofa/ChecksX86.txt b/cmake/ofa/ChecksX86.txt index 9edc1e9a4c..2acc303ec9 100644 --- a/cmake/ofa/ChecksX86.txt +++ b/cmake/ofa/ChecksX86.txt @@ -1,4 +1,4 @@ -# List of x86 checks +# List of x86/x86_64 checks # FORMAT: # [,];;;;[] @@ -48,9 +48,9 @@ # Checks can be explicitly disabled for particular compilers by placing # them inside a push_disable/pop_disable block, e.g. # -# push_disable:SunPro,IntelLLVM +# push_disable:MSVC,SunPro # -# pop_disable:SunPro +# pop_disable:MSVC,SunPro # # Similarly, checks can be explicitly enabled for particular compilers # by placing them inside a push_disable/pop_disable block, e.g. 
@@ -59,8 +59,8 @@ # # pop_enable:SunPro -# Oracle's SunPro compiler fails these checks -push_disable:SunPro +# MSVC and Oracle's SunPro compiler fails these checks +push_disable:MSVC,SunPro # MMX mmx;mmintrin.h;_mm_add_pi16;__m64(),__m64() @@ -154,7 +154,21 @@ xsaveopt,xsave;immintrin.h;xsaveopt;(void*)NULL,(unsigned __m256i)0 xsaves;immintrin.h;_xgetbv;(unsigned int)1 xss,xsave;immintrin.h;_xrstors;(const void*)NULL,(unsigned __m256i)0 -pop_disable:SunPro +pop_disable:MSVC,SunPro + +# Special checks for the MSVC compiler +push_enable:MSVC + +# SSE/SSE2/SSE3/SSE4.1/SSE4.2/SSE4A/AVX/AVX2/FMA +SSE;xmmintrin.h;_mm_add_ps;_mm_setzero_ps(),_mm_setzero_ps();sse +SSE2;emmintrin.h;_mm_add_epi16;_mm_setzero_si128(),_mm_setzero_si128();sse2 +AVX;immintrin.h;_mm256_add_pd;_mm256_setzero_pd(),_mm256_setzero_pd();avx +AVX2;immintrin.h;_mm256_hadd_epi16;_mm256_setzero_si256(),_mm256_setzero_si256();avx2 + +# AVX-512 +AVX512;immintrin.h;_mm512_abs_epi32;_mm512_setzero_si512();avx512f + +pop_enable:MSVC # Special checks for Oracle's SunPro compiler # https://docs.oracle.com/cd/E77782_01/html/E77792/gqexw.html diff --git a/cmake/ofa/HandleArmOptions.cmake b/cmake/ofa/HandleArmOptions.cmake index 6daab85fc9..20d5d56daa 100644 --- a/cmake/ofa/HandleArmOptions.cmake +++ b/cmake/ofa/HandleArmOptions.cmake @@ -1,922 +1,1019 @@ #============================================================================= # Handling of ARM / ARM64 options # -# This is a two-step process: +# This is a three-step process: # -# 1. Generate a list of compiler flags for the specific CPU +# 1. Generate a list of available compiler flags for the specific CPU # -# 2. Special compiler-specific treatment of "native" flag +# 2. Enable/disable feature flags based on available CPU features, +# used-defined USE_ variables and the capabilities of the +# host system's compiler and linker # -# 3. Disabling of "broken" features based on OFA_xxx_INTRINSICS_BROKEN options -# -# 4. Set compiler-specific flags +# 3. 
Set compiler-specific flags (e.g., -m/-mno-) #============================================================================= -include(ofa/AddCompilerFlag) +include(ofa/AddCXXCompilerFlag) include(ofa/CommonMacros) include(CheckIncludeFileCXX) macro(OFA_HandleArmOptions) - set(_march_flag_list) - set(_mtune_flag_list) - set(_available_vector_units_list) - - # ARM - if(TARGET_ARCHITECTURE STREQUAL "strongarm") - list(APPEND _mtune_flag_list "strongarm") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "arm8") - list(APPEND _mtune_flag_list "arm8") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "arm810") - list(APPEND _mtune_flag_list "arm810") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "fa526") - list(APPEND _mtune_flag_list "fa526") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "fa626") - list(APPEND _mtune_flag_list "fa626") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "arm7tdmi") - list(APPEND _mtune_flag_list "arm7tdmi") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm7tdmi-s") - list(APPEND _mtune_flag_list "arm7tdmi-s") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm710t") - list(APPEND _mtune_flag_list "arm710t") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm720t") - list(APPEND _mtune_flag_list "arm720t") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm740t") - list(APPEND _mtune_flag_list "arm740t") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm9") - list(APPEND _mtune_flag_list "arm9") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm9tdmi") - list(APPEND _mtune_flag_list "arm9tdmi") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm920") - list(APPEND 
_mtune_flag_list "arm920") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm920t") - list(APPEND _mtune_flag_list "arm920t") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm922t") - list(APPEND _mtune_flag_list "arm922t") - list(APPEND _march_flag_list "armv4t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm940t") - list(APPEND _mtune_flag_list "arm940t") - list(APPEND _march_flag_list "armv4t") - - elseif(TARGET_ARCHITECTURE STREQUAL "arm1020t") - list(APPEND _mtune_flag_list "arm1020t") - list(APPEND _march_flag_list "armv5t") - elseif(TARGET_ARCHITECTURE STREQUAL "arm10tdmi") - list(APPEND _mtune_flag_list "arm10tdmi") - list(APPEND _march_flag_list "armv5t") - - elseif(TARGET_ARCHITECTURE STREQUAL "arm9e") - list(APPEND _mtune_flag_list "arm9e") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm946e-s") - list(APPEND _mtune_flag_list "arm946e-s") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm966e-s") - list(APPEND _mtune_flag_list "arm966e-s") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm968e-s") - list(APPEND _mtune_flag_list "arm968e-s") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm10e") - list(APPEND _mtune_flag_list "arm10e") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1020e") - list(APPEND _mtune_flag_list "arm1020e") - list(APPEND _march_flag_list "armv5te") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1022e") - list(APPEND _mtune_flag_list "arm1022e") - list(APPEND _march_flag_list "armv5te") - list(APPEND 
_available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "xscale") - list(APPEND _mtune_flag_list "xscale") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "iwmmxt") - list(APPEND _mtune_flag_list "iwmmxt") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "iwmmxt2") - list(APPEND _mtune_flag_list "iwmmxt2") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "fa606te") - list(APPEND _mtune_flag_list "fa606te") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "fa626te") - list(APPEND _mtune_flag_list "fa626te") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "fmp626") - list(APPEND _mtune_flag_list "fmp626") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "fa726te") - list(APPEND _mtune_flag_list "fa726te") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "arm926ej-s") - list(APPEND _mtune_flag_list "arm926ej-s") - list(APPEND _march_flag_list "armv5tej") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1026ej-s") - list(APPEND _mtune_flag_list "arm1026ej-s") - list(APPEND _march_flag_list "armv5tej") - list(APPEND _available_vector_units_list "fp") - - elseif(TARGET_ARCHITECTURE STREQUAL "mpcore") - list(APPEND _mtune_flag_list "mpcore") - list(APPEND _march_flag_list "armv6k") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1136j-s") - list(APPEND _mtune_flag_list "arm1136j-s") - list(APPEND _march_flag_list "armv6j") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1136jf-s") - list(APPEND _mtune_flag_list "arm1136jf-s") - list(APPEND _march_flag_list "armv6j") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1156t2-s") - list(APPEND _mtune_flag_list "arm1156t2-s") - list(APPEND _march_flag_list "armv6t2") - 
elseif(TARGET_ARCHITECTURE STREQUAL "arm1156t2f-s") - list(APPEND _mtune_flag_list "arm1156t2f-s") - list(APPEND _march_flag_list "armv6t2") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1176jz-s") - list(APPEND _mtune_flag_list "arm1176jz-s") - list(APPEND _march_flag_list "armv6kz") - elseif(TARGET_ARCHITECTURE STREQUAL "arm1176jzf-s") - list(APPEND _mtune_flag_list "arm1176jzf-s") - list(APPEND _march_flag_list "armv6kz") - list(APPEND _available_vector_units_list "fp") - - elseif(TARGET_ARCHITECTURE STREQUAL "generic-armv7-a") - list(APPEND _mtune_flag_list "generic-armv7-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "mp" "sec" "vfpv3-d16" "vfpv3" "vfpv3-d16-fp16" "vfpv3-fp16" "vfpv4-d16" "vfpv4" "simd" "neon-fp16" "neon-vfpv4") - - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78") - list(APPEND _mtune_flag_list "cortex-a78") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a5") - list(APPEND _mtune_flag_list "cortex-a5") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "mp" "sec" "neon-fp16") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a7") - list(APPEND _mtune_flag_list "cortex-a7") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a8") - list(APPEND _mtune_flag_list "cortex-a8") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "sec" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a9") - list(APPEND _mtune_flag_list "cortex-a9") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "mp" "sec" "neon-fp16") - elseif(TARGET_ARCHITECTURE STREQUAL 
"cortex-a12") - list(APPEND _mtune_flag_list "cortex-a12") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a15") - list(APPEND _mtune_flag_list "cortex-a15") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a15.cortex-a7") - list(APPEND _mtune_flag_list "cortex-a15.cortex-a7") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a17") - list(APPEND _mtune_flag_list "cortex-a17") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a17.cortex-a7") - list(APPEND _mtune_flag_list "cortex-a17.cortex-a7") - list(APPEND _march_flag_list "armv7ve") - list(APPEND _available_vector_units_list "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a32") - list(APPEND _mtune_flag_list "cortex-a32") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a34") - list(APPEND _mtune_flag_list "cortex-a34") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a35") - list(APPEND _mtune_flag_list "cortex-a35") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a53") - list(APPEND _mtune_flag_list "cortex-a53") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a55") - list(APPEND 
_mtune_flag_list "cortex-a55") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a57") - list(APPEND _mtune_flag_list "cortex-a57") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a57.cortext-a53") - list(APPEND _mtune_flag_list "cortex-a57.cortext-a53") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a72") - list(APPEND _mtune_flag_list "cortex-a72") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a72.cortext-a53") - list(APPEND _mtune_flag_list "cortex-a72.cortext-a53") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73") - list(APPEND _mtune_flag_list "cortex-a73") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortext-a35") - list(APPEND _mtune_flag_list "cortex-a73.cortext-a35") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortext-a53") - list(APPEND _mtune_flag_list "cortex-a73.cortext-a53") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - 
list(APPEND _available_vector_units_list "crc" "simd") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a75") - list(APPEND _mtune_flag_list "cortex-a75") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a75.cortext-a55") - list(APPEND _mtune_flag_list "cortex-a75.cortext-a55") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76") - list(APPEND _mtune_flag_list "cortex-a76") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76.cortext-a55") - list(APPEND _mtune_flag_list "cortex-a76.cortext-a55") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76ae") - list(APPEND _mtune_flag_list "cortex-a76ae") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a77") - list(APPEND _mtune_flag_list "cortex-a77") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND 
_available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78") - list(APPEND _mtune_flag_list "cortex-a78") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78ae") - list(APPEND _mtune_flag_list "cortex-a78ae") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78c") - list(APPEND _mtune_flag_list "cortex-a78c") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a710") - list(APPEND _mtune_flag_list "cortex-a510") - list(APPEND _march_flag_list "armv9-a") - list(APPEND _march_flag_list "armv8.6-a") - list(APPEND _march_flag_list "armv8.5-a") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a710") - list(APPEND _mtune_flag_list "cortex-a710") - list(APPEND _march_flag_list "armv9-a") - list(APPEND _march_flag_list "armv8.6-a") - list(APPEND _march_flag_list "armv8.5-a") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list 
"armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") - - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m0") - list(APPEND _mtune_flag_list "cortex-m0") - list(APPEND _march_flag_list "armv6s-m") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m0plus") - list(APPEND _mtune_flag_list "cortex-m0plus") - list(APPEND _march_flag_list "armv6s-m") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m1") - list(APPEND _mtune_flag_list "cortex-m1") - list(APPEND _march_flag_list "armv6s-m") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m3") - list(APPEND _mtune_flag_list "cortex-m3") - list(APPEND _march_flag_list "armv7-m") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m4") - list(APPEND _mtune_flag_list "cortex-m4") - list(APPEND _march_flag_list "armv7e-m") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m7") - list(APPEND _mtune_flag_list "cortex-m7") - list(APPEND _march_flag_list "armv7e-m") - list(APPEND _available_vector_units_list "fp.dp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m23") - list(APPEND _mtune_flag_list "cortex-m23") - list(APPEND _march_flag_list "armv8-m.base") - list(APPEND _march_flag_list "armv7-m") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m33") - list(APPEND _mtune_flag_list "cortex-m33") - list(APPEND _march_flag_list "armv8-m.main") - list(APPEND _march_flag_list "armv7-m") - list(APPEND _available_vector_units_list "dsp" "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m35p") - list(APPEND _mtune_flag_list "cortex-m35p") - list(APPEND _march_flag_list "armv8-m.main") - list(APPEND _march_flag_list "armv7-m") - list(APPEND _available_vector_units_list "dsp" "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m55") - list(APPEND _mtune_flag_list "cortex-m55") - list(APPEND _march_flag_list "armv8.1-m.main") - list(APPEND _march_flag_list "armv8-m") - list(APPEND _march_flag_list "armv7-m") 
- list(APPEND _available_vector_units_list "mve.fp" "fp.dp") - - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4") - list(APPEND _mtune_flag_list "cortex-r4") - list(APPEND _march_flag_list "armv7-r") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4f") - list(APPEND _mtune_flag_list "cortex-r4f") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r5") - list(APPEND _mtune_flag_list "cortex-r5") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "idiv" "fp") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r7") - list(APPEND _mtune_flag_list "cortex-r7") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "idiv" "vfpv3-d16-fp16") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r8") - list(APPEND _mtune_flag_list "cortex-r8") - list(APPEND _march_flag_list "armv7-r") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r52") - list(APPEND _mtune_flag_list "cortex-r52") - list(APPEND _march_flag_list "armv8-r") - list(APPEND _march_flag_list "armv7-r") - list(APPEND _available_vector_units_list "crc" "simd" "idiv" "vfpv3-d16-fp16") - - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-x1") - list(APPEND _mtune_flag_list "cortex-x1") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "cortex-x2") - list(APPEND _march_flag_list "armv9-a") - list(APPEND _march_flag_list "armv8.6-a") - list(APPEND _march_flag_list "armv8.5-a") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list 
"bf16" "fp16" "i8mm") - - elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-e1") - list(APPEND _mtune_flag_list "neoverse-e1") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n1") - list(APPEND _mtune_flag_list "neoverse-n1") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "dotprod") - elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n2") - list(APPEND _mtune_flag_list "neoverse-n2") - list(APPEND _march_flag_list "armv8.5-a") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") - elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-v1") - list(APPEND _mtune_flag_list "neoverse-v1") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "bf16" "fp16" "i8mm") - - # Broadcom - elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b15") - list(APPEND _mtune_flag_list "brahma-b15") - elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b53") - list(APPEND _mtune_flag_list "brahma-b53") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderx2") - list(APPEND _mtune_flag_list "thunderx2") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND 
_march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crypto") - - # Cavium - elseif(TARGET_ARCHITECTURE STREQUAL "thunderx") - list(APPEND _mtune_flag_list "thunderx") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt88") - list(APPEND _mtune_flag_list "thunderxt88") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt81") - list(APPEND _mtune_flag_list "thunderxt81") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt83") - list(APPEND _mtune_flag_list "thunderxt83") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto") - elseif(TARGET_ARCHITECTURE STREQUAL "thunderx2t99") - list(APPEND _mtune_flag_list "thunderx2t99") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto") - - # DEC - elseif(TARGET_ARCHITECTURE STREQUAL "strongarm110") - list(APPEND _mtune_flag_list "strongarm110") - list(APPEND _march_flag_list "armv4") - elseif(TARGET_ARCHITECTURE STREQUAL "strongarm1100") - list(APPEND _mtune_flag_list "strongarm1100") - list(APPEND _march_flag_list "armv4") - - # FUJITSU - elseif(TARGET_ARCHITECTURE STREQUAL "a64fx") - list(APPEND _mtune_flag_list "a64fx") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "fp16" "sve") - - # HiSilicon - 
elseif(TARGET_ARCHITECTURE STREQUAL "tsv110") - list(APPEND _mtune_flag_list "tsv110") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "aes" "crypto" "fp16" "sha2") - - # Nvidia - elseif(TARGET_ARCHITECTURE STREQUAL "denver") - list(APPEND _mtune_flag_list "denver") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - elseif(TARGET_ARCHITECTURE STREQUAL "denver2") - list(APPEND _mtune_flag_list "denver2") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - elseif(TARGET_ARCHITECTURE STREQUAL "carmel") - list(APPEND _mtune_flag_list "denver") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - - # APM - elseif(TARGET_ARCHITECTURE STREQUAL "xgene1") - list(APPEND _mtune_flag_list "xgene1") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - - # Qualcomm - elseif(TARGET_ARCHITECTURE STREQUAL "scorpion") - list(APPEND _mtune_flag_list "scorpion") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "krait") - list(APPEND _mtune_flag_list "krait") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "kryo") - list(APPEND _mtune_flag_list "kryo") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "kryo2") - list(APPEND _mtune_flag_list "kryo2") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list 
"armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "falkor") - list(APPEND _mtune_flag_list "falkor") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "saphira") - list(APPEND _mtune_flag_list "saphira") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crc" "crypto" "simd" "vfpv3" "vfpv4") - - # Samsung - elseif(TARGET_ARCHITECTURE STREQUAL "exynos-m1") - list(APPEND _mtune_flag_list "exynos-m1") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "crypto" "simd") - - # Marvell - elseif(TARGET_ARCHITECTURE STREQUAL "marvell-f") - list(APPEND _mtune_flag_list "marvell-f") - list(APPEND _march_flag_list "armv5te") - elseif(TARGET_ARCHITECTURE STREQUAL "marvell-pj4") - list(APPEND _mtune_flag_list "marvell-pj4") - list(APPEND _march_flag_list "armv7-a") - list(APPEND _available_vector_units_list "mp" "sec" "fp") - - # Intel - elseif(TARGET_ARCHITECTURE STREQUAL "i80200") - list(APPEND _mtune_flag_list "i80200") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa250a") - list(APPEND _mtune_flag_list "pxa250a") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa210a") - list(APPEND _mtune_flag_list "pxa210a") - elseif(TARGET_ARCHITECTURE STREQUAL "i80321-400") - list(APPEND _mtune_flag_list "i80321-400") - elseif(TARGET_ARCHITECTURE STREQUAL "i80321-600") - list(APPEND _mtune_flag_list "i80321-600") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa250b") - list(APPEND _mtune_flag_list "pxa250b") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa210b") - list(APPEND _mtune_flag_list 
"pxa210b") - elseif(TARGET_ARCHITECTURE STREQUAL "i80321-400-b0") - list(APPEND _mtune_flag_list "i80321-400-b0") - elseif(TARGET_ARCHITECTURE STREQUAL "i80321-600-b0") - list(APPEND _mtune_flag_list "i80321-600-b0") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa250c") - list(APPEND _mtune_flag_list "pxa250c") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa210c") - list(APPEND _mtune_flag_list "pxa210c") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa27x") - list(APPEND _mtune_flag_list "pxa27x") - elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-533") - list(APPEND _mtune_flag_list "ipx425-533") - elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-400") - list(APPEND _mtune_flag_list "ipx425-400") - elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-266") - list(APPEND _mtune_flag_list "ipx425-266") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa32x") - list(APPEND _mtune_flag_list "pxa32x") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa930") - list(APPEND _mtune_flag_list "pxa930") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa30x") - list(APPEND _mtune_flag_list "pxa30x") - elseif(TARGET_ARCHITECTURE STREQUAL "pxa31x") - list(APPEND _mtune_flag_list "pxa31x") - elseif(TARGET_ARCHITECTURE STREQUAL "sa1110") - list(APPEND _mtune_flag_list "sa1110") - elseif(TARGET_ARCHITECTURE STREQUAL "ipx1200") - list(APPEND _mtune_flag_list "ipx1200") - - # Apple - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a6") - list(APPEND _mtune_flag_list "apple-a6") - list(APPEND _march_flag_list "armv7-a") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a7") - list(APPEND _mtune_flag_list "apple-a7") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a8") - list(APPEND _mtune_flag_list "apple-a8") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a9") - list(APPEND _mtune_flag_list 
"apple-a9") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a10") - list(APPEND _mtune_flag_list "apple-a10") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "neon" "rdm" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a11") - list(APPEND _mtune_flag_list "apple-a11") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "lse" "neon" "ras" "rdm" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a12") - list(APPEND _mtune_flag_list "apple-a12") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-a13") - list(APPEND _mtune_flag_list "apple-a13") - list(APPEND _march_flag_list "armv8.4-a") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "dotprod" "fp" "fp16" "fp16fml" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "sha3" "sm4" "zcm" "zcz") - elseif(TARGET_ARCHITECTURE STREQUAL "apple-m1") - list(APPEND _mtune_flag_list "vortex") - list(APPEND _march_flag_list "armv8.3-a") - list(APPEND _march_flag_list "armv8.2-a") - list(APPEND _march_flag_list "armv8.1-a") - list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_vector_units_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" 
"rcpc" "rdm" "sha2" "zcm" "zcz") - - # Others - elseif(TARGET_ARCHITECTURE STREQUAL "generic") - list(APPEND _march_flag_list "generic") - elseif(TARGET_ARCHITECTURE STREQUAL "native") - list(APPEND _march_flag_list "native") - elseif(TARGET_ARCHITECTURE STREQUAL "none") - # add this clause to remove it from the else clause - - else() - message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.") - endif() # Special treatment for "native" if(TARGET_ARCHITECTURE STREQUAL "native") if(MSVC) # MSVC (on Windows) - message(FATAL_ERROR, "MSVC does not support \"native\" flag.") + message(FATAL_ERROR "[OFA] MSVC does not support \"native\" flag.") elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC" OR CMAKE_CXX_COMPILER_ID MATCHES "PGI") - # NVidia HPC / PGI (on Linux/Windows - AddCompilerFlag("-tp=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + # NVidia HPC / PGI (on Linux/Windows) + AddCompilerFlag("-tp=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") # Cray (on Linux) - message(FATAL_ERROR, "Cray compiler does not support \"native\" flag.") + message(FATAL_ERROR "[OFA] Cray compiler does not support \"native\" flag.") else() # Others: GNU, Clang and variants - AddCompilerFlag("-mcpu=native" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-mcpu=native" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + endif() + + if(NOT _ok) + message(FATAL_ERROR "[OFA] An error occurred while setting the \"native\" flag.") endif() - - # Apply architecture flags + elseif(NOT TARGET_ARCHITECTURE STREQUAL "none") - # Disable "broken" features based on OFA_xxx_INTRINSICS_BROKEN options - set(_disable_vector_unit_list) - set(_enable_vector_unit_list) - - # TODO: Add OFA_xxx_INTRINSICS_BROKEN rules - set(_aes_broken false) - set(_bf16_broken false) - set(_crc_broken false) - set(_crypto_broken false) - set(_dotprod_broken false) - set(_dsp_broken false) - set(_fp16_broken false) - 
set(_fp16fml_broken false) - set(_fp_broken false) - set(_fp_dp_broken false) - set(_fp_sp_broken false) - set(_i8mm_broken false) - set(_idiv_broken false) - set(_lse_broken false) - set(_mve_broken false) - set(_mve_fp_broken false) - set(_neon_broken false) - set(_neon_fp16_broken false) - set(_neon_vfpv4_broken false) - set(_ras_broken false) - set(_rcpc_broken false) - set(_rdm_broken false) - set(_rdma_broken false) - set(_sec_broken false) - set(_sha2_broken false) - set(_sha3_broken false) - set(_simd_broken false) - set(_sm4_broken false) - set(_sve_broken false) - set(_vfpv3_broken false) - set(_vfpv3_d16_broken false) - set(_vfpv3_d16_fp16_broken false) - set(_vfpv3_fp16_broken false) - set(_vfpv4_broken false) - set(_vfpv4_d16_broken false) - set(_zcm_broken false) - set(_zcz_broken false) - - # Enable/disable macro - macro(_enable_or_disable _name _flag _documentation _broken) - if(_broken) - set(_found false) - else() - _ofa_find(_available_vector_units_list "${_flag}" _found) + # Step 1: Generate a list of compiler flags for the specific CPU + set(_march_flag_list) + set(_mtune_flag_list) + set(_available_extension_list) + + # ARM + if(TARGET_ARCHITECTURE STREQUAL "strongarm") + list(APPEND _mtune_flag_list "strongarm") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "arm8") + list(APPEND _mtune_flag_list "arm8") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "arm810") + list(APPEND _mtune_flag_list "arm810") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "fa526") + list(APPEND _mtune_flag_list "fa526") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "fa626") + list(APPEND _mtune_flag_list "fa626") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "arm7tdmi") + list(APPEND _mtune_flag_list "arm7tdmi") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm7tdmi-s") + 
list(APPEND _mtune_flag_list "arm7tdmi-s") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm710t") + list(APPEND _mtune_flag_list "arm710t") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm720t") + list(APPEND _mtune_flag_list "arm720t") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm740t") + list(APPEND _mtune_flag_list "arm740t") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm9") + list(APPEND _mtune_flag_list "arm9") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm9tdmi") + list(APPEND _mtune_flag_list "arm9tdmi") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm920") + list(APPEND _mtune_flag_list "arm920") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm920t") + list(APPEND _mtune_flag_list "arm920t") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm922t") + list(APPEND _mtune_flag_list "arm922t") + list(APPEND _march_flag_list "armv4t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm940t") + list(APPEND _mtune_flag_list "arm940t") + list(APPEND _march_flag_list "armv4t") + + elseif(TARGET_ARCHITECTURE STREQUAL "arm1020t") + list(APPEND _mtune_flag_list "arm1020t") + list(APPEND _march_flag_list "armv5t") + elseif(TARGET_ARCHITECTURE STREQUAL "arm10tdmi") + list(APPEND _mtune_flag_list "arm10tdmi") + list(APPEND _march_flag_list "armv5t") + + elseif(TARGET_ARCHITECTURE STREQUAL "arm9e") + list(APPEND _mtune_flag_list "arm9e") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm946e-s") + list(APPEND _mtune_flag_list "arm946e-s") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm966e-s") + list(APPEND _mtune_flag_list "arm966e-s") + 
list(APPEND _march_flag_list "armv5te") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm968e-s") + list(APPEND _mtune_flag_list "arm968e-s") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm10e") + list(APPEND _mtune_flag_list "arm10e") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1020e") + list(APPEND _mtune_flag_list "arm1020e") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1022e") + list(APPEND _mtune_flag_list "arm1022e") + list(APPEND _march_flag_list "armv5te") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "xscale") + list(APPEND _mtune_flag_list "xscale") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "iwmmxt") + list(APPEND _mtune_flag_list "iwmmxt") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "iwmmxt2") + list(APPEND _mtune_flag_list "iwmmxt2") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "fa606te") + list(APPEND _mtune_flag_list "fa606te") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "fa626te") + list(APPEND _mtune_flag_list "fa626te") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "fmp626") + list(APPEND _mtune_flag_list "fmp626") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "fa726te") + list(APPEND _mtune_flag_list "fa726te") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "arm926ej-s") + list(APPEND _mtune_flag_list "arm926ej-s") + list(APPEND _march_flag_list "armv5tej") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1026ej-s") + list(APPEND 
_mtune_flag_list "arm1026ej-s") + list(APPEND _march_flag_list "armv5tej") + list(APPEND _available_extension_list "fp") + + elseif(TARGET_ARCHITECTURE STREQUAL "mpcore") + list(APPEND _mtune_flag_list "mpcore") + list(APPEND _march_flag_list "armv6k") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1136j-s") + list(APPEND _mtune_flag_list "arm1136j-s") + list(APPEND _march_flag_list "armv6j") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1136jf-s") + list(APPEND _mtune_flag_list "arm1136jf-s") + list(APPEND _march_flag_list "armv6j") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1156t2-s") + list(APPEND _mtune_flag_list "arm1156t2-s") + list(APPEND _march_flag_list "armv6t2") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1156t2f-s") + list(APPEND _mtune_flag_list "arm1156t2f-s") + list(APPEND _march_flag_list "armv6t2") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1176jz-s") + list(APPEND _mtune_flag_list "arm1176jz-s") + list(APPEND _march_flag_list "armv6kz") + elseif(TARGET_ARCHITECTURE STREQUAL "arm1176jzf-s") + list(APPEND _mtune_flag_list "arm1176jzf-s") + list(APPEND _march_flag_list "armv6kz") + list(APPEND _available_extension_list "fp") + + elseif(TARGET_ARCHITECTURE STREQUAL "generic-armv7-a") + list(APPEND _mtune_flag_list "generic-armv7-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "mp" "sec" "vfpv3-d16" "vfpv3" "vfpv3-d16-fp16" "vfpv3-fp16" "vfpv4-d16" "vfpv4" "simd" "neon-fp16" "neon-vfpv4") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78") + list(APPEND _mtune_flag_list "cortex-a78") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a5") + list(APPEND 
_mtune_flag_list "cortex-a5") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "mp" "sec" "neon-fp16") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a7") + list(APPEND _mtune_flag_list "cortex-a7") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_extension_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a8") + list(APPEND _mtune_flag_list "cortex-a8") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "sec" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a9") + list(APPEND _mtune_flag_list "cortex-a9") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "mp" "sec" "neon-fp16") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a12") + list(APPEND _mtune_flag_list "cortex-a12") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_extension_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a15") + list(APPEND _mtune_flag_list "cortex-a15") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_extension_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a15.cortex-a7") + list(APPEND _mtune_flag_list "cortex-a15.cortex-a7") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_extension_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a17") + list(APPEND _mtune_flag_list "cortex-a17") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_extension_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a17.cortex-a7") + list(APPEND _mtune_flag_list "cortex-a17.cortex-a7") + list(APPEND _march_flag_list "armv7ve") + list(APPEND _available_extension_list "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a32") + list(APPEND _mtune_flag_list "cortex-a32") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a34") + list(APPEND 
_mtune_flag_list "cortex-a34") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a35") + list(APPEND _mtune_flag_list "cortex-a35") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a53") + list(APPEND _mtune_flag_list "cortex-a53") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a55") + list(APPEND _mtune_flag_list "cortex-a55") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a57") + list(APPEND _mtune_flag_list "cortex-a57") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a57.cortex-a53" OR TARGET_ARCHITECTURE STREQUAL "cortex-a57.cortext-a53") # legacy "cortext" misspelling still accepted + list(APPEND _mtune_flag_list "cortex-a57.cortex-a53") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a72") + list(APPEND _mtune_flag_list "cortex-a72") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a72.cortex-a53" OR TARGET_ARCHITECTURE STREQUAL "cortex-a72.cortext-a53") # legacy "cortext" misspelling still accepted + list(APPEND _mtune_flag_list "cortex-a72.cortex-a53") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "simd") +
elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73") + list(APPEND _mtune_flag_list "cortex-a73") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortex-a35" OR TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortext-a35") # legacy "cortext" misspelling still accepted + list(APPEND _mtune_flag_list "cortex-a73.cortex-a35") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortex-a53" OR TARGET_ARCHITECTURE STREQUAL "cortex-a73.cortext-a53") # legacy "cortext" misspelling still accepted + list(APPEND _mtune_flag_list "cortex-a73.cortex-a53") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "simd") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a75") + list(APPEND _mtune_flag_list "cortex-a75") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a75.cortex-a55" OR TARGET_ARCHITECTURE STREQUAL "cortex-a75.cortext-a55") # legacy "cortext" misspelling still accepted + list(APPEND _mtune_flag_list "cortex-a75.cortex-a55") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76") + list(APPEND _mtune_flag_list "cortex-a76") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76.cortex-a55" OR TARGET_ARCHITECTURE STREQUAL "cortex-a76.cortext-a55") # legacy "cortext" misspelling still accepted + list(APPEND _mtune_flag_list "cortex-a76.cortex-a55") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND
_march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a76ae") + list(APPEND _mtune_flag_list "cortex-a76ae") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a77") + list(APPEND _mtune_flag_list "cortex-a77") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78") + list(APPEND _mtune_flag_list "cortex-a78") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78ae") + list(APPEND _mtune_flag_list "cortex-a78ae") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a78c") + list(APPEND _mtune_flag_list "cortex-a78c") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a510") + list(APPEND _mtune_flag_list "cortex-a510") + list(APPEND _march_flag_list "armv9-a") + list(APPEND _march_flag_list "armv8.6-a") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list
"armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "bf16" "fp16" "i8mm") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-a710") + list(APPEND _mtune_flag_list "cortex-a710") + list(APPEND _march_flag_list "armv9-a") + list(APPEND _march_flag_list "armv8.6-a") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "bf16" "fp16" "i8mm") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m0") + list(APPEND _mtune_flag_list "cortex-m0") + list(APPEND _march_flag_list "armv6s-m") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m0plus") + list(APPEND _mtune_flag_list "cortex-m0plus") + list(APPEND _march_flag_list "armv6s-m") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m1") + list(APPEND _mtune_flag_list "cortex-m1") + list(APPEND _march_flag_list "armv6s-m") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m3") + list(APPEND _mtune_flag_list "cortex-m3") + list(APPEND _march_flag_list "armv7-m") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m4") + list(APPEND _mtune_flag_list "cortex-m4") + list(APPEND _march_flag_list "armv7e-m") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m7") + list(APPEND _mtune_flag_list "cortex-m7") + list(APPEND _march_flag_list "armv7e-m") + list(APPEND _available_extension_list "fp.dp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m23") + list(APPEND _mtune_flag_list "cortex-m23") + list(APPEND _march_flag_list "armv8-m.base") + list(APPEND _march_flag_list "armv7-m") + elseif(TARGET_ARCHITECTURE STREQUAL 
"cortex-m33") + list(APPEND _mtune_flag_list "cortex-m33") + list(APPEND _march_flag_list "armv8-m.main") + list(APPEND _march_flag_list "armv7-m") + list(APPEND _available_extension_list "dsp" "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m35p") + list(APPEND _mtune_flag_list "cortex-m35p") + list(APPEND _march_flag_list "armv8-m.main") + list(APPEND _march_flag_list "armv7-m") + list(APPEND _available_extension_list "dsp" "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-m55") + list(APPEND _mtune_flag_list "cortex-m55") + list(APPEND _march_flag_list "armv8.1-m.main") + list(APPEND _march_flag_list "armv8-m") + list(APPEND _march_flag_list "armv7-m") + list(APPEND _available_extension_list "mve.fp" "fp.dp") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4") + list(APPEND _mtune_flag_list "cortex-r4") + list(APPEND _march_flag_list "armv7-r") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r4f") + list(APPEND _mtune_flag_list "cortex-r4f") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_extension_list "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r5") + list(APPEND _mtune_flag_list "cortex-r5") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_extension_list "idiv" "fp") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r7") + list(APPEND _mtune_flag_list "cortex-r7") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_extension_list "idiv" "vfpv3-d16-fp16") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r8") + list(APPEND _mtune_flag_list "cortex-r8") + list(APPEND _march_flag_list "armv7-r") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-r52") + list(APPEND _mtune_flag_list "cortex-r52") + list(APPEND _march_flag_list "armv8-r") + list(APPEND _march_flag_list "armv7-r") + list(APPEND _available_extension_list "crc" "simd" "idiv" "vfpv3-d16-fp16") + + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-x1") + list(APPEND _mtune_flag_list "cortex-x1") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND 
_march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "cortex-x2") + list(APPEND _march_flag_list "armv9-a") + list(APPEND _march_flag_list "armv8.6-a") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "bf16" "fp16" "i8mm") + + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-e1") + list(APPEND _mtune_flag_list "neoverse-e1") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n1") + list(APPEND _mtune_flag_list "neoverse-n1") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "dotprod") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-n2") + list(APPEND _mtune_flag_list "neoverse-n2") + list(APPEND _march_flag_list "armv8.5-a") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "bf16" "fp16" "i8mm") + elseif(TARGET_ARCHITECTURE STREQUAL "neoverse-v1") + list(APPEND _mtune_flag_list "neoverse-v1") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + 
list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "bf16" "fp16" "i8mm") + + # Broadcom + elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b15") + list(APPEND _mtune_flag_list "brahma-b15") + elseif(TARGET_ARCHITECTURE STREQUAL "brahma-b53") + list(APPEND _mtune_flag_list "brahma-b53") + elseif(TARGET_ARCHITECTURE STREQUAL "thunderx2") + list(APPEND _mtune_flag_list "thunderx2") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crypto") + + # Cavium + elseif(TARGET_ARCHITECTURE STREQUAL "thunderx") + list(APPEND _mtune_flag_list "thunderx") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto") + elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt88") + list(APPEND _mtune_flag_list "thunderxt88") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto") + elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt81") + list(APPEND _mtune_flag_list "thunderxt81") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto") + elseif(TARGET_ARCHITECTURE STREQUAL "thunderxt83") + list(APPEND _mtune_flag_list "thunderxt83") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto") + elseif(TARGET_ARCHITECTURE STREQUAL "thunderx2t99") + list(APPEND _mtune_flag_list "thunderx2t99") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto") + + # DEC + 
elseif(TARGET_ARCHITECTURE STREQUAL "strongarm110") + list(APPEND _mtune_flag_list "strongarm110") + list(APPEND _march_flag_list "armv4") + elseif(TARGET_ARCHITECTURE STREQUAL "strongarm1100") + list(APPEND _mtune_flag_list "strongarm1100") + list(APPEND _march_flag_list "armv4") + + # FUJITSU + elseif(TARGET_ARCHITECTURE STREQUAL "a64fx") + list(APPEND _mtune_flag_list "a64fx") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "fp16" "sve") + + # HiSilicon + elseif(TARGET_ARCHITECTURE STREQUAL "tsv110") + list(APPEND _mtune_flag_list "tsv110") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "aes" "crypto" "fp16" "sha2") + + # Nvidia + elseif(TARGET_ARCHITECTURE STREQUAL "denver") + list(APPEND _mtune_flag_list "denver") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd" "vfpv3" "vfpv4") + elseif(TARGET_ARCHITECTURE STREQUAL "denver2") + list(APPEND _mtune_flag_list "denver2") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd" "vfpv3" "vfpv4") + elseif(TARGET_ARCHITECTURE STREQUAL "carmel") + list(APPEND _mtune_flag_list "denver") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd" "vfpv3" "vfpv4") + + # APM + elseif(TARGET_ARCHITECTURE STREQUAL "xgene1") + list(APPEND _mtune_flag_list "xgene1") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + 
+ # Qualcomm + elseif(TARGET_ARCHITECTURE STREQUAL "scorpion") + list(APPEND _mtune_flag_list "scorpion") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "krait") + list(APPEND _mtune_flag_list "krait") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "kryo") + list(APPEND _mtune_flag_list "kryo") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "kryo2") + list(APPEND _mtune_flag_list "kryo2") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "falkor") + list(APPEND _mtune_flag_list "falkor") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "saphira") + list(APPEND _mtune_flag_list "saphira") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crc" "crypto" "simd" "vfpv3" "vfpv4") + + # Samsung + elseif(TARGET_ARCHITECTURE STREQUAL "exynos-m1") + list(APPEND _mtune_flag_list "exynos-m1") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "crypto" "simd") + + # Marvell + elseif(TARGET_ARCHITECTURE STREQUAL "marvell-f") + list(APPEND _mtune_flag_list "marvell-f") + list(APPEND _march_flag_list "armv5te") + elseif(TARGET_ARCHITECTURE STREQUAL "marvell-pj4") + list(APPEND _mtune_flag_list "marvell-pj4") + list(APPEND _march_flag_list "armv7-a") + list(APPEND _available_extension_list "mp" "sec" "fp") + + # Intel + elseif(TARGET_ARCHITECTURE 
STREQUAL "i80200") + list(APPEND _mtune_flag_list "i80200") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa250a") + list(APPEND _mtune_flag_list "pxa250a") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa210a") + list(APPEND _mtune_flag_list "pxa210a") + elseif(TARGET_ARCHITECTURE STREQUAL "i80321-400") + list(APPEND _mtune_flag_list "i80321-400") + elseif(TARGET_ARCHITECTURE STREQUAL "i80321-600") + list(APPEND _mtune_flag_list "i80321-600") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa250b") + list(APPEND _mtune_flag_list "pxa250b") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa210b") + list(APPEND _mtune_flag_list "pxa210b") + elseif(TARGET_ARCHITECTURE STREQUAL "i80321-400-b0") + list(APPEND _mtune_flag_list "i80321-400-b0") + elseif(TARGET_ARCHITECTURE STREQUAL "i80321-600-b0") + list(APPEND _mtune_flag_list "i80321-600-b0") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa250c") + list(APPEND _mtune_flag_list "pxa250c") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa210c") + list(APPEND _mtune_flag_list "pxa210c") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa27x") + list(APPEND _mtune_flag_list "pxa27x") + elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-533") + list(APPEND _mtune_flag_list "ipx425-533") + elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-400") + list(APPEND _mtune_flag_list "ipx425-400") + elseif(TARGET_ARCHITECTURE STREQUAL "ipx425-266") + list(APPEND _mtune_flag_list "ipx425-266") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa32x") + list(APPEND _mtune_flag_list "pxa32x") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa930") + list(APPEND _mtune_flag_list "pxa930") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa30x") + list(APPEND _mtune_flag_list "pxa30x") + elseif(TARGET_ARCHITECTURE STREQUAL "pxa31x") + list(APPEND _mtune_flag_list "pxa31x") + elseif(TARGET_ARCHITECTURE STREQUAL "sa1110") + list(APPEND _mtune_flag_list "sa1110") + elseif(TARGET_ARCHITECTURE STREQUAL "ipx1200") + list(APPEND _mtune_flag_list "ipx1200") + + # Apple + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a6") + list(APPEND 
_mtune_flag_list "apple-a6") + list(APPEND _march_flag_list "armv7-a") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a7") + list(APPEND _mtune_flag_list "apple-a7") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a8") + list(APPEND _mtune_flag_list "apple-a8") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a9") + list(APPEND _mtune_flag_list "apple-a9") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a10") + list(APPEND _mtune_flag_list "apple-a10") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "neon" "rdm" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a11") + list(APPEND _mtune_flag_list "apple-a11") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "lse" "neon" "ras" "rdm" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a12") + list(APPEND _mtune_flag_list "apple-a12") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-a13") + list(APPEND _mtune_flag_list "apple-a13") + list(APPEND _march_flag_list "armv8.4-a") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND 
_march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crc" "crypto" "dotprod" "fp" "fp16" "fp16fml" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "sha3" "sm4" "zcm" "zcz") + elseif(TARGET_ARCHITECTURE STREQUAL "apple-m1") + list(APPEND _mtune_flag_list "vortex") + list(APPEND _march_flag_list "armv8.3-a") + list(APPEND _march_flag_list "armv8.2-a") + list(APPEND _march_flag_list "armv8.1-a") + list(APPEND _march_flag_list "armv8-a") + list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") + + # Others + elseif(TARGET_ARCHITECTURE STREQUAL "generic") + list(APPEND _march_flag_list "generic") + elseif(TARGET_ARCHITECTURE STREQUAL "none") + # add this clause to remove it from the else clause + + else() + message(FATAL_ERROR "[OFA] Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.") + endif() + + list(SORT _available_extension_list) + list(REMOVE_DUPLICATES _available_extension_list) + + if(OFA_VERBOSE) + if(_march_flag_list) + string(REPLACE ";" ", " _str "${_march_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] CPU architectures (-march): " ${_str}) endif() - set(USE_${_name} ${_found} CACHE BOOL "${documentation}" ${_force}) - mark_as_advanced(USE_${_name}) - if(USE_${_name}) - list(APPEND _enable_vector_unit_list "${_flag}") - else() - list(APPEND _disable_vector_unit_list "${_flag}") + if(_mtune_flag_list) + string(REPLACE ";" ", " _str "${_mtune_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] CPU microarchitectures (-mtune): " ${_str}) endif() - endmacro() - - # Enable/disable features - _enable_or_disable(AES "aes" "Use AES. This will enable the aes and pmull crypto extension." _aes_broken) - _enable_or_disable(BF16 "bf16" "Use BF16. This will enable the brain half-precision floating-point instructions." 
_bf16_broken) - _enable_or_disable(CRC "crc" "Use CRC. This will enable the Cyclic Redundancy Check (CRC) instructions." _crc_broken) - _enable_or_disable(CRYPTO "crypto" "Use CRYPTO. This will enable the cryptographic instructions." _crypto_broken) - _enable_or_disable(DOTPROD "dotprod" "Use DOTPROD. This will enable the Dot Product extension. This also enables Advanced SIMD instructions." _dotprod_broken) - _enable_or_disable(DSP "dsp" "Use DSP. This will enable the DSP instruction." _dsp_broken) - _enable_or_disable(FP "fp" "Use FP. This will enable the floating-point data processing instructions." _fp_broken) - _enable_or_disable(FP16 "fp16" "Use FP16. This will enable the half-precision floating-point data processing instructions." _fp16_broken) - _enable_or_disable(FP16FML "fp16fml" "Use FP16FML. This will enable the FP16 fmla extension." _fp16fml_broken) - _enable_or_disable(FP_DP "fp.dp" "Use FP.DP. This will enable the single- and double-precision floating-point instructions." _fp_dp_broken) - _enable_or_disable(FP_SP "fp.sp" "Use FP.SP. This will enable the single-precision floating-point instructions." _fp_sp_broken) - _enable_or_disable(I8MM "i8mm" "Use I8MM. This will enable the 8-bit Integer Matrix Multiply instructions." _i8mm_broken) - _enable_or_disable(IDIV "idiv" "Use IDIV. This will enable the ARM-state integer division instructions." _idiv_broken) - _enable_or_disable(LSE "lse" "Use LSE. This will enable the Large System Extension instructions." _lse_broken) - _enable_or_disable(MVE "mve" "Use MVE. This will enable the M-Profile Vector Extension (MVE) integer instructions." _mve_broken) - _enable_or_disable(MVE_FP "mve.fp" "Use MVE.FP. This will enable the M-Profile Vector Extension (MVE) integer and single precision floating-point instructions." _mve_fp_broken) - _enable_or_disable(NEON "neon" "Use NEON. This will enable the Advanced SIMD (Neon) v1." _neon_broken) - _enable_or_disable(NEON_FP16 "neon-fp16" "Use NEON-FP16. 
This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions, with the half-precision floating-point conversion operations." _neon_fp16_broken) - _enable_or_disable(NEON_VFPV4 "neon-vfpv4" "Use NEON-VFPV4. This will enable the Advanced SIMD (Neon) v2 and the VFPv4 floating-point instructions." _neon_vfpv4_broken) - _enable_or_disable(RAS "ras" "Use RAS. This will enable the Reliability, Availability, and Serviceability extension." _ras_broken) - _enable_or_disable(RCPC "rcpc" "Use RCPC. This will enable the RcPc extension." _rcpc_broken) - _enable_or_disable(RDM "rdm" "Use RDM. This will enable the RDM extension." _rdm_broken) - _enable_or_disable(RDMA "rdma" "Use RDMA. This will enable the Round Double Multiply Accumulate instructions." _rdma_broken) - _enable_or_disable(SEC "sec" "Use SEC. This will enable the security extension." _sec_broken) - _enable_or_disable(SHA2 "sha2" "Use SHA2. This will enable the sha2 crypto extension." _sha2_broken) - _enable_or_disable(SHA3 "sha3" "Use SHA3. This will enable the sha512 and sha3 crypto extension." _sha3_broken) - _enable_or_disable(SIMD "simd" "Use SIMD. This will enable the Advanced SIMD (Neon) v1 and the VFPv3 floating-point instructions." _simd_broken) - _enable_or_disable(SM4 "sm4" "Use SM4. This will enable the the sm3 and sm4 crypto extension." _sm4_broken) - _enable_or_disable(SVE "sve" "Use SVE. This will enable the Scalable Vector Extension instructions." _sve_broken) - _enable_or_disable(VFPV3 "vfpv3" "Use VPFV3. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers." _vfpv3_broken) - _enable_or_disable(VFPV3_D16 "vfpv3-d16" "Use VPFV3-16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_broken) - _enable_or_disable(VFPV3_D16_FP16 "vfpv3-d16-fp16" "Use VPFV3-D16-FP16. 
This will enable VFPv3 floating-point instructions, with 16 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_d16_fp16_broken) - _enable_or_disable(VFPV3_FP16 "vfpv3-fp16" "Use VPFV3-FP16. This will enable the VFPv3 floating-point instructions, with 32 double-precision registers and the half-precision floating-point conversion operations." _vfpv3_fp16_broken) - _enable_or_disable(VFPV4 "vfpv4" "Use VPFV4. This will enable the VFPv4 floating-point instructions, with 32 double-precision registers." _vfpv4_broken) - _enable_or_disable(VFPV4_D16 "vfpv4-d16" "Use VPFV4-D16. This will enable the VFPv4 floating-point instructions, with 16 double-precision registers." _vfpv4_dp16_broken) - _enable_or_disable(ZCM "zcm" "Use ZCM. This will enable the ZCM extension." _zcm_broken) - _enable_or_disable(ZCZ "zcz" "Use ZCZ. This will enable the ZCZ extension." _zcz_broken) - - # Add compiler flags - if(MSVC AND MSVC_VERSION GREATER 1900) - _ofa_find(_enable_vector_unit_list "vfpv4" _found) - if(_found) - AddCompilerFlag("/arch:VFPv4" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + if(_available_extension_list) + string(REPLACE ";" ", " _str "${_available_extension_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (available): ${_str}") endif() - if(NOT _found) - _ofa_find(_enable_vector_unit_list "simd" _found) - if(_found) - AddCompilerFlag("/arch:ARMv7VE" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) - endif() + endif() + + # Following the recommendation from + # https://community.arm.com/developer/tools-software/ + # tools/b/tools-software-ides-blog/posts/ + # compiler-flags-across-architectures-march-mtune-and-mcpu we + # first try to use the -mcpu flag and set it a value from the + # list of -mtune flags. If that fails, e.g., if the compiler + # does not yet support the specified target, we try to set the + # -march and -mtune flags as fallback option. 
+ # Set compiler-specific option names + set(_mcpu_flag "-mcpu=") + set(_march_flag "-march=") + set(_mtune_flag "-mtune=") # trailing '=' like the two prefixes above, so the value is appended directly + + foreach(_flag ${_mtune_flag_list}) + AddCXXCompilerFlag("${_mcpu_flag}${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + if(_ok) + break() endif() - foreach(_flag ${_enable_vector_unit_list}) - string(TOUPPER "${_flag}" _flag) - string(REPLACE "." "_" _flag "__${_flag}__") - add_definitions("-D${_flag}") - endforeach(_flag) - - elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") - - # TODO: Add Cray flags - - elseif(CMAKE_CXX_COMPILER_ID MATCHES "Fujitsu") - - # TODO: Add Fujitsu flags - - elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC") - - # TODO: Add NVHPC flags - - else() - # Others: GNU, Clang and variants + endforeach() - # Following the recommendation from - # https://community.arm.com/developer/tools-software/tools/b/tools-software-ides-blog/posts/compiler-flags-across-architectures-march-mtune-and-mcpu - # we first try to only use the -mcpu flag. If that fails, e.g., if - # the compiler does not yet support the specified target, we try to - # set the -march and -mtune flags as fallback option.
- foreach(_flag ${_mtune_flag_list}) - AddCompilerFlag("-mcpu=${_flag}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - if(_good) + if(NOT _ok) + # Fallback: set -march and -mtune flags + set(_check_extension_list) + set(_enable_extension_list) + + foreach(_flag ${_march_flag_list}) + AddCXXCompilerFlag("${_march_flag}${_flag}" RESULT _ok) + if(_ok) + set(_march ${_flag}) break() - endif(_good) - endforeach(_flag) - - # Fallback: set -march, -mtune flags - if(NOT _good) - # Set -march flag - foreach(_march ${_march_flag_list}) - AddCompilerFlag("-march=${_march}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) - if(_good) + endif() + endforeach() + + # Step 2: Enable/disable feature flags based on available CPU + # features, user-defined USE_ variables and + # the capabilities of the host system's compiler and linker + file(READ ${CMAKE_SOURCE_DIR}/cmake/ofa/ChecksArm.txt _checks) + string(REGEX REPLACE "[:;]" "|" _checks "${_checks}") + string(REPLACE "\n" ";" _checks "${_checks}") + + set(_skip_check FALSE) + + # Iterate over the list of checks line by line + foreach (_check ${_checks}) + string(REPLACE "|" ";" _check "${_check}") + + # Parse for special lines + if ("${_check}" MATCHES "^#" ) # Skip comment + continue() + + elseif ("${_check}" MATCHES "^push_enable" ) # Start enable block + list(GET _check 1 _push_enable_list) + _ofa_find(_push_enable_list "${CMAKE_CXX_COMPILER_ID}" _found) + if(_found) + list(PREPEND _skip_check FALSE) + else() + list(PREPEND _skip_check TRUE) + endif() + continue() + + elseif ("${_check}" MATCHES "^pop_enable" ) # End enable block + list(POP_FRONT _skip_check) + continue() + + elseif ("${_check}" MATCHES "^push_disable" ) # Start disable block + list(GET _check 1 _push_disable_list) + _ofa_find(_push_disable_list "${CMAKE_CXX_COMPILER_ID}" _found) + if(_found) + list(PREPEND _skip_check TRUE) + else() + list(PREPEND _skip_check FALSE) + endif() + continue() + + elseif ("${_check}" MATCHES "^pop_disable" ) # End disable block +
list(POP_FRONT _skip_check) + continue() + endif() + + # Skip test? + list(GET _skip_check 0 _skip) + if(_skip) + continue() + endif() + + # Extract extra CPU extensions, header files, function name, and parameters + list(GET _check 0 _check_extension_flags) + list(GET _check 1 _check_headers) + list(GET _check 2 _check_function) + list(GET _check 3 _check_params) + + # Convert list of extensions into compiler flags + string(REPLACE "," ";" _check_extension_flags "${_check_extension_flags}") + list(GET _check_extension_flags 0 _extension_flag) + string(REPLACE ";" "+" _check_flags "${_check_extension_flags}") + list(APPEND _check_extension_flag_list "${_extension_flag}") + + # Extract optional extension alias + list(LENGTH _check _len) + if(${_len} EQUAL 5) + list(GET _check 4 _extension) + else() + set(_extension "${_extension_flag}") + endif() + + list(APPEND _check_extension_list "${_extension}") + + # Define USE_<_extension_flag> variable + set(_useVar "USE_${_extension_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "." 
"_" _useVar "${_useVar}") + + # If not specified externally, set the value of the + # USE_<_extension_flag> variable to TRUE if it is found in the list + # of available extensions and FALSE otherwise + if(NOT DEFINED ${_useVar}) + _ofa_find(_available_extension_list "${_extension}" _found) + set(${_useVar} ${_found}) + endif() + + if(${_useVar}) + # Check if the compiler supports the -march=<_march>+<_extension_flag> + # flag and can compile the provided test code with it + set(_code "\nint main() { ${_check_function}(${_check_params})\; return 0\; }") + message(${_code}) + message("${_march_flag}${_march}+${_extension_flag}") + AddCXXCompilerFlag("${_march_flag}${_march}+${_extension_flag}" + HEADERS ${_check_headers} + CODE "${_code}" + RESULT _ok) + if(NOT ${_ok}) + # Test failed + set(${_useVar} FALSE CACHE BOOL "Use ${_extension} extension.") + else() + # Test succeeded + set(${_useVar} TRUE CACHE BOOL "Use ${_extension} extension.") + endif() + else() + # Disable extension without running tests + set(${_useVar} FALSE CACHE BOOL "Use ${_extension} extension.") + endif() + mark_as_advanced(${_useVar}) + endforeach() + + # Generate lists of enabled/disabled flags + list(REMOVE_DUPLICATES _check_extension_flag_list) + foreach(_extension_flag ${_check_extension_flag_list}) + _ofa_find(_available_extension_list "${_extension_flag}" _found) + set(_useVar "USE_${_extension_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "." 
"_" _useVar "${_useVar}") + + if(${_useVar}) + # Add <_extension_flag> to list of enabled extensions (if supported) + set(_haveVar "HAVE_${_march_flag}${_march}+${_extension_flag}") + string(REGEX REPLACE "[-.+/:= ]" "_" _haveVar "${_haveVar}") + if(NOT ${_haveVar}) + if(OFA_VERBOSE) + message(STATUS "[OFA] Ignoring flag ${_march_flag}${_march}+${_extension_flag} because checks failed") + endif() + continue() + endif() + list(APPEND _enable_extension_list "${_extension_flag}") + endif() + endforeach() + + if(OFA_VERBOSE) + # Print enabled extension flags + if(_enable_extension_list) + list(SORT _enable_extension_list) + string(REPLACE ";" ", " _str "${_enable_extension_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (enabled): ${_str}") + endif() + # Print unhandled extension flags + set(_unhandled_extension_list) + foreach(_extension ${_available_extension_list}) + _ofa_find(_check_extension_list "${_extension}" _found) + if(NOT _found) + list(APPEND _unhandled_extension_list ${_extension}) + endif() + endforeach() + if(_unhandled_extension_list) + list(SORT _unhandled_extension_list) + string(REPLACE ";" ", " _str "${_unhandled_extension_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (unhandled): ${_str}") + endif() + endif() + + # Step 3: Set compiler-specific flags (e.g., -m/-mno-) + if(MSVC AND MSVC_VERSION GREATER 1900) + _ofa_find(_enable_extension_list "vfpv4" _found) + if(_found) + AddCompilerFlag("/arch:VFPv4" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() + if(NOT _found) + _ofa_find(_enable_extension_list "simd" _found) + if(_found) + AddCompilerFlag("/arch:ARMv7VE" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) + endif() + endif() + foreach(_flag ${_enable_extension_list}) + string(TOUPPER "${_flag}" _flag) + string(REPLACE "." 
"_" _flag "__${_flag}__") + add_definitions("-D${_flag}") + endforeach(_flag) + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") + + # TODO: Add Cray flags + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Fujitsu") + + # TODO: Add Fujitsu flags + + elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC") + + # TODO: Add NVHPC flags + + else() + # Others: GNU, Clang and variants + foreach(_march ${_march_flag_list}) + AddCXXCompilerFlag("-march=${_march}" RESULT _ok) + if(_ok) set(_march_plus_extensions "${_march}") - foreach(_flag ${_enable_vector_unit_list}) - AddCompilerFlag("-march=${_march_plus_extensions}+${_flag}" CXX_RESULT _good CXX_FLAGS DUMMY_FLAGS) - if(_good) + foreach(_flag ${_enable_extension_list}) + AddCXXCompilerFlag("-march=${_march_plus_extensions}+${_flag}" RESULT _ok) + if(_ok) set(_march_plus_extensions "${_march_plus_extensions}+${_flag}") - endif(_good) - endforeach(_flag) - AddCompilerFlag("-march=${_march_plus_extensions}" CXX_FLAGS OFA_ARCHITECTURE_FLAGS) + endif(_ok) + endforeach() + AddCXXCompilerFlag("-march=${_march_plus_extensions}" FLAGS OFA_ARCHITECTURE_FLAGS) break() - endif(_good) - endforeach(_march) + endif() + endforeach() # Set -mtune flag foreach(_mtune ${_mtune_flag_list}) - AddCompilerFlag("-mtune=${_mtune}" CXX_RESULT _good CXX_FLAGS OFA_ARCHITECTURE_FLAGS) - if(_good) + AddCXXCompilerFlag("-mtune=${_mtune}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + if(_ok) break() - endif(_good) - endforeach(_mtune) - endif(NOT _good) - - # Note that ARM does not support -mfeature and -mno-feature to - # enable and disable specific features. Hence, there are no - # loops over the _enable_vector_unit_list and - # _disable_vector_unit_list lists here(!) 
+ endif() + endforeach() + endif() + endif() endif() diff --git a/cmake/ofa/HandleX86Options.cmake b/cmake/ofa/HandleX86Options.cmake index dd75b1df03..df32918395 100644 --- a/cmake/ofa/HandleX86Options.cmake +++ b/cmake/ofa/HandleX86Options.cmake @@ -22,32 +22,33 @@ macro(OFA_HandleX86Options) if(TARGET_ARCHITECTURE STREQUAL "native") if(MSVC) # MSVC (on Windows) - message(FATAL_ERROR, "[OFA] MSVC does not support \"native\" flag.") + message(FATAL_ERROR "[OFA] MSVC does not support \"native\" flag.") elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel" OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") if(WIN32) # Intel (on Windows) - AddCXXCompilerFlag("/QxHOST" FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("/QxHOST" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) else() # Intel (on Linux) - AddCXXCompilerFlag("-xHOST" FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-xHOST" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) endif() elseif(CMAKE_CXX_COMPILER_ID MATCHES "NVHPC" OR CMAKE_CXX_COMPILER_ID MATCHES "PGI") - # NVidia HPC / PGI (on Linux/Windows - AddCXXCompilerFlag("-tp=native" FLAGS OFA_ARCHITECTURE_FLAGS) + # NVidia HPC / PGI (on Linux/Windows) + AddCXXCompilerFlag("-tp=native" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) elseif(CMAKE_CXX_COMPILER_ID MATCHES "SunPro") # Sun/Oracle Studio (on Linux/Sun OS) - AddCXXCompilerFlag("-native" FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("-native" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) elseif(CMAKE_CXX_COMPILER_ID MATCHES "Cray") # Cray (on Linux) message(FATAL_ERROR, "[OFA] Cray compiler does not support \"native\" flag.") else() # Others: GNU, Clang and variants - _ofa_find(OFA_ARCHITECTURE_FLAGS "-march=native" _found) - if(NOT _found) - AddCXXCompilerFlag("-march=native" FLAGS OFA_ARCHITECTURE_FLAGS) - endif() + AddCXXCompilerFlag("-march=native" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + endif() + + if(NOT _ok) + message(FATAL_ERROR "[OFA] An error occured while setting the \"native\" flag.") endif() elseif(NOT 
TARGET_ARCHITECTURE STREQUAL "none") @@ -344,6 +345,7 @@ macro(OFA_HandleX86Options) message(FATAL_ERROR "[OFA] Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.") endif() + # Clean list of available extensions list(SORT _available_extension_list) list(REMOVE_DUPLICATES _available_extension_list) @@ -360,364 +362,366 @@ macro(OFA_HandleX86Options) endif() endif() - if(NOT TARGET_ARCHITECTURE STREQUAL "none") - set(_check_extension_list) - set(_check_extension_flag_list) - set(_disable_extension_flag_list) - set(_enable_extension_flag_list) + set(_check_extension_list) + set(_check_extension_flag_list) + set(_disable_extension_flag_list) + set(_enable_extension_flag_list) - # Set enable/disable compiler flag prefixes - if(CMAKE_CXX_COMPILER_ID MATCHES "SunPro") - set(_enable "-xarch=") - set(_disable "-xarch=no-") - else() - set(_enable "-m") - set(_disable "-mno-") - endif() - - # Step 2: Enable/disable feature flags based on available CPU - # features, used-defined USE_ variables and - # the capabilities of the host system's compiler and linker - file(READ ${CMAKE_SOURCE_DIR}/cmake/ofa/ChecksX86.txt _checks) - string(REGEX REPLACE "[:;]" "|" _checks "${_checks}") - string(REPLACE "\n" ";" _checks "${_checks}") - - set(_skip_check FALSE) - - # Iterate over the list of checks line by line - foreach (_check ${_checks}) - string(REPLACE "|" ";" _check "${_check}") - - # Parse for special lines - if ("${_check}" MATCHES "^#" ) # Skip comment - continue() - - elseif ("${_check}" MATCHES "^push_enable" ) # Start enable block - list(GET _check 1 _push_enable_list) - _ofa_find(_push_enable_list "${CMAKE_CXX_COMPILER_ID}" _found) - if(_found) - list(PREPEND _skip_check FALSE) - else() - list(PREPEND _skip_check TRUE) - endif() - continue() - - elseif ("${_check}" MATCHES "^pop_enable" ) # End enable block - list(POP_FRONT _skip_check) - continue() - - elseif ("${_check}" MATCHES "^push_disable" ) # Start disable 
block - list(GET _check 1 _push_disable_list) - _ofa_find(_push_disable_list "${CMAKE_CXX_COMPILER_ID}" _found) - if(_found) - list(PREPEND _skip_check TRUE) - else() - list(PREPEND _skip_check FALSE) - endif() - continue() - - elseif ("${_check}" MATCHES "^pop_disable" ) # End disable block - list(POP_FRONT _skip_check) - continue() - endif() + # Set compiler-specific option names + if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + set(_enable_flag "/arch:") + unset(_disable) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "SunPro") + set(_enable_flag "-xarch=") + unset(_disable_flag) + else() + set(_enable_flag "-m") + set(_disable_flag "-mno-") + endif() - # Skip test? - list(GET _skip_check 0 _skip) - if(_skip) - continue() - endif() + # Step 2: Enable/disable feature flags based on available CPU + # features, used-defined USE_ variables and + # the capabilities of the host system's compiler and linker + file(READ ${CMAKE_SOURCE_DIR}/cmake/ofa/ChecksX86.txt _checks) + string(REGEX REPLACE "[:;]" "|" _checks "${_checks}") + string(REPLACE "\n" ";" _checks "${_checks}") - # Extract extra CPU extensions, header files, function name, and parameters - list(GET _check 0 _check_extension_flags) - list(GET _check 1 _check_headers) - list(GET _check 2 _check_function) - list(GET _check 3 _check_params) - - # Convert list of extensions into compiler flags - string(REPLACE "," ";" _check_extension_flags "${_check_extension_flags}") - list(GET _check_extension_flags 0 _extension_flag) - string(REPLACE ";" " ${_enable}" _check_flags "${_enable}${_check_extension_flags}") - list(APPEND _check_extension_flag_list "${_extension_flag}") - - # Extract optional extension alias - list(LENGTH _check _len) - if(${_len} EQUAL 5) - list(GET _check 4 _extension) - else() - set(_extension "${_extension_flag}") - endif() + set(_skip_check FALSE) - list(APPEND _check_extension_list "${_extension}") - - # Define USE_<_extension_flag> variable - set(_useVar "USE_${_extension_flag}") - string(TOUPPER 
"${_useVar}" _useVar) - string(REPLACE "." "_" _useVar "${_useVar}") - - # If not specified externally, set the value of the - # USE_<_extension_flag> variable to TRUE if it is found in the list - # of available extensions and FALSE otherwise - if(NOT DEFINED ${_useVar}) - _ofa_find(_available_extension_list "${_extension}" _found) - set(${_useVar} ${_found}) - endif() - - if(${_useVar}) - # Check if the compiler supports the -m<_extension_flag> - # flag and can compile the provided test code with it - set(_code "\nint main() { ${_check_function}(${_check_params})\; return 0\; }") - AddCXXCompilerFlag("${_enable}${_extension_flag}" - EXTRA_FLAGS ${_check_flags} - HEADERS ${_check_headers} - CODE "${_code}" - RESULT _ok) - if(NOT ${_ok}) - # Test failed - set(${_useVar} FALSE CACHE BOOL "Use ${_extension} extension.") - else() - # Test succeeded - set(${_useVar} TRUE CACHE BOOL "Use ${_extension} extension.") - endif() + # Iterate over the list of checks line by line + foreach (_check ${_checks}) + string(REPLACE "|" ";" _check "${_check}") + + # Parse for special lines + if ("${_check}" MATCHES "^#" ) # Skip comment + continue() + + elseif ("${_check}" MATCHES "^push_enable" ) # Start enable block + list(GET _check 1 _push_enable_list) + _ofa_find(_push_enable_list "${CMAKE_CXX_COMPILER_ID}" _found) + if(_found) + list(PREPEND _skip_check FALSE) else() - # Disable extension without running tests - set(${_useVar} FALSE CACHE BOOL "Use ${_extension} extension.") + list(PREPEND _skip_check TRUE) endif() - mark_as_advanced(${_useVar}) - endforeach() + continue() - # Generate lists of enabled/disabled flags - list(REMOVE_DUPLICATES _check_extension_flag_list) - foreach(_extension_flag ${_check_extension_flag_list}) - _ofa_find(_available_extension_list "${_extension_flag}" _found) - set(_useVar "USE_${_extension_flag}") - string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "." 
"_" _useVar "${_useVar}") - - if(${_useVar}) - # Add <_extension_flag> to list of enabled extensions (if supported) - set(_haveVar "HAVE_${_enable}${_extension_flag}") - string(REGEX REPLACE "[-.+/:= ]" "_" _haveVar "${_haveVar}") - if(NOT ${_haveVar}) - if(OFA_VERBOSE) - message(STATUS "[OFA] Ignoring flag ${_enable}${_extension_flag} because checks failed") - endif() - continue() - endif() - list(APPEND _enable_extension_flag_list "${_extension_flag}") + elseif ("${_check}" MATCHES "^pop_enable" ) # End enable block + list(POP_FRONT _skip_check) + continue() + + elseif ("${_check}" MATCHES "^push_disable" ) # Start disable block + list(GET _check 1 _push_disable_list) + _ofa_find(_push_disable_list "${CMAKE_CXX_COMPILER_ID}" _found) + if(_found) + list(PREPEND _skip_check TRUE) else() - # Add <_extension_flag> to list of disabled extensions (if supported) - AddCXXCompilerFlag("${_disable}${_extension_flag}") - set(_haveVar "HAVE_${_disable}${_extension_flag}") - string(REGEX REPLACE "[-.+/:= ]" "_" _haveVar "${_haveVar}") - if(NOT ${_haveVar}) - if(OFA_VERBOSE) - message(STATUS "[OFA] Ignoring flag ${_disable}${_extension_flag} because checks failed") - endif() - continue() - endif() - list(APPEND _disable_extension_flag_list "${_extension_flag}") + list(PREPEND _skip_check FALSE) endif() - endforeach() + continue() + + elseif ("${_check}" MATCHES "^pop_disable" ) # End disable block + list(POP_FRONT _skip_check) + continue() + endif() + + # Skip test? 
+ list(GET _skip_check 0 _skip) + if(_skip) + continue() + endif() + + # Extract extra CPU extensions, header files, function name, and parameters + list(GET _check 0 _check_extension_flags) + list(GET _check 1 _check_headers) + list(GET _check 2 _check_function) + list(GET _check 3 _check_params) + + # Convert list of extensions into compiler flags + string(REPLACE "," ";" _check_extension_flags "${_check_extension_flags}") + list(GET _check_extension_flags 0 _extension_flag) + string(REPLACE ";" " ${_enable_flag}" _check_flags "${_enable_flag}${_check_extension_flags}") + list(APPEND _check_extension_flag_list "${_extension_flag}") + + # Extract optional extension alias + list(LENGTH _check _len) + if(${_len} EQUAL 5) + list(GET _check 4 _extension) + else() + set(_extension "${_extension_flag}") + endif() + + list(APPEND _check_extension_list "${_extension}") + + # Define USE_<_extension_flag> variable + set(_useVar "USE_${_extension_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "." 
"_" _useVar "${_useVar}") - if(OFA_VERBOSE) - # Print enabled extension flags - if(_enable_extension_flag_list) - list(SORT _enable_extension_flag_list) - string(REPLACE ";" ", " _str "${_enable_extension_flag_list}") - string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (enabled): ${_str}") + # If not specified externally, set the value of the + # USE_<_extension_flag> variable to TRUE if it is found in the list + # of available extensions and FALSE otherwise + if(NOT DEFINED ${_useVar}) + _ofa_find(_available_extension_list "${_extension}" _found) + set(${_useVar} ${_found}) + endif() + + if(${_useVar}) + # Check if the compiler supports the -m<_extension_flag> + # flag and can compile the provided test code with it + set(_code "\nint main() { ${_check_function}(${_check_params})\; return 0\; }") + AddCXXCompilerFlag("${_enable_flag}${_extension_flag}" + EXTRA_FLAGS ${_check_flags} + HEADERS ${_check_headers} + CODE "${_code}" + RESULT _ok) + if(NOT ${_ok}) + # Test failed + set(${_useVar} FALSE CACHE BOOL "Use ${_extension} extension.") + else() + # Test succeeded + set(${_useVar} TRUE CACHE BOOL "Use ${_extension} extension.") endif() - # Print disabled extension flags - if(_disable_extension_flag_list) - list(SORT _disable_extension_flag_list) - string(REPLACE ";" ", " _str "${_disable_extension_flag_list}") - string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (disabled): ${_str}") + else() + # Disable extension without running tests + set(${_useVar} FALSE CACHE BOOL "Use ${_extension} extension.") + endif() + mark_as_advanced(${_useVar}) + endforeach() + + # Generate lists of enabled/disabled flags + list(REMOVE_DUPLICATES _check_extension_flag_list) + foreach(_extension_flag ${_check_extension_flag_list}) + _ofa_find(_available_extension_list "${_extension_flag}" _found) + set(_useVar "USE_${_extension_flag}") + string(TOUPPER "${_useVar}" _useVar) + string(REPLACE "." 
"_" _useVar "${_useVar}") + + if(${_useVar}) + # Add <_extension_flag> to list of enabled extensions (if supported) + set(_haveVar "HAVE_${_enable_flag}${_extension_flag}") + string(REGEX REPLACE "[-.+/:= ]" "_" _haveVar "${_haveVar}") + if(NOT ${_haveVar}) + if(OFA_VERBOSE) + message(STATUS "[OFA] Ignoring flag ${_enable_flag}${_extension_flag} because checks failed") + endif() + continue() endif() - # Print unhandled extension flags - set(_unhandled_extension_list) - foreach(_extension ${_available_extension_list}) - _ofa_find(_check_extension_list "${_extension}" _found) - if(NOT _found) - list(APPEND _unhandled_extension_list ${_extension}) + list(APPEND _enable_extension_flag_list "${_extension_flag}") + elseif(DEFINED _disable_flag) + # Add <_extension_flag> to list of disabled extensions (if supported) + AddCXXCompilerFlag("${_disable_flag}${_extension_flag}") + set(_haveVar "HAVE_${_disable_flag}${_extension_flag}") + string(REGEX REPLACE "[-.+/:= ]" "_" _haveVar "${_haveVar}") + if(NOT ${_haveVar}) + if(OFA_VERBOSE) + message(STATUS "[OFA] Ignoring flag ${_disable_flag}${_extension_flag} because checks failed") endif() - endforeach() - if(_unhandled_extension_list) - list(SORT _unhandled_extension_list) - string(REPLACE ";" ", " _str "${_unhandled_extension_list}") - string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (unhandled): ${_str}") + continue() endif() + list(APPEND _disable_extension_flag_list "${_extension_flag}") endif() + endforeach() - # Step 3: Set compiler-specific flags (e.g., -m/-mno-) - if(MSVC AND MSVC_VERSION GREATER 1700) - _ofa_find(_enable_extension_flag_list "avx512f" _found) - if(_found) - AddCXXCompilerFlag("/arch:AVX512" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _found) - endif() + if(OFA_VERBOSE) + # Print enabled extension flags + if(_enable_extension_flag_list) + list(SORT _enable_extension_flag_list) + string(REPLACE ";" ", " _str "${_enable_extension_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] 
Extensions (enabled): ${_str}") + endif() + # Print disabled extension flags + if(_disable_extension_flag_list) + list(SORT _disable_extension_flag_list) + string(REPLACE ";" ", " _str "${_disable_extension_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (disabled): ${_str}") + endif() + # Print unhandled extension flags + set(_unhandled_extension_list) + foreach(_extension ${_available_extension_list}) + _ofa_find(_check_extension_list "${_extension}" _found) if(NOT _found) - _ofa_find(_enable_extension_flag_list "avx2" _found) - if(_found) - AddCXXCompilerFlag("/arch:AVX2" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _found) - endif() + list(APPEND _unhandled_extension_list ${_extension}) endif() - if(NOT _found) - _ofa_find(_enable_extension_flag_list "avx" _found) - if(_found) - AddCXXCompilerFlag("/arch:AVX" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _found) - endif() + endforeach() + if(_unhandled_extension_list) + list(SORT _unhandled_extension_list) + string(REPLACE ";" ", " _str "${_unhandled_extension_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (unhandled): ${_str}") + endif() + endif() + + # Step 3: Set compiler-specific flags (e.g., -m/-mno-) + if(MSVC AND MSVC_VERSION GREATER 1700) + _ofa_find(_enable_extension_flag_list "avx512f" _found) + if(_found) + AddCXXCompilerFlag("/arch:AVX512" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _found) + endif() + if(NOT _found) + _ofa_find(_enable_extension_flag_list "avx2" _found) + if(_found) + AddCXXCompilerFlag("/arch:AVX2" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _found) endif() - if(NOT _found) - _ofa_find(_enable_extension_flag_list "sse2" _found) - if(_found) - AddCXXCompilerFlag("/arch:SSE2" FLAGS OFA_ARCHITECTURE_FLAGS) - endif() + endif() + if(NOT _found) + _ofa_find(_enable_extension_flag_list "avx" _found) + if(_found) + AddCXXCompilerFlag("/arch:AVX" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _found) endif() - if(NOT _found) - _ofa_find(_enable_extension_flag_list "sse" _found) 
- if(_found) - AddCXXCompilerFlag("/arch:SSE" FLAGS OFA_ARCHITECTURE_FLAGS) - endif() + endif() + if(NOT _found) + _ofa_find(_enable_extension_flag_list "sse2" _found) + if(_found) + AddCXXCompilerFlag("/arch:SSE2" FLAGS OFA_ARCHITECTURE_FLAGS) endif() - foreach(_extension ${_enable_extension_flag_list}) - string(TOUPPER "${_extension}" _extension) - string(REPLACE "." "_" _extension "__${_extension}__") - add_definitions("-D${_extension}") - endforeach(_extension) + endif() + if(NOT _found) + _ofa_find(_enable_extension_flag_list "sse" _found) + if(_found) + AddCXXCompilerFlag("/arch:SSE" FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + endif() + foreach(_extension ${_enable_extension_flag_list}) + string(TOUPPER "${_extension}" _extension) + string(REPLACE "." "_" _extension "__${_extension}__") + add_definitions("-D${_extension}") + endforeach(_extension) - elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel" - OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel" + OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") - if(WIN32) - # Intel (on Windows) - set(OFA_map_knl "-QxKNL;-QxMIC-AVX512") - set(OFA_map_knm "-QxKNM;-QxMIC-AVX512") - set(OFA_map_rocketlake "-QxROCKETLAKE;-QxCORE-AVX512") - set(OFA_map_sapphirerapids "-QxSAPPHIRERAPIDS;-QxCORE-AVX512") - set(OFA_map_alderlake "-QxALDERLAKE;-QxCORE-AVX512") - set(OFA_map_tigerlake "-QxTIGERLAKE;-QxCORE-AVX512") - set(OFA_map_icelake-server "-QxICELAKE-SERVER;-QxCORE-AVX512") - set(OFA_map_icelake-avx512 "-QxICELAKE-SERVER;-QxCORE-AVX512") - set(OFA_map_icelake-client "-QxICELAKE-CLIENT;-QxCORE-AVX512") - set(OFA_map_icelake "-QxICELAKE-CLIENT;-QxCORE-AVX512") - set(OFA_map_cannonlake "-QxCANNONLAKE;-QxCORE-AVX512") - set(OFA_map_cooperlake "-QxCOOPERLAKE;-QxCORE-AVX512") - set(OFA_map_cascadelake "-QxCASCADELAKE;-QxCORE-AVX512") - set(OFA_map_skylake-avx512 "-QxSKYLAKE-AVX512;-QxCORE-AVX512") - set(OFA_map_skylake "-QxSKYLAKE;-QxCORE-AVX2") - set(OFA_map_broadwell "-QxBROADWELL;-QxCORE-AVX2") - 
set(OFA_map_haswell "-QxHASWELL;-QxCORE-AVX2") - set(OFA_map_ivybridge "-QxIVYBRIDGE;-QxCORE-AVX-I") - set(OFA_map_sandybridge "-QxSANDYBRIDGE;-QxAVX") - set(OFA_map_westmere "-QxSSE4.2") - set(OFA_map_nehalem "-QxSSE4.2") - set(OFA_map_penryn "-QxSSSE3") - set(OFA_map_merom "-QxSSSE3") - set(OFA_map_core2 "-QxSSE3") - set(_ok FALSE) - else() - # Intel (in Linux) - set(OFA_map_knl "-xKNL;-xMIC-AVX512") - set(OFA_map_knm "-xKNM;-xMIC-AVX512") - set(OFA_map_rocketlake "-xROCKETLAKE;-xCORE-AVX512") - set(OFA_map_sapphirerapids "-xSAPPHIRERAPIDS;-xCORE-AVX512") - set(OFA_map_alderlake "-xALDERLAKE;-xCORE-AVX512") - set(OFA_map_tigerlake "-xTIGERLAKE;-xCORE-AVX512") - set(OFA_map_icelake-server "-xICELAKE-SERVER;-xCORE-AVX512") - set(OFA_map_icelake-avx512 "-xICELAKE-SERVER;-xCORE-AVX512") - set(OFA_map_icelake-client "-xICELAKE-CLIENT;-xCORE-AVX512") - set(OFA_map_icelake "-xICELAKE-CLIENT;-xCORE-AVX512") - set(OFA_map_cannonlake "-xCANNONLAKE;-xCORE-AVX512") - set(OFA_map_cooperlake "-xCOOPERLAKE;-xCORE-AVX512") - set(OFA_map_cascadelake "-xCASCADELAKE;-xCORE-AVX512") - set(OFA_map_skylake-avx512 "-xSKYLAKE-AVX512;-xCORE-AVX512") - set(OFA_map_skylake "-xSKYLAKE;-xCORE-AVX2") - set(OFA_map_broadwell "-xBROADWELL;-xCORE-AVX2") - set(OFA_map_haswell "-xHASWELL;-xCORE-AVX2") - set(OFA_map_ivybridge "-xIVYBRIDGE;-xCORE-AVX-I") - set(OFA_map_sandybridge "-xSANDYBRIDGE;-xAVX") - set(OFA_map_westmere "-xSSE4.2") - set(OFA_map_nehalem "-xSSE4.2") - set(OFA_map_penryn "-xSSSE3") - set(OFA_map_merom "-xSSSE3") - set(OFA_map_core2 "-xSSE3") - set(_ok FALSE) - endif() + if(WIN32) + # Intel (on Windows) + set(OFA_map_knl "-QxKNL;-QxMIC-AVX512") + set(OFA_map_knm "-QxKNM;-QxMIC-AVX512") + set(OFA_map_rocketlake "-QxROCKETLAKE;-QxCORE-AVX512") + set(OFA_map_sapphirerapids "-QxSAPPHIRERAPIDS;-QxCORE-AVX512") + set(OFA_map_alderlake "-QxALDERLAKE;-QxCORE-AVX512") + set(OFA_map_tigerlake "-QxTIGERLAKE;-QxCORE-AVX512") + set(OFA_map_icelake-server "-QxICELAKE-SERVER;-QxCORE-AVX512") + 
set(OFA_map_icelake-avx512 "-QxICELAKE-SERVER;-QxCORE-AVX512") + set(OFA_map_icelake-client "-QxICELAKE-CLIENT;-QxCORE-AVX512") + set(OFA_map_icelake "-QxICELAKE-CLIENT;-QxCORE-AVX512") + set(OFA_map_cannonlake "-QxCANNONLAKE;-QxCORE-AVX512") + set(OFA_map_cooperlake "-QxCOOPERLAKE;-QxCORE-AVX512") + set(OFA_map_cascadelake "-QxCASCADELAKE;-QxCORE-AVX512") + set(OFA_map_skylake-avx512 "-QxSKYLAKE-AVX512;-QxCORE-AVX512") + set(OFA_map_skylake "-QxSKYLAKE;-QxCORE-AVX2") + set(OFA_map_broadwell "-QxBROADWELL;-QxCORE-AVX2") + set(OFA_map_haswell "-QxHASWELL;-QxCORE-AVX2") + set(OFA_map_ivybridge "-QxIVYBRIDGE;-QxCORE-AVX-I") + set(OFA_map_sandybridge "-QxSANDYBRIDGE;-QxAVX") + set(OFA_map_westmere "-QxSSE4.2") + set(OFA_map_nehalem "-QxSSE4.2") + set(OFA_map_penryn "-QxSSSE3") + set(OFA_map_merom "-QxSSSE3") + set(OFA_map_core2 "-QxSSE3") + set(_ok FALSE) + else() + # Intel (in Linux) + set(OFA_map_knl "-xKNL;-xMIC-AVX512") + set(OFA_map_knm "-xKNM;-xMIC-AVX512") + set(OFA_map_rocketlake "-xROCKETLAKE;-xCORE-AVX512") + set(OFA_map_sapphirerapids "-xSAPPHIRERAPIDS;-xCORE-AVX512") + set(OFA_map_alderlake "-xALDERLAKE;-xCORE-AVX512") + set(OFA_map_tigerlake "-xTIGERLAKE;-xCORE-AVX512") + set(OFA_map_icelake-server "-xICELAKE-SERVER;-xCORE-AVX512") + set(OFA_map_icelake-avx512 "-xICELAKE-SERVER;-xCORE-AVX512") + set(OFA_map_icelake-client "-xICELAKE-CLIENT;-xCORE-AVX512") + set(OFA_map_icelake "-xICELAKE-CLIENT;-xCORE-AVX512") + set(OFA_map_cannonlake "-xCANNONLAKE;-xCORE-AVX512") + set(OFA_map_cooperlake "-xCOOPERLAKE;-xCORE-AVX512") + set(OFA_map_cascadelake "-xCASCADELAKE;-xCORE-AVX512") + set(OFA_map_skylake-avx512 "-xSKYLAKE-AVX512;-xCORE-AVX512") + set(OFA_map_skylake "-xSKYLAKE;-xCORE-AVX2") + set(OFA_map_broadwell "-xBROADWELL;-xCORE-AVX2") + set(OFA_map_haswell "-xHASWELL;-xCORE-AVX2") + set(OFA_map_ivybridge "-xIVYBRIDGE;-xCORE-AVX-I") + set(OFA_map_sandybridge "-xSANDYBRIDGE;-xAVX") + set(OFA_map_westmere "-xSSE4.2") + set(OFA_map_nehalem "-xSSE4.2") + 
set(OFA_map_penryn "-xSSSE3") + set(OFA_map_merom "-xSSSE3") + set(OFA_map_core2 "-xSSE3") + set(_ok FALSE) + endif() - foreach(_arch ${_march_flag_list}) - if(DEFINED OFA_map_${_arch}) - foreach(_flag ${OFA_map_${_arch}}) - AddCXXCompilerFlag(${_flag} FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) - if(_ok) - break() - endif() - endforeach() + foreach(_arch ${_march_flag_list}) + if(DEFINED OFA_map_${_arch}) + foreach(_flag ${OFA_map_${_arch}}) + AddCXXCompilerFlag(${_flag} FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) if(_ok) break() endif() + endforeach() + if(_ok) + break() endif() - endforeach() - if(NOT _ok) - # This is the Intel compiler, so SSE2 is a very reasonable baseline. - message(STATUS "[OFA] Did not recognize the requested architecture flag ${_arch}, falling back to SSE2") - if(WIN32) - AddCXXCompilerFlag("-QxSSE2" FLAGS OFA_ARCHITECTURE_FLAGS) - else() - AddCXXCompilerFlag("-xSSE2" FLAGS OFA_ARCHITECTURE_FLAGS) - endif() endif() + endforeach() + if(NOT _ok) + # This is the Intel compiler, so SSE2 is a very reasonable baseline. 
+ message(STATUS "[OFA] Did not recognize the requested architecture flag ${_arch}, falling back to SSE2") + if(WIN32) + AddCXXCompilerFlag("-QxSSE2" FLAGS OFA_ARCHITECTURE_FLAGS) + else() + AddCXXCompilerFlag("-xSSE2" FLAGS OFA_ARCHITECTURE_FLAGS) + endif() + endif() - # Set -m<_extension> flag for enabled features - foreach(_extension ${_enable_extension_flag_list}) - AddCXXCompilerFlag("${_enable}${_extension}" FLAGS OFA_ARCHITECTURE_FLAGS) - endforeach(_extension) + # Set -m<_extension> flag for enabled features + foreach(_extension ${_enable_extension_flag_list}) + AddCXXCompilerFlag("${_enable_flag}${_extension}" FLAGS OFA_ARCHITECTURE_FLAGS) + endforeach(_extension) - # Set -mno-<_extension> flag for disabled features + # Set -mno-<_extension> flag for disabled features + if(DEFINED _disable_flag) foreach(_extension ${_disable_extension_flag_list}) - AddCXXCompilerFlag("${_disable}${_extension}" FLAGS OFA_ARCHITECTURE_FLAGS) + AddCXXCompilerFlag("${_disable_flag}${_extension}" FLAGS OFA_ARCHITECTURE_FLAGS) endforeach(_extension) + endif() - elseif(CMAKE_CXX_COMPILER_ID MATCHES "SunPro") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "SunPro") - # Set -xtarget flag - foreach(_flag ${_march_flag_list}) - AddCXXCompilerFlag("-xtarget=${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _good) - if(_good) - break() - endif(_good) - endforeach(_flag) - - # Set -xarch= flag for enabled features - foreach(_flag ${_enable_extension_flag_list}) - AddCXXCompilerFlag("-xarch=${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) - endforeach(_flag) - - # TODO PGI/Cray .. 
- - else() - # Others: GNU, Clang and variants + # Set -xtarget flag + foreach(_flag ${_march_flag_list}) + AddCXXCompilerFlag("-xtarget=${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _good) + if(_good) + break() + endif(_good) + endforeach(_flag) - # Set -march flag - foreach(_flag ${_march_flag_list}) - AddCXXCompilerFlag("-march=${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _good) - if(_good) - break() - endif(_good) - endforeach(_flag) - - # Set -m flag for enabled features - foreach(_flag ${_enable_extension_flag_list}) - AddCXXCompilerFlag("-m${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) - endforeach(_flag) - - # Set -mno-feature flag for disabled features - foreach(_flag ${_disable_extension_flag_list}) - AddCXXCompilerFlag("-mno-${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) - endforeach(_flag) - endif() - endif() + # Set -xarch= flag for enabled features + foreach(_flag ${_enable_extension_flag_list}) + AddCXXCompilerFlag("-xarch=${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) + endforeach(_flag) + # TODO PGI/Cray ... 
+ + else() + # Others: GNU, Clang and variants + + # Set -march flag + foreach(_flag ${_march_flag_list}) + AddCXXCompilerFlag("-march=${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _good) + if(_good) + break() + endif(_good) + endforeach(_flag) + + # Set -m flag for enabled features + foreach(_flag ${_enable_extension_flag_list}) + AddCXXCompilerFlag("-m${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) + endforeach(_flag) + + # Set -mno-feature flag for disabled features + foreach(_flag ${_disable_extension_flag_list}) + AddCXXCompilerFlag("-mno-${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS) + endforeach(_flag) + endif() endif() # Compile code with profiling instrumentation @@ -736,6 +740,7 @@ macro(OFA_HandleX86Options) endif() endif() + # Remove duplicate flags list(REMOVE_DUPLICATES OFA_ARCHITECTURE_FLAGS) if(OFA_VERBOSE) From c1895d2d7ea144b47df4234602ecbcbb50af7dc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Fri, 21 Jan 2022 19:16:11 +0100 Subject: [PATCH 146/174] Replaced benchmark namespace by utils --- examples/performance_benchmark.cpp | 18 +++++++++--------- src/gsIO/gsBenchmark.cpp | 8 ++++---- src/gsIO/gsBenchmark.h | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index ec29f998e0..ec971da6d9 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -1242,8 +1242,8 @@ int main(int argc, char *argv[]) auto bmB = benchmark.find(benchmark_eigen_memcopy::label()); if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { - auto bm = benchmark::ratio("memcopyRatio", - "Memory copy (gsVector : native C array)", *bmB, *bmA); + auto bm = utils::ratio("memcopyRatio", + "Memory copy (gsVector : native C array)", *bmB, *bmA); benchmark.get().push_back( give(bm) ); } } @@ -1253,8 +1253,8 @@ int main(int argc, char *argv[]) auto bmB = benchmark.find(benchmark_eigen_dotproduct::label()); if (bmA != 
std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { - auto bm = benchmark::ratio("dotproductRatio", - "Dot product (gsVector : native C array)", *bmB, *bmA); + auto bm = utils::ratio("dotproductRatio", + "Dot product (gsVector : native C array)", *bmB, *bmA); benchmark.get().push_back( give(bm) ); } } @@ -1264,8 +1264,8 @@ int main(int argc, char *argv[]) auto bmB = benchmark.find(benchmark_eigen_axpy::label()); if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { - auto bm = benchmark::ratio("axpyRatio", - "AXPY (gsVector : native C array)", *bmB, *bmA); + auto bm = utils::ratio("axpyRatio", + "AXPY (gsVector : native C array)", *bmB, *bmA); benchmark.get().push_back( give(bm) ); } } @@ -1275,9 +1275,9 @@ int main(int argc, char *argv[]) auto bmB = benchmark.find(benchmark_eigen_dense_matmul::label()); if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { - auto bm = benchmark::ratio("densematmulRatio", - "Dense matrix-vector multiplication (gsMatrix/gsVector : native C array)", - *bmB, *bmA); + auto bm = utils::ratio("densematmulRatio", + "Dense matrix-vector multiplication (gsMatrix/gsVector : native C array)", + *bmB, *bmA); benchmark.get().push_back( give(bm) ); } } diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index 84d294413b..fa96fbe100 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -344,10 +344,10 @@ namespace benchmark { char id('A'); for (const auto& it : util::zip(objA.get(), objB.get())) { - results.push_back( give(benchmark::ratio(std::get<0>(it).get_label()+std::string(1,id++), - std::get<0>(it).get_descr(), - std::get<0>(it), - std::get<1>(it))) ); + results.push_back( give(utils::ratio(std::get<0>(it).get_label()+std::string(1,id++), + std::get<0>(it).get_descr(), + std::get<0>(it), + std::get<1>(it))) ); } gsBenchmarkSet benchmark(label, descr, give(results) ); diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index 
5dda7f6ca4..3a79434d1d 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -530,7 +530,7 @@ class gsXml< gsBenchmark > }; } // namespace internal -namespace benchmark { +namespace utils { /// \brief Returns the ratio of the two given benchmark result sets GISMO_EXPORT gsBenchmarkResultSet ratio(const std::string& label, @@ -543,6 +543,6 @@ namespace benchmark { const std::string& descr, const gsBenchmarkSet objA, const gsBenchmarkSet objB); -} // namespace benchmark +} // namespace utils } // namespace gismo From 3392061dc9f8b3bfd6b8c303ab08b75ff38c8f67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Sun, 23 Jan 2022 19:38:27 +0100 Subject: [PATCH 147/174] Updated OFA --- cmake/ofa/HandleArmOptions.cmake | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cmake/ofa/HandleArmOptions.cmake b/cmake/ofa/HandleArmOptions.cmake index 20d5d56daa..bee29d25bd 100644 --- a/cmake/ofa/HandleArmOptions.cmake +++ b/cmake/ofa/HandleArmOptions.cmake @@ -693,33 +693,33 @@ macro(OFA_HandleArmOptions) elseif(TARGET_ARCHITECTURE STREQUAL "apple-a7") list(APPEND _mtune_flag_list "apple-a7") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_extension_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") + list(APPEND _available_extension_list "aes" "crypto" "fp" "simd" "sha2" "zcm" "zcz") elseif(TARGET_ARCHITECTURE STREQUAL "apple-a8") list(APPEND _mtune_flag_list "apple-a8") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_extension_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") + list(APPEND _available_extension_list "aes" "crypto" "fp" "simd" "sha2" "zcm" "zcz") elseif(TARGET_ARCHITECTURE STREQUAL "apple-a9") list(APPEND _mtune_flag_list "apple-a9") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_extension_list "aes" "crypto" "fp" "neon" "sha2" "zcm" "zcz") + list(APPEND _available_extension_list "aes" "crypto" "fp" "simd" "sha2" "zcm" "zcz") 
elseif(TARGET_ARCHITECTURE STREQUAL "apple-a10") list(APPEND _mtune_flag_list "apple-a10") list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "neon" "rdm" "sha2" "zcm" "zcz") + list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "simd" "rdm" "sha2" "zcm" "zcz") elseif(TARGET_ARCHITECTURE STREQUAL "apple-a11") list(APPEND _mtune_flag_list "apple-a11") list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "lse" "neon" "ras" "rdm" "sha2" "zcm" "zcz") + list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "lse" "simd" "ras" "rdm" "sha2" "zcm" "zcz") elseif(TARGET_ARCHITECTURE STREQUAL "apple-a12") list(APPEND _mtune_flag_list "apple-a12") list(APPEND _march_flag_list "armv8.3-a") list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") + list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "fp16" "lse" "simd" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") elseif(TARGET_ARCHITECTURE STREQUAL "apple-a13") list(APPEND _mtune_flag_list "apple-a13") list(APPEND _march_flag_list "armv8.4-a") @@ -727,14 +727,14 @@ macro(OFA_HandleArmOptions) list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_extension_list "aes" "crc" "crypto" "dotprod" "fp" "fp16" "fp16fml" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "sha3" "sm4" "zcm" "zcz") + list(APPEND _available_extension_list "aes" "crc" "crypto" "dotprod" "fp" "fp16" "fp16fml" "lse" "simd" "ras" "rcpc" "rdm" "sha2" "sha3" "sm4" "zcm" "zcz") elseif(TARGET_ARCHITECTURE STREQUAL "apple-m1") list(APPEND 
_mtune_flag_list "vortex") list(APPEND _march_flag_list "armv8.3-a") list(APPEND _march_flag_list "armv8.2-a") list(APPEND _march_flag_list "armv8.1-a") list(APPEND _march_flag_list "armv8-a") - list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "fp16" "lse" "neon" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") + list(APPEND _available_extension_list "aes" "crc" "crypto" "fp" "fp16" "lse" "simd" "ras" "rcpc" "rdm" "sha2" "zcm" "zcz") # Others elseif(TARGET_ARCHITECTURE STREQUAL "generic") From 3c2ac142e7d9fce0bdb21ce02c7a30843dc9d889 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Tue, 25 Jan 2022 15:45:59 +0100 Subject: [PATCH 148/174] Fixed bug in gsBenchmark --- examples/performance_benchmark.cpp | 18 +++++++++--------- src/gsIO/gsBenchmark.cpp | 12 ++++++------ src/gsIO/gsBenchmark.h | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index ec971da6d9..23d51646da 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -1242,8 +1242,8 @@ int main(int argc, char *argv[]) auto bmB = benchmark.find(benchmark_eigen_memcopy::label()); if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { - auto bm = utils::ratio("memcopyRatio", - "Memory copy (gsVector : native C array)", *bmB, *bmA); + auto bm = util::ratio("memcopyRatio", + "Memory copy (gsVector : native C array)", *bmB, *bmA); benchmark.get().push_back( give(bm) ); } } @@ -1253,8 +1253,8 @@ int main(int argc, char *argv[]) auto bmB = benchmark.find(benchmark_eigen_dotproduct::label()); if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { - auto bm = utils::ratio("dotproductRatio", - "Dot product (gsVector : native C array)", *bmB, *bmA); + auto bm = util::ratio("dotproductRatio", + "Dot product (gsVector : native C array)", *bmB, *bmA); benchmark.get().push_back( give(bm) ); } } @@ -1264,8 +1264,8 @@ int main(int argc, char 
*argv[]) auto bmB = benchmark.find(benchmark_eigen_axpy::label()); if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { - auto bm = utils::ratio("axpyRatio", - "AXPY (gsVector : native C array)", *bmB, *bmA); + auto bm = util::ratio("axpyRatio", + "AXPY (gsVector : native C array)", *bmB, *bmA); benchmark.get().push_back( give(bm) ); } } @@ -1275,9 +1275,9 @@ int main(int argc, char *argv[]) auto bmB = benchmark.find(benchmark_eigen_dense_matmul::label()); if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { - auto bm = utils::ratio("densematmulRatio", - "Dense matrix-vector multiplication (gsMatrix/gsVector : native C array)", - *bmB, *bmA); + auto bm = util::ratio("densematmulRatio", + "Dense matrix-vector multiplication (gsMatrix/gsVector : native C array)", + *bmB, *bmA); benchmark.get().push_back( give(bm) ); } } diff --git a/src/gsIO/gsBenchmark.cpp b/src/gsIO/gsBenchmark.cpp index fa96fbe100..28b9b015b1 100644 --- a/src/gsIO/gsBenchmark.cpp +++ b/src/gsIO/gsBenchmark.cpp @@ -309,7 +309,7 @@ namespace gismo return os; } -namespace benchmark { +namespace util { gsBenchmarkResultSet ratio(const std::string& label, const std::string& descr, @@ -344,15 +344,15 @@ namespace benchmark { char id('A'); for (const auto& it : util::zip(objA.get(), objB.get())) { - results.push_back( give(utils::ratio(std::get<0>(it).get_label()+std::string(1,id++), - std::get<0>(it).get_descr(), - std::get<0>(it), - std::get<1>(it))) ); + results.push_back( give(util::ratio(std::get<0>(it).get_label()+std::string(1,id++), + std::get<0>(it).get_descr(), + std::get<0>(it), + std::get<1>(it))) ); } gsBenchmarkSet benchmark(label, descr, give(results) ); return benchmark; } -} // namespace benchmark +} // namespace util } // namespace gismo diff --git a/src/gsIO/gsBenchmark.h b/src/gsIO/gsBenchmark.h index 3a79434d1d..0a771855e4 100644 --- a/src/gsIO/gsBenchmark.h +++ b/src/gsIO/gsBenchmark.h @@ -530,7 +530,7 @@ class gsXml< gsBenchmark > }; } 
// namespace internal -namespace utils { +namespace util { /// \brief Returns the ratio of the two given benchmark result sets GISMO_EXPORT gsBenchmarkResultSet ratio(const std::string& label, @@ -543,6 +543,6 @@ namespace utils { const std::string& descr, const gsBenchmarkSet objA, const gsBenchmarkSet objB); -} // namespace utils +} // namespace util } // namespace gismo From 088f6f62a7c80adcd7fbda7db325a915dd717f00 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Tue, 25 Jan 2022 18:29:02 +0100 Subject: [PATCH 149/174] Update OFA - x86 completed --- cmake/ofa/AddCXXCompilerFlag.cmake | 1 + cmake/ofa/ChecksX86.txt | 43 +++++++++++++++++++++--------- cmake/ofa/HandleX86Options.cmake | 43 ++++++++++++++++++++++++------ 3 files changed, 66 insertions(+), 21 deletions(-) diff --git a/cmake/ofa/AddCXXCompilerFlag.cmake b/cmake/ofa/AddCXXCompilerFlag.cmake index 1ac2974e5e..3b41ddfa13 100644 --- a/cmake/ofa/AddCXXCompilerFlag.cmake +++ b/cmake/ofa/AddCXXCompilerFlag.cmake @@ -163,6 +163,7 @@ macro(AddCXXCompilerFlag _flag) # Some compilers do not fail with a bad flag FAIL_REGEX "error: bad value (.*) for .* switch" # GNU FAIL_REGEX "argument unused during compilation" # clang + FAIL_REGEX "warning: the flag .* has been deprecated" # clang FAIL_REGEX "is valid for .* but not for C\\\\+\\\\+" # GNU FAIL_REGEX "unrecognized .*option" # GNU FAIL_REGEX "ignored for target" # GNU diff --git a/cmake/ofa/ChecksX86.txt b/cmake/ofa/ChecksX86.txt index 2acc303ec9..64b418f4d7 100644 --- a/cmake/ofa/ChecksX86.txt +++ b/cmake/ofa/ChecksX86.txt @@ -59,7 +59,7 @@ # # pop_enable:SunPro -# MSVC and Oracle's SunPro compiler fails these checks +# MSVC and Oracle's SunPro compiler fail these checks push_disable:MSVC,SunPro # MMX @@ -106,15 +106,11 @@ amx-int8;immintrin.h;_tile_dpbssd;0,1,2 amx-tile;immintrin.h;_tile_zero;0 # Other -adx;immintrin.h;_addcarryx_u32;(unsigned char)0,(unsigned int)1,(unsigned int)1,new unsigned int[1];adcx +adx;immintrin.h;_addcarryx_u32;(unsigned 
char)0,(unsigned int)1,(unsigned int)1,new unsigned int[1] aes;wmmintrin.h;_mm_aesdec_si128;_mm_setzero_si128(),_mm_setzero_si128() bmi2;immintrin.h;_bzhi_u32;(unsigned int)1,(unsigned int)1 -bmi;immintrin.h;_andn_u32;(unsigned int)1,(unsigned int)1 -cldemote;immintrin.h;_mm_cldemote;(void const*)NULL -clflushopt;immintrin.h;_mm_clflushopt;(void const*)NULL -clwb;immintrin.h;_mm_clwb;(void const*)NULL enqcmd;immintrin.h;_enqcmd;(void*)NULL,(void const*)NULL -f16c;emmintrin.h;_mm_cvtph_ps;_mm_setzero_si128() +f16c;immintrin.h;_mm_cvtph_ps;_mm_setzero_si128() fsgsbase;immintrin.h;_readfsbase_u32; fxsr;immintrin.h;_fxrstor;(void*)NULL gfni,avx512vl;immintrin.h;_mm_gf2p8mul_epi8;_mm_setzero_si128(),_mm_setzero_si128() @@ -128,11 +124,11 @@ movbe;immintrin.h;_loadbe_i16;(void const*)NULL movdir64b;immintrin.h;_movdir64b;(void*)NULL,(const void*)NULL movdiri;immintrin.h;_directstoreu_u32;(void*)NULL,(unsigned int)1 mpx;immintrin.h;_bnd_chk_ptr_lbounds;(const void*)NULL -pclmulqdq;wmmintrin.h;_mm_clmulepi64_si128;_mm_setzero_si128(),_mm_setzero_si128(),(const int)0;pclmul +pclmul;wmmintrin.h;_mm_clmulepi64_si128;_mm_setzero_si128(),_mm_setzero_si128(),(const int)0;pclmul pconfig;immintrin.h;_pconfig_u32;(const int)1,new size_t[1] -pku;;exit;0 +pku;cstdlib;exit;0 popcnt;immintrin.h;_mm_popcnt_u32;(unsigned int)1 -prefetchw;;_m_prefetchw;(void*)NULL;prfchw +prfchw;immintrin.h;_m_prefetchw;(void*)NULL prefetchwt1;xmmintrin.h;_mm_prefetch;(char const*)NULL,(int)1 ptwrite;immintrin.h;_ptwrite32;(unsigned int)0 rdpid;immintrin.h;_rdpid_u32; @@ -149,17 +145,37 @@ vaes,avx512vl;immintrin.h;_mm256_aesdec_epi128;_mm256_setzero_si256(),_mm256_set vpclmulqdq,avx512vl;immintrin.h;_mm256_clmulepi64_epi128;_mm256_setzero_si256(),_mm256_setzero_si256(),(const int)1 waitpkg;immintrin.h;_umonitor;(void*)NULL wbnoinvd;immintrin.h;_wbnoinvd; -xsavec,xsave;immintrin.h;_xsavec;(void*)NULL,(unsigned __m256i)0 -xsaveopt,xsave;immintrin.h;xsaveopt;(void*)NULL,(unsigned __m256i)0 
+xsavec,xsave;immintrin.h;_xsavec;(void*)NULL,(unsigned long long)0 +xsaveopt,xsave;immintrin.h;_xsaveopt;(void*)NULL,(unsigned long long)0 xsaves;immintrin.h;_xgetbv;(unsigned int)1 -xss,xsave;immintrin.h;_xrstors;(const void*)NULL,(unsigned __m256i)0 +xss,xsave;immintrin.h;_xrstors;(const void*)NULL,(unsigned long long)0 + +# GNU GCC fails the following tests ... +push_disable:GNU +abm;x86intrin.h;_bextri_u32;(unsigned int)0,(unsigned int)0 +bmi;immintrin.h;_andn_u32;(unsigned int)1,(unsigned int)1 +cldemote;immintrin.h;_mm_cldemote;(void const*)NULL +clflushopt;immintrin.h;_mm_clflushopt;(void const*)NULL +clwb;immintrin.h;_mm_clwb;(void const*)NULL +pop_disable:GNU + +# ... and needs a slightly modified implementation +push_enable:GNU +abm;x86intrin.h;__bextri_u32;(unsigned int)0,(unsigned int)0 +bmi;immintrin.h;__andn_u32;(unsigned int)1,(unsigned int)1 +cldemote;immintrin.h;_cldemote;(void*)NULL +clflushopt;immintrin.h;_mm_clflushopt;(void*)NULL +clwb;immintrin.h;_mm_clwb;(void*)NULL +pop_enable:GNU pop_disable:MSVC,SunPro + # Special checks for the MSVC compiler push_enable:MSVC # SSE/SSE2/SSE3/SSE4.1/SSE4.2/SSE4A/AVX/AVX2/FMA +abm;ammintrin.h;_bextri_u32;(unsigned int)0,(unsigned int)0 SSE;xmmintrin.h;_mm_add_ps;_mm_setzero_ps(),_mm_setzero_ps();sse SSE2;emmintrin.h;_mm_add_epi16;_mm_setzero_si128(),_mm_setzero_si128();sse2 AVX;immintrin.h;_mm256_add_pd;_mm256_setzero_pd(),_mm256_setzero_pd();avx @@ -170,6 +186,7 @@ AVX512;immintrin.h;_mm512_abs_epi32;_mm512_setzero_si512();avx512f pop_enable:MSVC + # Special checks for Oracle's SunPro compiler # https://docs.oracle.com/cd/E77782_01/html/E77792/gqexw.html push_enable:SunPro diff --git a/cmake/ofa/HandleX86Options.cmake b/cmake/ofa/HandleX86Options.cmake index df32918395..bcb823baee 100644 --- a/cmake/ofa/HandleX86Options.cmake +++ b/cmake/ofa/HandleX86Options.cmake @@ -85,12 +85,12 @@ macro(OFA_HandleX86Options) list(APPEND _march_flag_list "haswell") list(APPEND _march_flag_list "core-avx2") _ivybridge() - 
list(APPEND _available_extension_list "avx2" "fma" "bmi" "bmi2") + list(APPEND _available_extension_list "abm" "avx2" "fma" "bmi" "bmi2") endmacro() macro(_broadwell) list(APPEND _march_flag_list "broadwell") _haswell() - list(APPEND _available_extension_list "rdseed" "adcx" "prfchw") + list(APPEND _available_extension_list "rdseed" "adx" "prfchw") endmacro() macro(_skylake) list(APPEND _march_flag_list "skylake") @@ -356,9 +356,10 @@ macro(OFA_HandleX86Options) message(STATUS "[OFA] CPU architectures: " ${_str}) endif() if(_available_extension_list) + list(LENGTH _available_extension_list _len) string(REPLACE ";" ", " _str "${_available_extension_list}") string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (available): ${_str}") + message(STATUS "[OFA] Extensions (${_len} available): ${_str}") endif() endif() @@ -366,6 +367,7 @@ macro(OFA_HandleX86Options) set(_check_extension_flag_list) set(_disable_extension_flag_list) set(_enable_extension_flag_list) + set(_ignore_extension_flag_list) # Set compiler-specific option names if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") @@ -398,6 +400,7 @@ macro(OFA_HandleX86Options) elseif ("${_check}" MATCHES "^push_enable" ) # Start enable block list(GET _check 1 _push_enable_list) + string(REPLACE "," ";" _push_enable_list "${_push_enable_list}") _ofa_find(_push_enable_list "${CMAKE_CXX_COMPILER_ID}" _found) if(_found) list(PREPEND _skip_check FALSE) @@ -412,11 +415,14 @@ macro(OFA_HandleX86Options) elseif ("${_check}" MATCHES "^push_disable" ) # Start disable block list(GET _check 1 _push_disable_list) + string(REPLACE "," ";" _push_disable_list "${_push_disable_list}") _ofa_find(_push_disable_list "${CMAKE_CXX_COMPILER_ID}" _found) if(_found) list(PREPEND _skip_check TRUE) else() - list(PREPEND _skip_check FALSE) + # Compiler was not found in the list, so we keep its previous status + list(GET _skip_check 0 _skip) + list(PREPEND _skip_check ${_skip}) endif() continue() @@ -450,7 +456,7 @@ macro(OFA_HandleX86Options) 
else() set(_extension "${_extension_flag}") endif() - + list(APPEND _check_extension_list "${_extension}") # Define USE_<_extension_flag> variable @@ -505,6 +511,7 @@ macro(OFA_HandleX86Options) if(OFA_VERBOSE) message(STATUS "[OFA] Ignoring flag ${_enable_flag}${_extension_flag} because checks failed") endif() + list(APPEND _ignore_extension_flag_list "${_extension_flag}") continue() endif() list(APPEND _enable_extension_flag_list "${_extension_flag}") @@ -517,6 +524,7 @@ macro(OFA_HandleX86Options) if(OFA_VERBOSE) message(STATUS "[OFA] Ignoring flag ${_disable_flag}${_extension_flag} because checks failed") endif() + list(APPEND _ignore_extension_flag_list "${_extension_flag}") continue() endif() list(APPEND _disable_extension_flag_list "${_extension_flag}") @@ -524,19 +532,37 @@ macro(OFA_HandleX86Options) endforeach() if(OFA_VERBOSE) + # Print checked extension flags + if(_check_extension_flag_list) + list(LENGTH _check_extension_flag_list _len) + list(SORT _check_extension_flag_list) + string(REPLACE ";" ", " _str "${_check_extension_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (${_len} checked): ${_str}") + endif() # Print enabled extension flags if(_enable_extension_flag_list) + list(LENGTH _enable_extension_flag_list _len) list(SORT _enable_extension_flag_list) string(REPLACE ";" ", " _str "${_enable_extension_flag_list}") string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (enabled): ${_str}") + message(STATUS "[OFA] Extensions (${_len} enabled): ${_str}") endif() # Print disabled extension flags if(_disable_extension_flag_list) + list(LENGTH _disable_extension_flag_list _len) list(SORT _disable_extension_flag_list) string(REPLACE ";" ", " _str "${_disable_extension_flag_list}") string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (disabled): ${_str}") + message(STATUS "[OFA] Extensions (${_len} disabled): ${_str}") + endif() + # Print ignored extension flags + if(_ignore_extension_flag_list) + 
list(LENGTH _ignore_extension_flag_list _len) + list(SORT _ignore_extension_flag_list) + string(REPLACE ";" ", " _str "${_ignore_extension_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (${_len} ignored): ${_str}") endif() # Print unhandled extension flags set(_unhandled_extension_list) @@ -547,10 +573,11 @@ macro(OFA_HandleX86Options) endif() endforeach() if(_unhandled_extension_list) + list(LENGTH _unhandled_extension_list _len) list(SORT _unhandled_extension_list) string(REPLACE ";" ", " _str "${_unhandled_extension_list}") string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (unhandled): ${_str}") + message(STATUS "[OFA] Extensions (${_len} unhandled): ${_str}") endif() endif() From 6e8ea291e0d7cbf7cddf09c53cd9b369b970a535 Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Tue, 25 Jan 2022 20:21:58 +0100 Subject: [PATCH 150/174] Fixed small bug in AddCXXCompilerOptions --- cmake/AddCXXCompileOptions.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/AddCXXCompileOptions.cmake b/cmake/AddCXXCompileOptions.cmake index f4628a4f53..effb7abfcc 100644 --- a/cmake/AddCXXCompileOptions.cmake +++ b/cmake/AddCXXCompileOptions.cmake @@ -14,8 +14,8 @@ # CMAKE_CXXvv_EXTENSION_COMPILE_OPTIONS are not yet set by the regular # cmake routines, where vv is the value of CMAKE_CXX_STANDARD. -if(NOT CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION OR - NOT CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION) +if(NOT DEFINED CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION OR + NOT DEFINED CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION) message(WARNING "Update your CMake installation! 
We fall back to compiler options back ported from CMake 3.17.5") From bf357b29225dec4ee9472460443da4f3136522dd Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Wed, 26 Jan 2022 15:22:39 +0100 Subject: [PATCH 151/174] [ci skip] Fixed small bug on OFA - X86 checks --- cmake/ofa/ChecksX86.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/ofa/ChecksX86.txt b/cmake/ofa/ChecksX86.txt index 64b418f4d7..44467c4b28 100644 --- a/cmake/ofa/ChecksX86.txt +++ b/cmake/ofa/ChecksX86.txt @@ -175,7 +175,6 @@ pop_disable:MSVC,SunPro push_enable:MSVC # SSE/SSE2/SSE3/SSE4.1/SSE4.2/SSE4A/AVX/AVX2/FMA -abm;ammintrin.h;_bextri_u32;(unsigned int)0,(unsigned int)0 SSE;xmmintrin.h;_mm_add_ps;_mm_setzero_ps(),_mm_setzero_ps();sse SSE2;emmintrin.h;_mm_add_epi16;_mm_setzero_si128(),_mm_setzero_si128();sse2 AVX;immintrin.h;_mm256_add_pd;_mm256_setzero_pd(),_mm256_setzero_pd();avx From 599d11afc284af6bff1314c7db8146248cad51be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Wed, 26 Jan 2022 17:52:13 +0100 Subject: [PATCH 152/174] [ci skip] Updated OFA --- cmake/ofa/ChecksArm.txt | 17 +++++++++- cmake/ofa/HandleArmOptions.cmake | 53 +++++++++++++++++++++++--------- cmake/ofa/HandleX86Options.cmake | 2 ++ 3 files changed, 57 insertions(+), 15 deletions(-) diff --git a/cmake/ofa/ChecksArm.txt b/cmake/ofa/ChecksArm.txt index 85745097cf..b900522e52 100644 --- a/cmake/ofa/ChecksArm.txt +++ b/cmake/ofa/ChecksArm.txt @@ -65,6 +65,9 @@ crc;cstdlib;exit;0 crypto;cstdlib;exit;0 dotprod;cstdlib;exit;0 dsp;cstdlib;exit;0 +f32mm;cstdlib;exit;0 +f64mm;cstdlib;exit;0 +flagm;cstdlib;exit;0 fp;cstdlib;exit;0 fp16;cstdlib;exit;0 fp16fml;cstdlib;exit;0 @@ -73,22 +76,34 @@ fp_sp;cstdlib;exit;0 i8mm;cstdlib;exit;0 idiv;cstdlib;exit;0 lse;cstdlib;exit;0 +memtag;cstdlib;exit;0 mve;cstdlib;exit;0 mve_fp;cstdlib;exit;0 -neon;cstdlib;exit;0 +neon;arm_neon.h;vaddq_u32;uint32x4_t(),uint32x4_t() neon_fp16;cstdlib;exit;0 neon_vfpv4;cstdlib;exit;0 +pauth;cstdlib;exit;0 
+predres;cstdlib;exit;0 +profile;cstdlib;exit;0 ras;cstdlib;exit;0 rcpc;cstdlib;exit;0 rdm;cstdlib;exit;0 rdma;cstdlib;exit;0 +rng;cstdlib;exit;0 +sb;cstdlib;exit;0 sec;cstdlib;exit;0 sha2;cstdlib;exit;0 sha3;cstdlib;exit;0 simd;cstdlib;exit;0 sm4;cstdlib;exit;0 +ssbs;cstdlib;exit;0 sve;cstdlib;exit;0 sve2;cstdlib;exit;0 +sve2-aes;cstdlib;exit;0 +sve2-bitperm;cstdlib;exit;0 +sve2-sha3;cstdlib;exit;0 +sve2-sm4;cstdlib;exit;0 +tme;cstdlib;exit;0 vfpv3;cstdlib;exit;0 vfpv3_d16;cstdlib;exit;0 vfpv3_d16_fp16;cstdlib;exit;0 diff --git a/cmake/ofa/HandleArmOptions.cmake b/cmake/ofa/HandleArmOptions.cmake index bee29d25bd..cd272082fa 100644 --- a/cmake/ofa/HandleArmOptions.cmake +++ b/cmake/ofa/HandleArmOptions.cmake @@ -791,7 +791,9 @@ macro(OFA_HandleArmOptions) if(NOT _ok) # Fallback: set -march and -mtune flags set(_check_extension_list) - set(_enable_extension_list) + set(_check_extension_flag_list) + set(_enable_extension_flag_list) + set(_ignore_extension_flag_list) foreach(_flag ${_march_flag_list}) AddCXXCompilerFlag("${_march_flag}${_flag}" RESULT _ok) @@ -820,6 +822,7 @@ macro(OFA_HandleArmOptions) elseif ("${_check}" MATCHES "^push_enable" ) # Start enable block list(GET _check 1 _push_enable_list) + string(REPLACE "," ";" _push_enable_list "${_push_enable_list}") _ofa_find(_push_enable_list "${CMAKE_CXX_COMPILER_ID}" _found) if(_found) list(PREPEND _skip_check FALSE) @@ -834,11 +837,14 @@ macro(OFA_HandleArmOptions) elseif ("${_check}" MATCHES "^push_disable" ) # Start disable block list(GET _check 1 _push_disable_list) + string(REPLACE "," ";" _push_disable_list "${_push_disable_list}") _ofa_find(_push_disable_list "${CMAKE_CXX_COMPILER_ID}" _found) if(_found) list(PREPEND _skip_check TRUE) else() - list(PREPEND _skip_check FALSE) + # Compiler was not found in the list, so we keep its previous status + list(GET _skip_check 0 _skip) + list(PREPEND _skip_check ${_skip}) endif() continue() @@ -892,8 +898,6 @@ macro(OFA_HandleArmOptions) # Check if the compiler 
supports the -march=<_march>+<_extension_flag> # flag and can compile the provided test code with it set(_code "\nint main() { ${_check_function}(${_check_params})\; return 0\; }") - message(${_code}) - message("${_march_flag}${_march}+${_extension_flag}") AddCXXCompilerFlag("${_march_flag}${_march}+${_extension_flag}" HEADERS ${_check_headers} CODE "${_code}" @@ -928,19 +932,39 @@ macro(OFA_HandleArmOptions) if(OFA_VERBOSE) message(STATUS "[OFA] Ignoring flag ${_march_flag}${_march}+${_extension_flag} because checks failed") endif() + list(APPEND _ignore_extension_flag_list "${_extension_flag}") continue() endif() - list(APPEND _enable_extension_list "${_extension_flag}") + list(APPEND _enable_extension_flag_list "${_extension_flag}") + else() + list(APPEND _ignore_extension_flag_list "${_extension_flag}") endif() endforeach() if(OFA_VERBOSE) + # Print checked extension flags + if(_check_extension_flag_list) + list(LENGTH _check_extension_flag_list _len) + list(SORT _check_extension_flag_list) + string(REPLACE ";" ", " _str "${_check_extension_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (${_len} checked): ${_str}") + endif() # Print enabled extension flags - if(_enable_extension_list) - list(SORT _enable_extension_list) - string(REPLACE ";" ", " _str "${_enable_extension_list}") + if(_enable_extension_flag_list) + list(LENGTH _enable_extension_flag_list _len) + list(SORT _enable_extension_flag_list) + string(REPLACE ";" ", " _str "${_enable_extension_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (${_len} enabled): ${_str}") + endif() + # Print ignored extension flags + if(_ignore_extension_flag_list) + list(LENGTH _ignore_extension_flag_list _len) + list(SORT _ignore_extension_flag_list) + string(REPLACE ";" ", " _str "${_ignore_extension_flag_list}") string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (enabled): ${_str}") + message(STATUS "[OFA] Extensions (${_len} ignored): ${_str}") 
endif() # Print unhandled extension flags set(_unhandled_extension_list) @@ -951,26 +975,27 @@ macro(OFA_HandleArmOptions) endif() endforeach() if(_unhandled_extension_list) + list(LENGTH _unhandled_extension_list _len) list(SORT _unhandled_extension_list) string(REPLACE ";" ", " _str "${_unhandled_extension_list}") string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (unhandled): ${_str}") + message(STATUS "[OFA] Extensions (${_len} unhandled): ${_str}") endif() endif() # Step 3: Set compiler-specific flags (e.g., -m/-mno-) if(MSVC AND MSVC_VERSION GREATER 1900) - _ofa_find(_enable_extension_list "vfpv4" _found) + _ofa_find(_enable_extension_flag_list "vfpv4" _found) if(_found) AddCompilerFlag("/arch:VFPv4" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) endif() if(NOT _found) - _ofa_find(_enable_extension_list "simd" _found) + _ofa_find(_enable_extension_flag_list "simd" _found) if(_found) AddCompilerFlag("/arch:ARMv7VE" CXX_FLAGS OFA_ARCHITECTURE_FLAGS CXX_RESULT _found) endif() endif() - foreach(_flag ${_enable_extension_list}) + foreach(_flag ${_enable_extension_flag_list}) string(TOUPPER "${_flag}" _flag) string(REPLACE "." 
"_" _flag "__${_flag}__") add_definitions("-D${_flag}") @@ -994,7 +1019,7 @@ macro(OFA_HandleArmOptions) AddCXXCompilerFlag("-march=${_march}" RESULT _ok) if(_ok) set(_march_plus_extensions "${_march}") - foreach(_flag ${_enable_extension_list}) + foreach(_flag ${_enable_extension_flag_list}) AddCXXCompilerFlag("-march=${_march_plus_extensions}+${_flag}" RESULT _ok) if(_ok) set(_march_plus_extensions "${_march_plus_extensions}+${_flag}") diff --git a/cmake/ofa/HandleX86Options.cmake b/cmake/ofa/HandleX86Options.cmake index bcb823baee..f42636ba5c 100644 --- a/cmake/ofa/HandleX86Options.cmake +++ b/cmake/ofa/HandleX86Options.cmake @@ -528,6 +528,8 @@ macro(OFA_HandleX86Options) continue() endif() list(APPEND _disable_extension_flag_list "${_extension_flag}") + else() + list(APPEND _ignore_extension_flag_list "${_extension_flag}") endif() endforeach() From e8078ec4cddc55c5353c46153349b0ba7b47ffd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Fri, 28 Jan 2022 15:01:33 +0100 Subject: [PATCH 153/174] [ci skip] Updated OFA - cpuinfo x86 onder FreeBSD --- cmake/ofa/AutodetectX86.cmake | 20 ++++-- cmake/ofa/cpuinfo_x86.cxx | 130 ++++++++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+), 4 deletions(-) create mode 100644 cmake/ofa/cpuinfo_x86.cxx diff --git a/cmake/ofa/AutodetectX86.cmake b/cmake/ofa/AutodetectX86.cmake index a720e8caf7..814fede324 100644 --- a/cmake/ofa/AutodetectX86.cmake +++ b/cmake/ofa/AutodetectX86.cmake @@ -112,11 +112,23 @@ macro(OFA_AutodetectX86) string(REGEX REPLACE ".* Family ([0-9]+) .*" "\\1" _cpu_family "${_cpu_id}") string(REGEX REPLACE ".* Model ([0-9]+) .*" "\\1" _cpu_model "${_cpu_id}") string(REGEX REPLACE ".* Stepping ([0-9]+) .*" "\\1" _cpu_mstepping "${_cpu_id}") - - # TODO: BSD, Android, QNX, ... 
- + else() - message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") + # Try to retrieve CPUID directly + try_run(_exit _ok + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/cmake/ofa/cpuinfo_x86.cxx + RUN_OUTPUT_VARIABLE _cpuinfo) + + if(_ok AND ${_exit} EQUAL 0) + string(REGEX REPLACE ".*vendor_id[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _vendor_id "${_cpuinfo}") + string(REGEX REPLACE ".*cpu family[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_family "${_cpuinfo}") + string(REGEX REPLACE ".*model[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_model "${_cpuinfo}") + string(REGEX REPLACE ".*stepping[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_stepping "${_cpuinfo}") + string(REGEX REPLACE ".*flags[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}") + else() + message(FATAL_ERROR "OptimizeForArchitecture.cmake does not implement support for CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") + endif() endif() # Determine CPU from CPUID diff --git a/cmake/ofa/cpuinfo_x86.cxx b/cmake/ofa/cpuinfo_x86.cxx new file mode 100644 index 0000000000..506b6286c9 --- /dev/null +++ b/cmake/ofa/cpuinfo_x86.cxx @@ -0,0 +1,130 @@ +#include +#include +#include + +int main(){ + int a[4]; + for(int i=0; i<4; ++i) + a[i] = 0; + + // EAX=0: Highest Function Parameter and Manufacturer ID + __asm__("mov $0x0, %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%ebx, %0\n\t":"=r" (a[0])); + __asm__("mov %%edx, %0\n\t":"=r" (a[1])); + __asm__("mov %%ecx, %0\n\t":"=r" (a[2])); + + char vendorID[13]; vendorID[12] = 0; + memcpy(&vendorID[0],&a[0],4); + memcpy(&vendorID[4],&a[1],4); + memcpy(&vendorID[8],&a[2],4); + + printf ("vendor_id : %s\n", vendorID); + + // EAX=1: Processor Info and Feature Bits + __asm__("mov $0x1 , %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (a[0])); //gives model and family + __asm__("mov %%ebx, %0\n\t":"=r" (a[1])); //gives additional feature info + __asm__("mov %%ecx, %0\n\t":"=r" 
(a[2])); //feature flags + __asm__("mov %%edx, %0\n\t":"=r" (a[3])); //feature flags + + int stepping = a[0]>>0 & 0xF; + int model = a[0]>>4 & 0xF; + int family = a[0]>>8 & 0xF; + if(family == 6 || family == 15) + model += (a[0]>>16 & 0xF)<<4; + + printf ("cpu family : %d\n", family); + printf ("model : %d\n", model); + printf ("stepping : %d\n", stepping); + + // CPU flags + printf ("flags : "); + + // Features in EDX register + std::string edx_feature[] = { + "fpu", "vme", "de", "pse", + "tsc", "msr", "pae", "mce", + "cx8", "apic" , "", "sep", + "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "psn", "clflush", + "", "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", + "htt", "tm", "ia64", "pbe" }; + + for (int i=0; i<32; ++i) + printf ("%s", (a[3]>>i & 0x1) && !edx_feature[i].empty() ? (edx_feature[i]+" ").c_str() : ""); + + // Features in ECX register + std::string ecx_feature[] = { + "sse3", "pclmulqdq", "dtes64", "monitor", + "ds-cpl", "vmx", "smx", "est", + "tm2", "ssse3", "cnxt-id", "sdbg", + "fma", "cx16", "xtpr", "pdcm", + "", "pcid", "dca", "sse4_1", + "sse4_2", "x2apic", "movbe", "popcnt", + "tsc-deadline", "aes", "xsave", "osxsave", + "avx", "f16c", "rdrnd", "hypervisor" + }; + + for (int i=0; i<32; ++i) + printf ("%s", (a[2]>>i & 0x1) && !ecx_feature[i].empty() ? 
(ecx_feature[i]+" ").c_str() : ""); + + // EAX=7: Extended Features + __asm__("mov $0x7 , %eax\n\t"); + __asm__("mov $0x0 , %ecx\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (a[0])); //gives maximum ECX value + __asm__("mov %%ebx, %0\n\t":"=r" (a[1])); //extended feature flags + __asm__("mov %%ecx, %0\n\t":"=r" (a[2])); //extended feature flags + __asm__("mov %%edx, %0\n\t":"=r" (a[3])); //extended feature flags + + // Extended features in EBX register + std::string ebx_extended_feature[] = { + "fsgsbase", "", "sgx", "bmi1", + "hle", "avx2", "", "smep", + "bmi2", "erms", "invpcid", "rtm", + "pqm", "", "mpx", "pqe", + "avx512f", "avx512dq", "rdseed", "adx", + "smap", "avx512ifma", "pcommit", "clflushopt", + "clwb", "intelpt", "avx512pf", "avx512er", + "avx512cd", "sha", "avx512bw", "avx512vl" + }; + + for (int i=0; i<32; ++i) + printf ("%s", (a[1]>>i & 0x1) && !ebx_extended_feature[i].empty() ? (ebx_extended_feature[i]+" ").c_str() : ""); + + // Extended features in ECX register + std::string ecx_extended_feature[] = { + "prefetchwt1", "avx512vbmi", "umip", "pku", + "ospke", "waitpkg", "avx512vbmi2", "cetss", + "gfni", "vaes", "vpclmulqdq", "avx512vnni", + "avx512bitalg", "TMEEN", "avx512vpopcntdq", "", + "", "", "", "", + "", "", "rdpid", "keylocker", + "", "cldemote", "", "movdiri", + "movdir64b", "enqcmd", "sgx_lc", "pks" + }; + + for (int i=0; i<32; ++i) + printf ("%s", (a[2]>>i & 0x1) && !ecx_extended_feature[i].empty() ? 
(ecx_extended_feature[i]+" ").c_str() : ""); + + // Extended features in EDX register + std::string edx_extended_feature[] = { + "", "", "avx5124vnniw", "avx5124fmaps", + "fsrm", "", "", "", + "avx512vp2intersect", "SRBDS_CTRL", "md_clear", "", + "", "tsx_force_abort", "serialize", "hybrid", + "tsxldtrk", "", "pconfig", "lbr", + "cet_ibt", "", "amx-bf16", "avx512fp16", + "amx-tile", "amx-int8", "IBRS_IBPB", "stibp", + "L1D_FLUSH", "IA32_ARCH_CAPABILITIES", "IA32_CORE_CAPABILITIES", "ssbd" + }; + + for (int i=0; i<32; ++i) + printf ("%s", (a[3]>>i & 0x1) && !edx_extended_feature[i].empty() ? (edx_extended_feature[i]+" ").c_str() : ""); + + printf("\n"); + return 0; +} From 99f6d9cee9aa7e2ff9a76faa5a2695dc40895776 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Fri, 28 Jan 2022 15:55:30 +0100 Subject: [PATCH 154/174] System information for FreeBSD --- src/gsCore/gsSysInfo.cpp | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/src/gsCore/gsSysInfo.cpp b/src/gsCore/gsSysInfo.cpp index d3b188ecf3..8c075778a8 100644 --- a/src/gsCore/gsSysInfo.cpp +++ b/src/gsCore/gsSysInfo.cpp @@ -21,14 +21,13 @@ #elif __APPLE__ # include # include -#elif __linux__ +#elif __linux__ || __unix__ # include # if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__SUNCC_PRO)) # include # else # include # endif -#elif __unix__ #endif namespace gismo @@ -624,7 +623,7 @@ namespace gismo return CPUBrandString; } -#elif __linux__ +#elif __linux__ || __unix__ # if defined(__x86_64__) && ( defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__SUNCC_PRO)) char CPUBrandString[0x40]; @@ -662,10 +661,6 @@ namespace gismo # endif -#elif __unix__ - - // No generic implementation yet - #endif return "Unknown-CPU"; @@ -706,16 +701,12 @@ namespace gismo return (uint64_t)memsize; } -#elif __linux__ +#elif __linux__ || __unix__ long pages = sysconf(_SC_PHYS_PAGES); long 
page_size = sysconf(_SC_PAGE_SIZE); return (uint64_t)(pages * page_size); -#elif __unix__ - - // No generic implementation yet - #endif return 0; From 973909eedb616968c54de06a0cd7b3686057b78b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Sun, 30 Jan 2022 18:41:43 +0100 Subject: [PATCH 155/174] [ci skip] Updated OFA --- cmake/ofa/AddCXXCompilerFlag.cmake | 4 +-- cmake/ofa/ChecksArm.txt | 41 ++++++++++++++++-------------- cmake/ofa/HandleArmOptions.cmake | 3 ++- cmake/ofa/HandleX86Options.cmake | 4 +-- 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/cmake/ofa/AddCXXCompilerFlag.cmake b/cmake/ofa/AddCXXCompilerFlag.cmake index 3b41ddfa13..e0b1537c6d 100644 --- a/cmake/ofa/AddCXXCompilerFlag.cmake +++ b/cmake/ofa/AddCXXCompilerFlag.cmake @@ -130,7 +130,7 @@ macro(AddCXXCompilerFlag _flag) foreach(_header ${_headers}) set(_resultVar "HAVE_${_header}") string(REGEX REPLACE "[-.+/:= ]" "_" _resultVar "${_resultVar}") - check_include_file_cxx(${_header} ${_resultVar} "${_flag} ${_extra_flags}") + check_include_file_cxx(${_header} ${_resultVar} "${_flag}${_extra_flags}") if(NOT ${_resultVar}) set(_check_include_file_cxx FALSE) @@ -156,7 +156,7 @@ macro(AddCXXCompilerFlag _flag) endif() set(_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") - set(CMAKE_REQUIRED_FLAGS "${_flag} ${_extra_flags}") + set(CMAKE_REQUIRED_FLAGS "${_flag}${_extra_flags}") set(_resultVar "HAVE_${_flag}") string(REGEX REPLACE "[-.+/:= ]" "_" _resultVar "${_resultVar}") check_cxx_source_compiles("${_cxx_code}" ${_resultVar} diff --git a/cmake/ofa/ChecksArm.txt b/cmake/ofa/ChecksArm.txt index b900522e52..ecbaed6472 100644 --- a/cmake/ofa/ChecksArm.txt +++ b/cmake/ofa/ChecksArm.txt @@ -60,26 +60,25 @@ # pop_enable:SunPro aes;cstdlib;exit;0 -bf16;cstdlib;exit;0 +bf16,sve;arm_sve.h;svbfdot;svfloat32_t(),svbfloat16_t(),svbfloat16_t() crc;cstdlib;exit;0 -crypto;cstdlib;exit;0 -dotprod;cstdlib;exit;0 -dsp;cstdlib;exit;0 -f32mm;cstdlib;exit;0 -f64mm;cstdlib;exit;0 
+crypto;arm_neon.h;vaesdq_u8;uint8x16_t(), uint8x16_t() +dotprod;arm_neon.h;svdot;svint32_t(),svint8_t(),svint8_t() +dsp,sve;arm_sve.h;svqadd_z;svbool_t(),svint8_t(),svint8_t() +f32mm,sve;arm_sve.h;svmmla;svfloat32_t(),svfloat32_t(),svfloat32_t() +f64mm,sve;arm_sve.h;svmmla;svfloat64_t(),svfloat64_t(),svfloat64_t() flagm;cstdlib;exit;0 -fp;cstdlib;exit;0 -fp16;cstdlib;exit;0 -fp16fml;cstdlib;exit;0 +fp;arm_neon.h;vcvt_f16_f32;float32x4_t() +fp16;arm_neon.h;vabdq_f16;float16x8_t(),float16x8_t() +fp16fml;arm_neon.h;vfmlalq_high_f16;float32x4_t(),float16x8_t(),float16x8_t() fd_dp;cstdlib;exit;0 fp_sp;cstdlib;exit;0 -i8mm;cstdlib;exit;0 +i8mm,sve;arm_sve.h;svmmla;svint32_t(),svint8_t(),svint8_t() idiv;cstdlib;exit;0 lse;cstdlib;exit;0 memtag;cstdlib;exit;0 mve;cstdlib;exit;0 mve_fp;cstdlib;exit;0 -neon;arm_neon.h;vaddq_u32;uint32x4_t(),uint32x4_t() neon_fp16;cstdlib;exit;0 neon_vfpv4;cstdlib;exit;0 pauth;cstdlib;exit;0 @@ -94,15 +93,19 @@ sb;cstdlib;exit;0 sec;cstdlib;exit;0 sha2;cstdlib;exit;0 sha3;cstdlib;exit;0 -simd;cstdlib;exit;0 -sm4;cstdlib;exit;0 +simd;arm_neon.h;vaddq_u32;uint32x4_t(),uint32x4_t() ssbs;cstdlib;exit;0 -sve;cstdlib;exit;0 -sve2;cstdlib;exit;0 -sve2-aes;cstdlib;exit;0 -sve2-bitperm;cstdlib;exit;0 -sve2-sha3;cstdlib;exit;0 -sve2-sm4;cstdlib;exit;0 + +# SVE +sve;arm_sve.h;svwhilelt_b64;0,1 + +# SVE2 +sve2;arm_sve.h;svaba;svint8_t(),svint8_t(),svint8_t() +sve2-aes;arm_sve.h;svaesd;svuint8_t(),svuint8_t() +sve2-bitperm;arm_sve.h;svbdep;svuint8_t(),svuint8_t() +sve2-sha3;arm_sve.h;svrax1;svint64_t(),svint64_t() +sve2-sm4;arm_sve.h;svsm4e;svuint32_t(),svuint32_t() + tme;cstdlib;exit;0 vfpv3;cstdlib;exit;0 vfpv3_d16;cstdlib;exit;0 diff --git a/cmake/ofa/HandleArmOptions.cmake b/cmake/ofa/HandleArmOptions.cmake index cd272082fa..ed3d0e262e 100644 --- a/cmake/ofa/HandleArmOptions.cmake +++ b/cmake/ofa/HandleArmOptions.cmake @@ -868,8 +868,8 @@ macro(OFA_HandleArmOptions) # Convert list of extensions into compiler flags string(REPLACE "," ";" 
_check_extension_flags "${_check_extension_flags}") list(GET _check_extension_flags 0 _extension_flag) - string(REPLACE ";" "+" _check_flags "${_check_extension_flags}") list(APPEND _check_extension_flag_list "${_extension_flag}") + string(REPLACE ";" "+" _check_extra_flags "+${_check_extension_flags}") # Extract optional extension alias list(LENGTH _check _len) @@ -899,6 +899,7 @@ macro(OFA_HandleArmOptions) # flag and can compile the provided test code with it set(_code "\nint main() { ${_check_function}(${_check_params})\; return 0\; }") AddCXXCompilerFlag("${_march_flag}${_march}+${_extension_flag}" + EXTRA_FLAGS ${_check_extra_flags} HEADERS ${_check_headers} CODE "${_code}" RESULT _ok) diff --git a/cmake/ofa/HandleX86Options.cmake b/cmake/ofa/HandleX86Options.cmake index f42636ba5c..fd6d0ad87f 100644 --- a/cmake/ofa/HandleX86Options.cmake +++ b/cmake/ofa/HandleX86Options.cmake @@ -446,8 +446,8 @@ macro(OFA_HandleX86Options) # Convert list of extensions into compiler flags string(REPLACE "," ";" _check_extension_flags "${_check_extension_flags}") list(GET _check_extension_flags 0 _extension_flag) - string(REPLACE ";" " ${_enable_flag}" _check_flags "${_enable_flag}${_check_extension_flags}") list(APPEND _check_extension_flag_list "${_extension_flag}") + string(REPLACE ";" " ${_enable_flag}" _check_extra_flags " ${_enable_flag}${_check_extension_flags}") # Extract optional extension alias list(LENGTH _check _len) @@ -477,7 +477,7 @@ macro(OFA_HandleX86Options) # flag and can compile the provided test code with it set(_code "\nint main() { ${_check_function}(${_check_params})\; return 0\; }") AddCXXCompilerFlag("${_enable_flag}${_extension_flag}" - EXTRA_FLAGS ${_check_flags} + EXTRA_FLAGS ${_check_extra_flags} HEADERS ${_check_headers} CODE "${_code}" RESULT _ok) From 29a58ca3c6571afdb1ae06e76ae3f6abb8d6a695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Mon, 31 Jan 2022 15:02:24 +0100 Subject: [PATCH 156/174] [ci skip] Updated OFA --- 
cmake/ofa/cpuinfo_x86.cxx | 255 ++++++++++++++++++++++++-------------- 1 file changed, 165 insertions(+), 90 deletions(-) diff --git a/cmake/ofa/cpuinfo_x86.cxx b/cmake/ofa/cpuinfo_x86.cxx index 506b6286c9..0f24de3330 100644 --- a/cmake/ofa/cpuinfo_x86.cxx +++ b/cmake/ofa/cpuinfo_x86.cxx @@ -2,12 +2,18 @@ #include #include -int main(){ - int a[4]; - for(int i=0; i<4; ++i) +#define print_features(reg,features,n) \ + for (int i=0; i>i & 0x1) && !features[i].empty() \ + ? (features[i]+" ").c_str() : ""); + +// Get the vendor ID +void getVendorID() { + int32_t a[3]; + for(int i=0; i<3; ++i) a[i] = 0; - // EAX=0: Highest Function Parameter and Manufacturer ID + // EAX=0: Vendor ID __asm__("mov $0x0, %eax\n\t"); __asm__("cpuid\n\t"); __asm__("mov %%ebx, %0\n\t":"=r" (a[0])); @@ -20,111 +26,180 @@ int main(){ memcpy(&vendorID[8],&a[2],4); printf ("vendor_id : %s\n", vendorID); +} - // EAX=1: Processor Info and Feature Bits +// Get processor information +void getProcInfo() { + int32_t eax = 0; + + // EAX=1: Processor Info __asm__("mov $0x1 , %eax\n\t"); __asm__("cpuid\n\t"); - __asm__("mov %%eax, %0\n\t":"=r" (a[0])); //gives model and family - __asm__("mov %%ebx, %0\n\t":"=r" (a[1])); //gives additional feature info - __asm__("mov %%ecx, %0\n\t":"=r" (a[2])); //feature flags - __asm__("mov %%edx, %0\n\t":"=r" (a[3])); //feature flags + __asm__("mov %%eax, %0\n\t":"=r" (eax)); //gives model and family - int stepping = a[0]>>0 & 0xF; - int model = a[0]>>4 & 0xF; - int family = a[0]>>8 & 0xF; + int32_t stepping = eax>>0 & 0xF; + int32_t model = eax>>4 & 0xF; + int32_t family = eax>>8 & 0xF; if(family == 6 || family == 15) - model += (a[0]>>16 & 0xF)<<4; + model += (eax>>16 & 0xF)<<4; printf ("cpu family : %d\n", family); printf ("model : %d\n", model); printf ("stepping : %d\n", stepping); +} + +// Get processor features +void getFeatures() { + int32_t a[3], eax,ebx,ecx,edx; + for(int i=0; i<3; ++i) + a[i] = 0; // CPU flags printf ("flags : "); - - // Features in EDX 
register - std::string edx_feature[] = { - "fpu", "vme", "de", "pse", - "tsc", "msr", "pae", "mce", - "cx8", "apic" , "", "sep", - "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "psn", "clflush", - "", "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", - "htt", "tm", "ia64", "pbe" }; - - for (int i=0; i<32; ++i) - printf ("%s", (a[3]>>i & 0x1) && !edx_feature[i].empty() ? (edx_feature[i]+" ").c_str() : ""); - // Features in ECX register - std::string ecx_feature[] = { - "sse3", "pclmulqdq", "dtes64", "monitor", - "ds-cpl", "vmx", "smx", "est", - "tm2", "ssse3", "cnxt-id", "sdbg", - "fma", "cx16", "xtpr", "pdcm", - "", "pcid", "dca", "sse4_1", - "sse4_2", "x2apic", "movbe", "popcnt", - "tsc-deadline", "aes", "xsave", "osxsave", - "avx", "f16c", "rdrnd", "hypervisor" - }; - - for (int i=0; i<32; ++i) - printf ("%s", (a[2]>>i & 0x1) && !ecx_feature[i].empty() ? (ecx_feature[i]+" ").c_str() : ""); - - // EAX=7: Extended Features - __asm__("mov $0x7 , %eax\n\t"); - __asm__("mov $0x0 , %ecx\n\t"); + // EAX=0: Vendor ID + __asm__("mov $0x0, %eax\n\t"); __asm__("cpuid\n\t"); - __asm__("mov %%eax, %0\n\t":"=r" (a[0])); //gives maximum ECX value - __asm__("mov %%ebx, %0\n\t":"=r" (a[1])); //extended feature flags - __asm__("mov %%ecx, %0\n\t":"=r" (a[2])); //extended feature flags - __asm__("mov %%edx, %0\n\t":"=r" (a[3])); //extended feature flags - - // Extended features in EBX register - std::string ebx_extended_feature[] = { - "fsgsbase", "", "sgx", "bmi1", - "hle", "avx2", "", "smep", - "bmi2", "erms", "invpcid", "rtm", - "pqm", "", "mpx", "pqe", - "avx512f", "avx512dq", "rdseed", "adx", - "smap", "avx512ifma", "pcommit", "clflushopt", - "clwb", "intelpt", "avx512pf", "avx512er", - "avx512cd", "sha", "avx512bw", "avx512vl" - }; + __asm__("mov %%eax, %0\n\t":"=r" (eax)); + __asm__("mov %%ebx, %0\n\t":"=r" (a[0])); + __asm__("mov %%edx, %0\n\t":"=r" (a[1])); + __asm__("mov %%ecx, %0\n\t":"=r" (a[2])); + + char vendorID[13]; vendorID[12] = 0; + 
memcpy(&vendorID[0],&a[0],4); + memcpy(&vendorID[4],&a[1],4); + memcpy(&vendorID[8],&a[2],4); - for (int i=0; i<32; ++i) - printf ("%s", (a[1]>>i & 0x1) && !ebx_extended_feature[i].empty() ? (ebx_extended_feature[i]+" ").c_str() : ""); + if (strcmp(vendorID, "GenuineIntel") == 0) { - // Extended features in ECX register - std::string ecx_extended_feature[] = { - "prefetchwt1", "avx512vbmi", "umip", "pku", - "ospke", "waitpkg", "avx512vbmi2", "cetss", - "gfni", "vaes", "vpclmulqdq", "avx512vnni", - "avx512bitalg", "TMEEN", "avx512vpopcntdq", "", - "", "", "", "", - "", "", "rdpid", "keylocker", - "", "cldemote", "", "movdiri", - "movdir64b", "enqcmd", "sgx_lc", "pks" - }; + if (eax >= 1) { + + // EAX=1: Processor Info and Feature Bits + __asm__("mov $0x1 , %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //feature flags + + // Features in EDX register + { + std::string features[] = { + "fpu", "vme", "de", "pse", + "tsc", "msr", "pae", "mce", + "cx8", "apic" , "", "sep", + "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "psn", "clflush", + "", "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", + "htt", "tm", "ia64", "pbe" }; + print_features(edx, features, 32); + } + + // Features in ECX register + { + std::string features[] = { + "sse3", "pclmulqdq", "dtes64", "monitor", + "ds-cpl", "vmx", "smx", "est", + "tm2", "ssse3", "cnxt-id", "sdbg", + "fma", "cx16", "xtpr", "pdcm", + "", "pcid", "dca", "sse4_1", + "sse4_2", "x2apic", "movbe", "popcnt", + "tsc-deadline", "aes", "xsave", "osxsave", + "avx", "f16c", "rdrnd", "hypervisor" + }; + print_features(ecx, features, 32); + } + } + + if (eax >=7) { + // EAX=7, ECX=0: Extended Features + __asm__("mov $0x7 , %eax\n\t"); + __asm__("mov $0x0 , %ecx\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (eax)); //gives maximum ECX value + __asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags + __asm__("mov %%ecx, 
%0\n\t":"=r" (ecx)); //extended feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags + + // Extended features in EBX register + { + std::string features[] = { + "fsgsbase", "", "sgx", "bmi1", + "hle", "avx2", "", "smep", + "bmi2", "erms", "invpcid", "rtm", + "pqm", "", "mpx", "pqe", + "avx512f", "avx512dq", "rdseed", "adx", + "smap", "avx512ifma", "pcommit", "clflushopt", + "clwb", "intelpt", "avx512pf", "avx512er", + "avx512cd", "sha", "avx512bw", "avx512vl" + }; + print_features(ebx, features, 32); + } + + // Extended features in ECX register + { + std::string features[] = { + "prefetchwt1", "avx512vbmi", "umip", "pku", + "ospke", "waitpkg", "avx512vbmi2", "cetss", + "gfni", "vaes", "vpclmulqdq", "avx512vnni", + "avx512bitalg", "TMEEN", "avx512vpopcntdq", "", + "", "", "", "", + "", "", "rdpid", "keylocker", + "", "cldemote", "", "movdiri", + "movdir64b", "enqcmd", "sgx_lc", "pks" + }; + print_features(ecx, features, 32); + } + + // Extended features in EDX register + { + std::string features[] = { + "", "", "avx5124vnniw", "avx5124fmaps", + "fsrm", "", "", "", + "avx512vp2intersect", "SRBDS_CTRL", "md_clear", "", + "", "tsx_force_abort", "serialize", "hybrid", + "tsxldtrk", "", "pconfig", "lbr", + "cet_ibt", "", "amx-bf16", "avx512fp16", + "amx-tile", "amx-int8", "IBRS_IBPB", "stibp", + "L1D_FLUSH", "IA32_ARCH_CAPABILITIES", "IA32_CORE_CAPABILITIES", "ssbd" + }; + print_features(edx, features, 32); + } - for (int i=0; i<32; ++i) - printf ("%s", (a[2]>>i & 0x1) && !ecx_extended_feature[i].empty() ? 
(ecx_extended_feature[i]+" ").c_str() : ""); + if (eax >= 1) { - // Extended features in EDX register - std::string edx_extended_feature[] = { - "", "", "avx5124vnniw", "avx5124fmaps", - "fsrm", "", "", "", - "avx512vp2intersect", "SRBDS_CTRL", "md_clear", "", - "", "tsx_force_abort", "serialize", "hybrid", - "tsxldtrk", "", "pconfig", "lbr", - "cet_ibt", "", "amx-bf16", "avx512fp16", - "amx-tile", "amx-int8", "IBRS_IBPB", "stibp", - "L1D_FLUSH", "IA32_ARCH_CAPABILITIES", "IA32_CORE_CAPABILITIES", "ssbd" - }; + // Extended features in EAX register + { + std::string features[] = { + "", "", "", "", + "", "avx512bf16", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", "" + }; + print_features(eax, features, 32); + } + } + } + } + else if (strcmp(vendorID, "AuthenticAMD") == 0) { - for (int i=0; i<32; ++i) - printf ("%s", (a[3]>>i & 0x1) && !edx_extended_feature[i].empty() ? (edx_extended_feature[i]+" ").c_str() : ""); + // // EAX=7, ECX=1: Extended Features + // __asm__("mov $0x7 , %eax\n\t"); + // __asm__("mov $0x1 , %ecx\n\t"); + // __asm__("cpuid\n\t"); + // __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags + + } printf("\n"); - return 0; +} + +int main(){ + getVendorID(); + getProcInfo(); + getFeatures(); + return 0; } From 03a299ecde6e8ed3ad9e50287b6bcc0b61d3c556 Mon Sep 17 00:00:00 2001 From: Matthias Moeller Date: Tue, 1 Feb 2022 12:41:20 +0100 Subject: [PATCH 157/174] [ci skip] Update OFA - x86 completed --- cmake/ofa/cpuinfo_x86.cxx | 522 +++++++++++++++++++++++++++++--------- 1 file changed, 399 insertions(+), 123 deletions(-) diff --git a/cmake/ofa/cpuinfo_x86.cxx b/cmake/ofa/cpuinfo_x86.cxx index 0f24de3330..a285d09818 100644 --- a/cmake/ofa/cpuinfo_x86.cxx +++ b/cmake/ofa/cpuinfo_x86.cxx @@ -2,9 +2,9 @@ #include #include -#define print_features(reg,features,n) \ - for (int i=0; i>i & 0x1) && !features[i].empty() \ +#define print_features(reg,features,n) \ + for (int i=0; i>i & 
0x1) && !features[i].empty() \ ? (features[i]+" ").c_str() : ""); // Get the vendor ID @@ -50,149 +50,425 @@ void getProcInfo() { // Get processor features void getFeatures() { - int32_t a[3], eax,ebx,ecx,edx; - for(int i=0; i<3; ++i) - a[i] = 0; + int32_t eax_max,ecx_max,eax,ebx,ecx,edx; // CPU flags printf ("flags : "); - // EAX=0: Vendor ID + // EAX=0: largest value that EAX can be set to before calling CPUID __asm__("mov $0x0, %eax\n\t"); __asm__("cpuid\n\t"); - __asm__("mov %%eax, %0\n\t":"=r" (eax)); - __asm__("mov %%ebx, %0\n\t":"=r" (a[0])); - __asm__("mov %%edx, %0\n\t":"=r" (a[1])); - __asm__("mov %%ecx, %0\n\t":"=r" (a[2])); + __asm__("mov %%eax, %0\n\t":"=r" (eax_max)); - char vendorID[13]; vendorID[12] = 0; - memcpy(&vendorID[0],&a[0],4); - memcpy(&vendorID[4],&a[1],4); - memcpy(&vendorID[8],&a[2],4); + printf("\neax_max:%d\n", eax_max); - if (strcmp(vendorID, "GenuineIntel") == 0) { - - if (eax >= 1) { + if (eax_max >= 1) { - // EAX=1: Processor Info and Feature Bits - __asm__("mov $0x1 , %eax\n\t"); - __asm__("cpuid\n\t"); - __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //feature flags - __asm__("mov %%edx, %0\n\t":"=r" (edx)); //feature flags + // EAX=1: Processor Info and Feature Bits + __asm__("mov $0x1 , %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //feature flags - // Features in EDX register - { - std::string features[] = { - "fpu", "vme", "de", "pse", - "tsc", "msr", "pae", "mce", - "cx8", "apic" , "", "sep", - "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "psn", "clflush", - "", "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", - "htt", "tm", "ia64", "pbe" }; - print_features(edx, features, 32); - } + // Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 + { + std::string features[] = { "fpu", /* Onboard FPU */ + "vme", /* Virtual Mode Extensions */ + "de", /* Debugging Extensions */ + "pse", /* Page Size Extensions */ + "tsc", /* Time 
Stamp Counter */ + "msr", /* Model-Specific Registers */ + "pae", /* Physical Address Extensions */ + "mce", /* Machine Check Exception */ + "cx8", /* CMPXCHG8 instruction */ + "apic", /* Onboard APIC */ + "", /* Reserved */ + "sep", /* SYSENTER/SYSEXIT */ + "mtrr", /* Memory Type Range Registers */ + "pge", /* Page Global Enable */ + "mca", /* Machine Check Architecture */ + "cmov", /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ + "pat", /* Page Attribute Table */ + "pse36", /* 36-bit PSEs */ + "pn", /* Processor serial number */ + "clflush", /* CLFLUSH instruction */ + "", /* Reserved */ + "dts", /* "dts" Debug Store */ + "acpi", /* ACPI via MSR */ + "mmx", /* Multimedia Extensions */ + "fxsr", /* FXSAVE/FXRSTOR, CR4.OSFXSR */ + "sse", /* "sse" */ + "sse2", /* "sse2" */ + "ss", /* "ss" CPU self snoop */ + "ht", /* Hyper-Threading */ + "tm", /* "tm" Automatic clock control */ + "ia64", /* IA-64 processor */ + "pbe" /* Pending Break Enable */ + }; + printf("INTEL 0x00000001 (EDX)"); + print_features(edx, features, 32); + } + + // Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 + { + std::string features[] = { "sse3", /* "pni" SSE-3 */ + "pclmulqdq", /* PCLMULQDQ instruction */ + "dtes64", /* 64-bit Debug Store */ + "monitor", /* "monitor" MONITOR/MWAIT support */ + "ds_cpl", /* "ds_cpl" CPL-qualified (filtered) Debug Store */ + "vmx", /* Hardware virtualization */ + "smx", /* Safer Mode eXtensions */ + "est", /* Enhanced SpeedStep */ + "tm2", /* Thermal Monitor 2 */ + "ssse3", /* Supplemental SSE-3 */ + "cid", /* Context ID */ + "sdbg", /* Silicon Debug */ + "fma", /* Fused multiply-add */ + "cx16", /* CMPXCHG16B instruction */ + "xtpr", /* Send Task Priority Messages */ + "pdcm", /* Perf/Debug Capabilities MSR */ + "", /* Reserved */ + "pcid", /* Process Context Identifiers */ + "dca", /* Direct Cache Access */ + "sse4_1", /* "sse4_1" SSE-4.1 */ + "sse4_2", /* "sse4_2" SSE-4.2 */ + "x2apic", /* X2APIC */ + "movbe", /* MOVBE instruction */ 
+ "popcnt", /* POPCNT instruction */ + "tsc_deadline_timer", /* TSC deadline timer */ + "aes", /* AES instructions */ + "xsave", /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */ + "", /* "" XSAVE instruction enabled in the OS */ + "avx", /* Advanced Vector Extensions */ + "f16c", /* 16-bit FP conversions */ + "rdrand", /* RDRAND instruction */ + "hypervisor" /* Running on a hypervisor */ + }; + printf("INTEL 0x00000001 (ECX)"); + print_features(ecx, features, 32); + } + } // EAX=1 + + if (eax_max >= 7) { + // EAX=7, ECX=0: Extended Features + __asm__("mov $0x7 , %eax\n\t"); + __asm__("mov $0x0 , %ecx\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (ecx_max)); //gives maximum ECX value + __asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags + __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //extended feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags - // Features in ECX register - { - std::string features[] = { - "sse3", "pclmulqdq", "dtes64", "monitor", - "ds-cpl", "vmx", "smx", "est", - "tm2", "ssse3", "cnxt-id", "sdbg", - "fma", "cx16", "xtpr", "pdcm", - "", "pcid", "dca", "sse4_1", - "sse4_2", "x2apic", "movbe", "popcnt", - "tsc-deadline", "aes", "xsave", "osxsave", - "avx", "f16c", "rdrnd", "hypervisor" - }; - print_features(ecx, features, 32); - } + // Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 + { + std::string features[] = { "fsgsbase", /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ + "tsc_adjust", /* TSC adjustment MSR 0x3B */ + "sgx", /* Software Guard Extensions */ + "bmi1", /* 1st group bit manipulation extensions */ + "hle", /* Hardware Lock Elision */ + "avx2", /* AVX2 instructions */ + "", /* "" FPU data pointer updated only on x87 exceptions */ + "smep", /* Supervisor Mode Execution Protection */ + "bmi2", /* 2nd group bit manipulation extensions */ + "erms", /* Enhanced REP MOVSB/STOSB instructions */ + "invpcid", /* Invalidate Processor Context ID */ + "rtm", /* 
Restricted Transactional Memory */ + "cqm", /* Cache QoS Monitoring */ + "", /* "" Zero out FPU CS and FPU DS */ + "mpx", /* Memory Protection Extension */ + "rdt_a", /* Resource Director Technology Allocation */ + "avx512f", /* AVX-512 Foundation */ + "avx512dq", /* AVX-512 DQ (Double/Quad granular) Instructions */ + "rdseed", /* RDSEED instruction */ + "adx", /* ADCX and ADOX instructions */ + "smap", /* Supervisor Mode Access Prevention */ + "avx512ifma", /* AVX-512 Integer Fused Multiply-Add instructions */ + "pcommit", + "clflushopt", /* CLFLUSHOPT instruction */ + "clwb", /* CLWB instruction */ + "intel_pt", /* Intel Processor Trace */ + "avx512pf", /* AVX-512 Prefetch */ + "avx512er", /* AVX-512 Exponential and Reciprocal */ + "avx512cd", /* AVX-512 Conflict Detection */ + "sha_ni", /* SHA1/SHA256 Instruction Extensions */ + "avx512bw", /* AVX-512 BW (Byte/Word granular) Instructions */ + "avx512vl" /* AVX-512 VL (128/256 Vector Length) Extensions */ + }; + printf("INTEL 0x00000007:0 (EBX)"); + print_features(ebx, features, 32); } + + // Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 + { + std::string features[] = { "prefetchwt1", + "avx512vbmi", /* AVX512 Vector Bit Manipulation instructions*/ + "umip", /* User Mode Instruction Protection */ + "pku", /* Protection Keys for Userspace */ + "ospke", /* OS Protection Keys Enable */ + "waitpkg", /* UMONITOR/UMWAIT/TPAUSE Instructions */ + "avx512vbmi2", /* Additional AVX512 Vector Bit Manipulation Instructions */ + "cetss", + "gfni", /* Galois Field New Instructions */ + "vaes", /* Vector AES */ + "vpclmulqdq", /* Carry-Less Multiplication Double Quadword */ + "avx512vnni", /* Vector Neural Network Instructions */ + "avx512bitalg", /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ + "tme", /* Intel Total Memory Encryption */ + "avx512vpopcntdq", /* POPCNT for vectors of DW/QW */ + "", /* Reserved */ + "la57", /* 5-level page tables */ + "", /* Reserved */ + "", /* Reserved */ + 
"", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "rdpid", /* RDPID instruction */ + "keylocker", + "bus_lock_detect", /* Bus Lock detect */ + "cldemote", /* CLDEMOTE instruction */ + "", /* Reserved */ + "movdiri", /* MOVDIRI instruction */ + "movdir64b", /* MOVDIR64B instruction */ + "enqcmd", /* ENQCMD and ENQCMDS instructions */ + "sgx_lc", /* Software Guard Extensions Launch Control */ + "pks" + }; + printf("INTEL 0x00000007:0 (ECX)"); + print_features(ecx, features, 32); + } + + // Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 + { + std::string features[] = { "", /* Reserved */ + "", /* Reserved */ + "avx5124vnniw", /* AVX-512 Neural Network Instructions */ + "avx5124fmaps", /* AVX-512 Multiply Accumulation Single precision */ + "fsrm", /* Fast Short Rep Mov */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "avx512vp2intersect", /* AVX-512 Intersect for D/Q */ + "srbds", /* "" SRBDS mitigation MSR available */ + "md_clear", /* VERW clears CPU buffers */ + "", /* "" RTM transaction always aborts */ + "", /* Reserved */ + "", /* "" TSX_FORCE_ABORT */ + "serialize", /* SERIALIZE instruction */ + "", /* "" This part has CPUs of more than one type */ + "tsxldtrk", /* TSX Suspend Load Address Tracking */ + "", /* Reserved */ + "pconfig", /* Intel PCONFIG */ + "arch_lbr", /* Intel ARCH LBR */ + "cet_ibt", + "", /* Reserved */ + "amx-bf16", /* AMX BFLOAT16 Support */ + "avx512fp16", /* AVX512 FP16 */ + "amx-tile", /* AMX tile Support */ + "amx-int8", /* AMX int8 Support */ + "ibrs ibpb", /* "" Speculation Control (IBRS + IBPB) */ + "stibp", /* "" Single Thread Indirect Branch Predictors */ + "flush_l1d", /* Flush L1D cache */ + "arch_capabilities", /* IA32_ARCH_CAPABILITIES MSR (Intel) */ + "", /* "" IA32_CORE_CAPABILITIES MSR */ + "ssbd" /* "" Speculative Store Bypass Disable */ + }; + printf("INTEL 0x00000007:0 (EDX)"); + print_features(edx, features, 32); + } + + printf("\necx_max:%d\n", ecx_max); - if (eax >=7) 
{ - // EAX=7, ECX=0: Extended Features + if (ecx_max >= 1) { + // EAX=7, ECX=1: Extended Features __asm__("mov $0x7 , %eax\n\t"); - __asm__("mov $0x0 , %ecx\n\t"); + __asm__("mov $0x1 , %ecx\n\t"); __asm__("cpuid\n\t"); - __asm__("mov %%eax, %0\n\t":"=r" (eax)); //gives maximum ECX value - __asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags - __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //extended feature flags - __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags + __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags - // Extended features in EBX register + // Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 { - std::string features[] = { - "fsgsbase", "", "sgx", "bmi1", - "hle", "avx2", "", "smep", - "bmi2", "erms", "invpcid", "rtm", - "pqm", "", "mpx", "pqe", - "avx512f", "avx512dq", "rdseed", "adx", - "smap", "avx512ifma", "pcommit", "clflushopt", - "clwb", "intelpt", "avx512pf", "avx512er", - "avx512cd", "sha", "avx512bw", "avx512vl" + std::string features[] = { "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "avx_vnni", /* AVX VNNI instructions */ + "avx512bf16", /* AVX512 BFLOAT16 instructions */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "" /* Reserved */ }; - print_features(ebx, features, 32); - } - - // Extended features in ECX register - { - std::string features[] = { - "prefetchwt1", "avx512vbmi", "umip", "pku", - "ospke", "waitpkg", "avx512vbmi2", "cetss", - "gfni", "vaes", "vpclmulqdq", 
"avx512vnni", - "avx512bitalg", "TMEEN", "avx512vpopcntdq", "", - "", "", "", "", - "", "", "rdpid", "keylocker", - "", "cldemote", "", "movdiri", - "movdir64b", "enqcmd", "sgx_lc", "pks" - }; - print_features(ecx, features, 32); - } - - // Extended features in EDX register - { - std::string features[] = { - "", "", "avx5124vnniw", "avx5124fmaps", - "fsrm", "", "", "", - "avx512vp2intersect", "SRBDS_CTRL", "md_clear", "", - "", "tsx_force_abort", "serialize", "hybrid", - "tsxldtrk", "", "pconfig", "lbr", - "cet_ibt", "", "amx-bf16", "avx512fp16", - "amx-tile", "amx-int8", "IBRS_IBPB", "stibp", - "L1D_FLUSH", "IA32_ARCH_CAPABILITIES", "IA32_CORE_CAPABILITIES", "ssbd" - }; - print_features(edx, features, 32); - } + printf("INTEL 0x00000007:1 (EAX)"); + print_features(eax, features, 32); + } + } // ECX=1 + } // EAX=7 - if (eax >= 1) { + // EAX=0: largest value that EAX can be set to before calling CPUID + __asm__("mov $0x80000000, %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (eax_max)); - // Extended features in EAX register - { - std::string features[] = { - "", "", "", "", - "", "avx512bf16", "", "", - "", "", "", "", - "", "", "", "", - "", "", "", "", - "", "", "", "", - "", "", "", "", - "", "", "", "" - }; - print_features(eax, features, 32); - } - } + printf("\neax_max:%d\n", eax_max); + + if (eax_max >= 1) { + + // EAX=80000001h: Processor Info and Feature Bits + __asm__("mov $0x80000001 , %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //feature flags + + // AMD-defined CPU features, CPUID level 0x80000001 (EDX), word 1 + // Don't duplicate feature flags which are redundant with Intel! 
+ { + std::string features[] = { "", /* Onboard FPU */ + "", /* Virtual Mode Extensions */ + "", /* Debugging Extensions */ + "", /* Page Size Extensions */ + "", /* Time Stamp Counter */ + "", /* Model-Specific Registers */ + "", /* Physical Address Extensions */ + "", /* Machine Check Exception */ + "", /* CMPXCHG8 instruction */ + "", /* Onboard APIC */ + "", /* Reserved */ + "syscall", /* SYSCALL/SYSRET */ + "", /* Memory Type Range Registers */ + "", /* Page Global Enable */ + "", /* Machine Check Architecture */ + "", /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ + "", /* Page Attribute Table */ + "", /* 36-bit PSEs */ + "", /* Reserved */ + "mp", /* MP Capable */ + "nx", /* Execute Disable */ + "", /* Reserved */ + "mmxext", /* AMD MMX extensions */ + "", /* Multimedia Extensions */ + "", /* FXSAVE/FXRSTOR, CR4.OSFXSR */ + "fxsr_opt", /* FXSAVE/FXRSTOR optimizations */ + "pdpe1gb", /* "pdpe1gb" GB pages */ + "rdtscp", /* RDTSCP */ + "", /* Reserved */ + "lm", /* Long Mode (x86-64, 64-bit support) */ + "3dnowext", /* AMD 3DNow extensions */ + "3dnow" /* 3DNow */ + }; + print_features(edx, features, 32); } - } - else if (strcmp(vendorID, "AuthenticAMD") == 0) { + + // AMD-defined CPU features, CPUID level 0x80000001 (ECX), word 6 + { + std::string features[] = { "lahf_lm", /* LAHF/SAHF in long mode */ + "cmp_legacy", /* If yes HyperThreading not valid */ + "svm", /* Secure Virtual Machine */ + "extapic", /* Extended APIC space */ + "cr8_legacy", /* CR8 in 32-bit mode */ + "abm", /* Advanced bit manipulation */ + "sse4a", /* SSE-4A */ + "misalignsse", /* Misaligned SSE mode */ + "3dnowprefetch", /* 3DNow prefetch instructions */ + "osvw", /* OS Visible Workaround */ + "ibs", /* Instruction Based Sampling */ + "xop", /* extended AVX instructions */ + "skinit", /* SKINIT/STGI instructions */ + "wdt", /* Watchdog timer */ + "", /* Reserved */ + "lwp", /* Light Weight Profiling */ + "fma4", /* 4 operands MAC instructions */ + "tce", /* Translation Cache 
Extension */ + "", /* Reserved */ + "nodeid_msr", /* NodeId MSR */ + "", /* Reserved */ + "tbm", /* Trailing Bit Manipulations */ + "topoext", /* Topology extensions CPUID leafs */ + "perfctr_core", /* Core performance counter extensions */ + "perfctr_nb", /* NB performance counter extensions */ + "", /* Reserved */ + "bpext", /* Data breakpoint extension */ + "ptsc", /* Performance time-stamp counter */ + "perfctr_llc", /* Last Level Cache performance counter extensions */ + "mwaitx", /* MWAIT extension (MONITORX/MWAITX instructions) */ + "", /* Reserved */ + "" /* Reserved */ + + }; + print_features(ecx, features, 32); + } + } // EAX=1 - // // EAX=7, ECX=1: Extended Features - // __asm__("mov $0x7 , %eax\n\t"); - // __asm__("mov $0x1 , %ecx\n\t"); - // __asm__("cpuid\n\t"); - // __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags + if (eax_max >=7) { + // EAX=7, ECX=0: Extended Features + __asm__("mov $0x7 , %eax\n\t"); + __asm__("mov $0x0 , %ecx\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (ecx_max)); //gives maximum ECX value + __asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags + __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //extended feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags - } + //THIS IS NOT YET WORKING + + { + std::string features[] = { "cxmmx", /* Cyrix MMX extensions */ + "k6_mtrr", /* AMD K6 nonstandard MTRRs */ + "cyrix_arr", /* Cyrix ARRs (= MTRRs) */ + "centaur_mcr", /* Centaur MCRs (= MTRRs) */ + "k8", /* "" Opteron, Athlon64 */ + "", /* "" Athlon */ + "", /* "" P3 */ + "", /* "" P4 */ + "constant_tsc", /* TSC ticks at a constant rate */ + "up", /* SMP kernel running on UP */ + "art", /* Always running timer (ART) */ + "arch_perfmon", /* Intel Architectural PerfMon */ + "pebs", /* Precise-Event Based Sampling */ + "bts", /* Branch Trace Store */ + "", /* "" syscall in IA32 userspace */ + "", /* "" sysenter in IA32 userspace */ + "rep_good", /* REP microcode works 
well */ + "", /* Reserved */ + "", /* "" LFENCE synchronizes RDTSC */ + "acc_power", /* AMD Accumulated Power Mechanism */ + "nopl", /* The NOPL (0F 1F) instructions */ + "", /* "" Always-present feature */ + "xtopology", /* CPU topology enum extensions */ + "tsc_reliable", /* TSC is known to be reliable */ + "nonstop_tsc", /* TSC does not stop in C states */ + "cpuid", /* CPU has CPUID instruction itself */ + "extd_apicid", /* Extended APICID (8 bits) */ + "amd_dcm", /* AMD multi-node processor */ + "aperfmperf", /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ + "rapl", /* AMD/Hygon RAPL interface */ + "nonstop_tsc_s3", /* TSC doesn't stop in S3 state */ + "tsc_known_freq" /* TSC has known frequency */ + }; + //print_features(eax, features, 32); + } + } // EAX=7 printf("\n"); } From 10f9be6ad333fbbb422aa73276ab99294b36f8bf Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Wed, 2 Feb 2022 10:33:20 +0100 Subject: [PATCH 158/174] [ci skip] Update OFA - x86 completed --- cmake/ofa/cpuinfo_x86.cxx | 440 +++++++++++++++++++++++++++++--------- 1 file changed, 335 insertions(+), 105 deletions(-) diff --git a/cmake/ofa/cpuinfo_x86.cxx b/cmake/ofa/cpuinfo_x86.cxx index a285d09818..b4bd46f2c3 100644 --- a/cmake/ofa/cpuinfo_x86.cxx +++ b/cmake/ofa/cpuinfo_x86.cxx @@ -9,12 +9,12 @@ // Get the vendor ID void getVendorID() { - int32_t a[3]; + int a[3]; for(int i=0; i<3; ++i) a[i] = 0; - - // EAX=0: Vendor ID - __asm__("mov $0x0, %eax\n\t"); + + // EAX=0x00000000: Vendor ID + __asm__("mov $0x00000000, %eax\n\t"); __asm__("cpuid\n\t"); __asm__("mov %%ebx, %0\n\t":"=r" (a[0])); __asm__("mov %%edx, %0\n\t":"=r" (a[1])); @@ -24,22 +24,22 @@ void getVendorID() { memcpy(&vendorID[0],&a[0],4); memcpy(&vendorID[4],&a[1],4); memcpy(&vendorID[8],&a[2],4); - + printf ("vendor_id : %s\n", vendorID); } // Get processor information void getProcInfo() { - int32_t eax = 0; + int eax = 0; - // EAX=1: Processor Info - __asm__("mov $0x1 , %eax\n\t"); + // 
EAX=0x00000001: Processor Info + __asm__("mov $0x00000001 , %eax\n\t"); __asm__("cpuid\n\t"); __asm__("mov %%eax, %0\n\t":"=r" (eax)); //gives model and family - int32_t stepping = eax>>0 & 0xF; - int32_t model = eax>>4 & 0xF; - int32_t family = eax>>8 & 0xF; + int stepping = eax>>0 & 0xF; + int model = eax>>4 & 0xF; + int family = eax>>8 & 0xF; if(family == 6 || family == 15) model += (eax>>16 & 0xF)<<4; @@ -50,26 +50,28 @@ void getProcInfo() { // Get processor features void getFeatures() { - int32_t eax_max,ecx_max,eax,ebx,ecx,edx; + int eax_max,ecx_max,eax,ebx,ecx,edx; + + // Note: If the comment begins with a quoted string, that string is + // used in /proc/cpuinfo instead of the macro name. If the string is + // "", this feature bit is not displayed in /proc/cpuinfo at all. // CPU flags printf ("flags : "); - - // EAX=0: largest value that EAX can be set to before calling CPUID - __asm__("mov $0x0, %eax\n\t"); + + // EAX=0x00000000: largest value that EAX can be set to before calling CPUID + __asm__("mov $0x00000000, %eax\n\t"); __asm__("cpuid\n\t"); __asm__("mov %%eax, %0\n\t":"=r" (eax_max)); - printf("\neax_max:%d\n", eax_max); - - if (eax_max >= 1) { - - // EAX=1: Processor Info and Feature Bits - __asm__("mov $0x1 , %eax\n\t"); + if (eax_max >= 0x00000001) { + + // EAX=0x00000001: Processor Info and Feature Bits + __asm__("mov $0x00000001 , %eax\n\t"); __asm__("cpuid\n\t"); __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //feature flags - __asm__("mov %%edx, %0\n\t":"=r" (edx)); //feature flags - + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //feature flags + // Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 { std::string features[] = { "fpu", /* Onboard FPU */ @@ -105,10 +107,9 @@ void getFeatures() { "ia64", /* IA-64 processor */ "pbe" /* Pending Break Enable */ }; - printf("INTEL 0x00000001 (EDX)"); print_features(edx, features, 32); } - + // Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 { std::string features[] = { "sse3", 
/* "pni" SSE-3 */ @@ -144,21 +145,69 @@ void getFeatures() { "rdrand", /* RDRAND instruction */ "hypervisor" /* Running on a hypervisor */ }; - printf("INTEL 0x00000001 (ECX)"); print_features(ecx, features, 32); } - } // EAX=1 - - if (eax_max >= 7) { - // EAX=7, ECX=0: Extended Features - __asm__("mov $0x7 , %eax\n\t"); - __asm__("mov $0x0 , %ecx\n\t"); + } // EAX=0x00000001 + + // if (eax_max >=0x00000006) { + // // EAX=0x00000006: Extended Features + // __asm__("mov $0x00000006 , %eax\n\t"); + // __asm__("cpuid\n\t"); + // __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags + // __asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags + // __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //extended feature flags + // __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags + + // // Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 + + // { + // std::string features[] = { "cxmmx", /* Cyrix MMX extensions */ + // "k6_mtrr", /* AMD K6 nonstandard MTRRs */ + // "cyrix_arr", /* Cyrix ARRs (= MTRRs) */ + // "centaur_mcr", /* Centaur MCRs (= MTRRs) */ + // "k8", /* "" Opteron, Athlon64 */ + // "", /* "" Athlon */ + // "", /* "" P3 */ + // "", /* "" P4 */ + // "constant_tsc", /* TSC ticks at a constant rate */ + // "up", /* SMP kernel running on UP */ + // "art", /* Always running timer (ART) */ + // "arch_perfmon", /* Intel Architectural PerfMon */ + // "pebs", /* Precise-Event Based Sampling */ + // "bts", /* Branch Trace Store */ + // "", /* "" syscall in IA32 userspace */ + // "", /* "" sysenter in IA32 userspace */ + // "rep_good", /* REP microcode works well */ + // "", /* Reserved */ + // "", /* "" LFENCE synchronizes RDTSC */ + // "acc_power", /* AMD Accumulated Power Mechanism */ + // "nopl", /* The NOPL (0F 1F) instructions */ + // "", /* "" Always-present feature */ + // "xtopology", /* CPU topology enum extensions */ + // "tsc_reliable", /* TSC is known to be reliable */ + // "nonstop_tsc", /* TSC does not stop in C 
states */ + // "cpuid", /* CPU has CPUID instruction itself */ + // "extd_apicid", /* Extended APICID (8 bits) */ + // "amd_dcm", /* AMD multi-node processor */ + // "aperfmperf", /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ + // "rapl", /* AMD/Hygon RAPL interface */ + // "nonstop_tsc_s3", /* TSC doesn't stop in S3 state */ + // "tsc_known_freq" /* TSC has known frequency */ + // }; + // print_features(ecx, features, 32); + // } + // } // EAX=0x00000006 + + if (eax_max >= 0x00000007) { + // EAX=0x00000007, ECX=0x00000000: Extended Features + __asm__("mov $0x00000007 , %eax\n\t"); + __asm__("mov $0x00000000 , %ecx\n\t"); __asm__("cpuid\n\t"); __asm__("mov %%eax, %0\n\t":"=r" (ecx_max)); //gives maximum ECX value __asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //extended feature flags __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags - + // Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 { std::string features[] = { "fsgsbase", /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -194,13 +243,12 @@ void getFeatures() { "avx512bw", /* AVX-512 BW (Byte/Word granular) Instructions */ "avx512vl" /* AVX-512 VL (128/256 Vector Length) Extensions */ }; - printf("INTEL 0x00000007:0 (EBX)"); print_features(ebx, features, 32); } - + // Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 { - std::string features[] = { "prefetchwt1", + std::string features[] = { "prefetchwt1", "avx512vbmi", /* AVX512 Vector Bit Manipulation instructions*/ "umip", /* User Mode Instruction Protection */ "pku", /* Protection Keys for Userspace */ @@ -223,7 +271,7 @@ void getFeatures() { "", /* Reserved */ "", /* Reserved */ "rdpid", /* RDPID instruction */ - "keylocker", + "keylocker", "bus_lock_detect", /* Bus Lock detect */ "cldemote", /* CLDEMOTE instruction */ "", /* Reserved */ @@ -233,10 +281,9 @@ void getFeatures() { "sgx_lc", /* Software Guard 
Extensions Launch Control */ "pks" }; - printf("INTEL 0x00000007:0 (ECX)"); print_features(ecx, features, 32); } - + // Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 { std::string features[] = { "", /* Reserved */ @@ -272,19 +319,16 @@ void getFeatures() { "", /* "" IA32_CORE_CAPABILITIES MSR */ "ssbd" /* "" Speculative Store Bypass Disable */ }; - printf("INTEL 0x00000007:0 (EDX)"); print_features(edx, features, 32); } - printf("\necx_max:%d\n", ecx_max); - - if (ecx_max >= 1) { - // EAX=7, ECX=1: Extended Features - __asm__("mov $0x7 , %eax\n\t"); - __asm__("mov $0x1 , %ecx\n\t"); + if (ecx_max >= 0x00000001) { + // EAX=0x00000007, ECX=0x00000001: Extended Features + __asm__("mov $0x00000007 , %eax\n\t"); + __asm__("mov $0x00000001 , %ecx\n\t"); __asm__("cpuid\n\t"); __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags - + // Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 { std::string features[] = { "", /* Reserved */ @@ -320,27 +364,71 @@ void getFeatures() { "", /* Reserved */ "" /* Reserved */ }; - printf("INTEL 0x00000007:1 (EAX)"); print_features(eax, features, 32); - } - } // ECX=1 - } // EAX=7 + } + } // ECX=0x00000001 + } // EAX=0x00000007 + + if (eax_max >= 0x0000000d) { + // EAX=0x0000000d, ECX=0x00000001: Extended Features + __asm__("mov $0x0000000d , %eax\n\t"); + __asm__("mov $0x00000001 , %ecx\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags + + // Intel-defined CPU features, CPUID level 0x0000000d:1 (EAX), word 10 + { + std::string features[] = { "xsaveopt", /* XSAVEOPT instruction */ + "xsavec", /* XSAVEC instruction */ + "xgetbv1", /* XGETBV with ECX = 1 instruction */ + "xsaves", /* XSAVES/XRSTORS instructions */ + "xfd", /* "" eXtended Feature Disabling */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved 
*/ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "" /* Reserved */ + }; + print_features(eax, features, 32); + } + } // EAX=0x0000000d - // EAX=0: largest value that EAX can be set to before calling CPUID + // EAX=0x80000000: largest value that EAX can be set to before calling CPUID __asm__("mov $0x80000000, %eax\n\t"); __asm__("cpuid\n\t"); __asm__("mov %%eax, %0\n\t":"=r" (eax_max)); - printf("\neax_max:%d\n", eax_max); - - if (eax_max >= 1) { - - // EAX=80000001h: Processor Info and Feature Bits + if (eax_max >= 0x80000001) { + + // EAX=80000001: Processor Info and Feature Bits __asm__("mov $0x80000001 , %eax\n\t"); __asm__("cpuid\n\t"); __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //feature flags __asm__("mov %%edx, %0\n\t":"=r" (edx)); //feature flags - + // AMD-defined CPU features, CPUID level 0x80000001 (EDX), word 1 // Don't duplicate feature flags which are redundant with Intel! 
{ @@ -379,7 +467,7 @@ void getFeatures() { }; print_features(edx, features, 32); } - + // AMD-defined CPU features, CPUID level 0x80000001 (ECX), word 6 { std::string features[] = { "lahf_lm", /* LAHF/SAHF in long mode */ @@ -410,65 +498,207 @@ void getFeatures() { "", /* Reserved */ "bpext", /* Data breakpoint extension */ "ptsc", /* Performance time-stamp counter */ - "perfctr_llc", /* Last Level Cache performance counter extensions */ + "perfctr_l2", /* Last Level Cache performance counter extensions */ "mwaitx", /* MWAIT extension (MONITORX/MWAITX instructions) */ "", /* Reserved */ "" /* Reserved */ - + }; print_features(ecx, features, 32); } - } // EAX=1 + } // EAX=0x80000001 - if (eax_max >=7) { - // EAX=7, ECX=0: Extended Features - __asm__("mov $0x7 , %eax\n\t"); - __asm__("mov $0x0 , %ecx\n\t"); + if (eax_max >=0x80000007) { + // EAX=0x80000007: Extended Features + __asm__("mov $0x80000007 , %eax\n\t"); __asm__("cpuid\n\t"); - __asm__("mov %%eax, %0\n\t":"=r" (ecx_max)); //gives maximum ECX value + __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags + __asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags + __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //extended feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags + + // AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 + { + std::string features[] = { "overflow_recov", /* MCA overflow recovery support */ + "succor", /* Uncorrectable error containment and recovery */ + "", /* Reserved */ + "smca", /* Scalable MCA */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* 
Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "" /* Reserved */ + }; + print_features(ebx, features, 32); + } + } // EAX=0x80000007 + + if (eax_max >=0x80000008) { + // EAX=0x80000008: Extended Features + __asm__("mov $0x80000008 , %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags + __asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags + __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //extended feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags + + // AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 18 + { + std::string features[] = { "clzero", /* CLZERO instruction */ + "irperf", /* Instructions Retired Count */ + "xsaveerptr", /* Always save/restore FP error pointers */ + "", /* Reserved */ + "rdpru", /* Read processor register at user level */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "wbnoinvd", /* WBNOINVD instruction */ + "", /* Reserved */ + "", /* Reserved */ + "", /* "" Indirect Branch Prediction Barrier */ + "", /* Reserved */ + "", /* "" Indirect Branch Restricted Speculation */ + "", /* "" Single Thread Indirect Branch Predictors */ + "", /* Reserved */ + "", /* "" Single Thread Indirect Branch Predictors always-on preferred */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "amd_ppin", /* Protected Processor Inventory Number */ + "", /* "" Speculative Store Bypass Disable */ + "virt_ssbd", /* Virtualized Speculative Store Bypass Disable */ + "", /* "" Speculative Store Bypass is fixed in hardware. 
*/ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "" /* Reserved */ + }; + print_features(ebx, features, 32); + } + } // EAX=0x80000008 + + if (eax_max >=0x8000000a) { + // EAX=0x8000000a: Extended Features + __asm__("mov $0x8000000a , %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags + __asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags + __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //extended feature flags + __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags + + // AMD-defined CPU features, CPUID level 0x8000000a (EDX), word 15 + { + std::string features[] = { "npt", /* Nested Page Table support */ + "lbrv", /* LBR Virtualization support */ + "svm_lock", /* "svm_lock" SVM locking MSR */ + "nrip_save", /* "nrip_save" SVM next_rip save */ + "tsc_scale", /* "tsc_scale" TSC scaling support */ + "vmcb_clean", /* "vmcb_clean" VMCB clean bits support */ + "flushbyasid", /* flush-by-ASID support */ + "decodeassists", /* Decode Assists support */ + "", /* Reserved */ + "", /* Reserved */ + "pausefilter", /* filtered pause intercept */ + "", /* Reserved */ + "pfthreshold", /* pause filter threshold */ + "avic", /* Virtual Interrupt Controller */ + "", /* Reserved */ + "v_vmsave_vmload", /* Virtual VMSAVE VMLOAD */ + "vgif", /* Virtual GIF */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "v_spec_ctrl", /* Virtual SPEC_CTRL */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* "" SVME addr check */ + "", /* Reserved */ + "", /* Reserved */ + "" /* Reserved */ + }; + print_features(edx, features, 32); + } + } // EAX=0x8000000a + + if (eax_max >=0x8000001f) { + // EAX=0x8000001f: Extended Features + __asm__("mov $0x8000001f , %eax\n\t"); + __asm__("cpuid\n\t"); + __asm__("mov %%eax, %0\n\t":"=r" (eax)); //extended feature flags 
__asm__("mov %%ebx, %0\n\t":"=r" (ebx)); //extended feature flags __asm__("mov %%ecx, %0\n\t":"=r" (ecx)); //extended feature flags __asm__("mov %%edx, %0\n\t":"=r" (edx)); //extended feature flags - - //THIS IS NOT YET WORKING - + + // AMD-defined CPU features, CPUID level 0x8000001f (EAX), word 19 { - std::string features[] = { "cxmmx", /* Cyrix MMX extensions */ - "k6_mtrr", /* AMD K6 nonstandard MTRRs */ - "cyrix_arr", /* Cyrix ARRs (= MTRRs) */ - "centaur_mcr", /* Centaur MCRs (= MTRRs) */ - "k8", /* "" Opteron, Athlon64 */ - "", /* "" Athlon */ - "", /* "" P3 */ - "", /* "" P4 */ - "constant_tsc", /* TSC ticks at a constant rate */ - "up", /* SMP kernel running on UP */ - "art", /* Always running timer (ART) */ - "arch_perfmon", /* Intel Architectural PerfMon */ - "pebs", /* Precise-Event Based Sampling */ - "bts", /* Branch Trace Store */ - "", /* "" syscall in IA32 userspace */ - "", /* "" sysenter in IA32 userspace */ - "rep_good", /* REP microcode works well */ - "", /* Reserved */ - "", /* "" LFENCE synchronizes RDTSC */ - "acc_power", /* AMD Accumulated Power Mechanism */ - "nopl", /* The NOPL (0F 1F) instructions */ - "", /* "" Always-present feature */ - "xtopology", /* CPU topology enum extensions */ - "tsc_reliable", /* TSC is known to be reliable */ - "nonstop_tsc", /* TSC does not stop in C states */ - "cpuid", /* CPU has CPUID instruction itself */ - "extd_apicid", /* Extended APICID (8 bits) */ - "amd_dcm", /* AMD multi-node processor */ - "aperfmperf", /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ - "rapl", /* AMD/Hygon RAPL interface */ - "nonstop_tsc_s3", /* TSC doesn't stop in S3 state */ - "tsc_known_freq" /* TSC has known frequency */ + std::string features[] = { "sme", /* AMD Secure Memory Encryption */ + "sev", /* AMD Secure Encrypted Virtualization */ + "", /* "" VM Page Flush MSR is supported */ + "sev_es", /* AMD Secure Encrypted Virtualization - Encrypted State */ + "", /* Reserved */ + "", /* Reserved */ 
+ "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* "" AMD hardware-enforced cache coherency */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "", /* Reserved */ + "" /* Reserved */ }; - //print_features(eax, features, 32); + print_features(eax, features, 32); } - } // EAX=7 + } // EAX=0x8000001f printf("\n"); } @@ -476,6 +706,6 @@ void getFeatures() { int main(){ getVendorID(); getProcInfo(); - getFeatures(); + getFeatures(); return 0; } From 0ffa60bc5c9ae790cb553c67e66774f7657ada91 Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Wed, 2 Feb 2022 13:48:12 +0100 Subject: [PATCH 159/174] Fixed broken CMake message --- cmake/AddCXXCompileOptions.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmake/AddCXXCompileOptions.cmake b/cmake/AddCXXCompileOptions.cmake index effb7abfcc..57cab1f783 100644 --- a/cmake/AddCXXCompileOptions.cmake +++ b/cmake/AddCXXCompileOptions.cmake @@ -17,8 +17,7 @@ if(NOT DEFINED CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION OR NOT DEFINED CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION) - message(WARNING "Update your CMake installation! We fall back to - compiler options back ported from CMake 3.17.5") + message(WARNING "Update your CMake installation! 
We fall back to compiler options back ported from CMake 3.17.5") if("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xAppleClang") From 304770e37d79bb99c8d94512056de24bf7b710fc Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Wed, 2 Feb 2022 13:53:33 +0100 Subject: [PATCH 160/174] [ci skip] Made OFA compatible to CMake 3.2 --- cmake/ofa/HandleX86Options.cmake | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cmake/ofa/HandleX86Options.cmake b/cmake/ofa/HandleX86Options.cmake index fd6d0ad87f..0ddd97a45e 100644 --- a/cmake/ofa/HandleX86Options.cmake +++ b/cmake/ofa/HandleX86Options.cmake @@ -403,14 +403,14 @@ macro(OFA_HandleX86Options) string(REPLACE "," ";" _push_enable_list "${_push_enable_list}") _ofa_find(_push_enable_list "${CMAKE_CXX_COMPILER_ID}" _found) if(_found) - list(PREPEND _skip_check FALSE) + list(INSERT _skip_check 0 FALSE) else() - list(PREPEND _skip_check TRUE) + list(INSERT _skip_check 0 TRUE) endif() continue() elseif ("${_check}" MATCHES "^pop_enable" ) # End enable block - list(POP_FRONT _skip_check) + list(REMOVE_AT _skip_check 0) continue() elseif ("${_check}" MATCHES "^push_disable" ) # Start disable block @@ -418,16 +418,16 @@ macro(OFA_HandleX86Options) string(REPLACE "," ";" _push_disable_list "${_push_disable_list}") _ofa_find(_push_disable_list "${CMAKE_CXX_COMPILER_ID}" _found) if(_found) - list(PREPEND _skip_check TRUE) + list(INSERT _skip_check 0 TRUE) else() # Compiler was not found in the list, so we keep its previous status list(GET _skip_check 0 _skip) - list(PREPEND _skip_check ${_skip}) + list(INSERT _skip_check 0 ${_skip}) endif() continue() elseif ("${_check}" MATCHES "^pop_disable" ) # End disable block - list(POP_FRONT _skip_check) + list(REMOVE_AT _skip_check 0) continue() endif() From c3d97907f6300566e905a16a9f97eeaefa7d751d Mon Sep 17 00:00:00 2001 From: Matthias Moller Date: Wed, 2 Feb 2022 13:58:02 +0100 Subject: [PATCH 161/174] [ci skip] Run OFA for CMake >= 3.2.0 only --- cmake/gsConfig.cmake | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/gsConfig.cmake b/cmake/gsConfig.cmake index 3ffd7de676..49445acb6d 100644 --- a/cmake/gsConfig.cmake +++ b/cmake/gsConfig.cmake @@ -267,7 +267,7 @@ endif() #string(TOUPPER ${CMAKE_BUILD_TYPE} TEMP) #message(STATUS "Using compilation flags: ${CMAKE_CXX_FLAGS}, ${CMAKE_CXX_FLAGS_${TEMP}}") -if("x${CMAKE_BUILD_TYPE}" STREQUAL "xRelease") +if("x${CMAKE_BUILD_TYPE}" STREQUAL "xRelease" AND ${CMAKE_VERSION} VERSION_GREATER "3.1.0") include( OptimizeForArchitecture ) OptimizeForArchitecture() foreach (flag ${OFA_ARCHITECTURE_FLAGS}) From 4917e78189815f477219dc8d07cd67a6ebce78ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Wed, 2 Feb 2022 14:00:17 +0100 Subject: [PATCH 162/174] [ci skip] Made OFA compatible to CMake 3.2 --- cmake/ofa/HandleArmOptions.cmake | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cmake/ofa/HandleArmOptions.cmake b/cmake/ofa/HandleArmOptions.cmake index ed3d0e262e..b1866b9c89 100644 --- a/cmake/ofa/HandleArmOptions.cmake +++ b/cmake/ofa/HandleArmOptions.cmake @@ -825,14 +825,14 @@ macro(OFA_HandleArmOptions) string(REPLACE "," ";" _push_enable_list "${_push_enable_list}") _ofa_find(_push_enable_list "${CMAKE_CXX_COMPILER_ID}" _found) if(_found) - list(PREPEND _skip_check FALSE) + list(INSERT _skip_check 0 FALSE) else() - list(PREPEND _skip_check TRUE) + list(INSERT _skip_check 0 TRUE) endif() continue() elseif ("${_check}" MATCHES "^pop_enable" ) # End enable block - list(POP_FRONT _skip_check) + list(REMOVE_AT _skip_check 0) continue() elseif ("${_check}" MATCHES "^push_disable" ) # Start disable block @@ -840,16 +840,16 @@ macro(OFA_HandleArmOptions) string(REPLACE "," ";" _push_disable_list "${_push_disable_list}") _ofa_find(_push_disable_list "${CMAKE_CXX_COMPILER_ID}" _found) if(_found) - list(PREPEND _skip_check TRUE) + list(INSERT _skip_check 0 TRUE) else() # Compiler was not found in the list, so we keep its previous 
status list(GET _skip_check 0 _skip) - list(PREPEND _skip_check ${_skip}) + list(INSERT _skip_check 0 ${_skip}) endif() continue() elseif ("${_check}" MATCHES "^pop_disable" ) # End disable block - list(POP_FRONT _skip_check) + list(REMOVE_AT _skip_check 0) continue() endif() From a0f778bfb012aae9dd7af15cacb31f17b7051ecc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Wed, 2 Feb 2022 16:37:12 +0100 Subject: [PATCH 163/174] [ci skip] Updated performance benchmark documentation --- doc/figs/performance_benchmark_memcopy1.pdf | Bin 40795 -> 38886 bytes doc/figs/performance_benchmark_memcopy2.pdf | Bin 40605 -> 38953 bytes doc/figs/performance_benchmark_memcopy3.pdf | Bin 0 -> 54949 bytes doc/performance_benchmark.dox | 207 ++++++++++++-------- examples/performance_benchmark.cpp | 38 ++-- 5 files changed, 152 insertions(+), 93 deletions(-) create mode 100644 doc/figs/performance_benchmark_memcopy3.pdf diff --git a/doc/figs/performance_benchmark_memcopy1.pdf b/doc/figs/performance_benchmark_memcopy1.pdf index afd330a5b5df9e356b05c377d2f631d24550f8ee..6084c06fa1dfd75fec60d0c476db62711e217b8e 100644 GIT binary patch delta 36868 zcmV(}K+wP2z5?d20+1vEFf)_Eo+yz{2!G$Nm`4)wwEKPupofrFh&XU!Su_$r(F0A= z6an5$(U$f1In~`W(>>F(E7AK9SfIGv+Os`%s;cYKGrYU%cz3elJ6;8T^GF6 zvGnlGAem61Gma_mcIuvfCMjpmkS*xtWYZhPjFG16IWs{9)!|yk2gbF5Qls>yo1jLp zV1)}cbd;7*!8J3&8e3y0aRmk=?u3#v@F*_9k_Tp`<-P__Vu0UtquWQfL|cduWHv^! z1(|{@-Al!^l4;c_5P^Y|juz0KS${8tu3?k3#0GfBO6h9sBrUN*Td1$_rUDn@3LP|a zQp0#PdJ>mtse$3f$r@dVD>Pz@dy7b^(UZ7DD~%|!E`%Cgh$}Rlq;;5^uMk(Z#2LeF-k5C3tjw1umooc=CJ+E~O=SbbJLaq@|5VzgI>s zq?JPkx7Qw-qy>2LdI>J21^CM{N;A3kIxxir54)4=1l^rnC+ue=*8;pdxlZ6Jxi*SK z%)|(}pln?_8dajgmh@UxqJIKc{90wAMwbLzb)rIVFCUXX$NX?xTCEDJ=~8WRm17z! 
zL;61I@Jrn`2+OoY+SZy`>#6)>cY$G-U;6s(tcRHTgmFP1&>ZQU;j<= zE|A`~JN>!Km<&0+=-wWP!(J(M@Qtg}db(Ivet*)v+(lz+CYdy0 zj|_#>)c$OzIDkryc8%i$R8&kQPg+Et!4c+N93*v?a--gpf-ioDoMveg2` z9Jv;as$a{s8IvJZ9$M0@!Shs`^`F#lvb357nn#ReHi&(&M1NZ$6w$<hl6n5}*2zr(BBXt{lAZjJk zPNyaT&03pi24zf{hfbC>Yw(ZHWuRo3PQIspra@OoIH?RB-QeRTj_wGUd|l}k#ESdodfYpeSSU=k z*2d$9g+&mwYPhcQ*m>G0*We37plQvvqf{$x%PfXD#EeZa=c&3i&B6?~CSBD?x+CHn zB#`-x3yJGdb*eLN9r!!_@k6{vl;a>q0osw^jDKoHfi+yDizB}qUlui_)5lAI&7K!4Y3T_Z;zUf~&+{HX#N*0ro+EJO5*y!n+! zW@+#+2J?{3dqI20jN~o5pe^M_s~jDc0VRDaI-Co62QH&yxQiT5_cw-Y78<4)-luE9 zV73_=E=Zo7E~qX#CrvA=;cT6(FATNVO*2R;$=b>6c1pv@|BdC)&eHv;-B=HC9jsA#uVXTl+Q^1Cp*ixO z(RpJbqNHt^1voC{P#I99EpO&<1d-2s;i!FcXKagNQFAny8j(**Qb*O7ZPkubT29WV zz8)UXE9ZI{G+<(Db#!)j@$vfN>3@Bi!mjV=Rbv|a`orSjw*GqEa%B?z#G-kiDbR>$ zw2=~a((ajUq@Y2{q$9rI+}d$%LJgqE_Pfj5`^Qg->iPY{p>zlN-G8~^cV|!6za2_{ zaG7Ms$rfjik7p$P1WX>v-#%WR-3<2mr8+sI`lDhS8E3q=?|65C55Mi=qkj_J#|XWR zaT4&?P4{N=x_M%UcThZd(^5DI8{QNvRhhSgC;OprjnyVag)f>wxuL+%Q`w5R! zLlZ7ukVmLNlRbM4$UhB~T{mmSZV%~G1A>Q9eY3}^mdzRR7+XR9($ z4OlsM8BhZ##>VM;Y|o1iFU8^(;tXAfMpjzZ_bIBg(nPu!GRAN1lz*N5{-4Sz=$f%D zcQ_@3DyPU*r%X#e-zjdsQ=Zp;|6ln^{ZU_G%Hm;%s}iR2lwS4JxM15+>KVId)(QW) zHjDAdKCgj6=S2JILDABx>FLwy z;nU@ls>4%Fy|c`stbcYWqv)Q|BS=q&IC)5TQ{S*)aa=HTKkm!UzUNQq(j6y9BMKS; zyLPmiF&Pqla`NKM?R{!_B&H_G_*bbve06;&4zAEMii5lKIT9J;Q(yVlhliWXG@^MgFn6s-m0KhJxyn$n4^TgA)`hThPh5nm= ze)H<+$CGrW{y*mv+5YGIhs(R$hy3pL{NdM^zoBD9>zX7NpYG0Xug@dV`0Wo*7cW6q zh+%>6|9%|z!5ZMn^4@0Ur}-_S7&BAI%mRAXBi;W4uZ<3UllBTI5-~V53NK7$ZfA68 zG9WQGH841nFbfp~I5;vlle}3clW+?Sf31ma+v(W0oe3t^1QUB=&%~P8wrx%DWuJZC zGyA;%U-f;}Ro&0Ea4pXw(Af1uCT8sVVJGik50G)Q1+cIISU7oEf4F#= znE|ZK%sl^N=-|u?5HohSG6Tpn0%RQQfi7?qq7IH;&Q=zdt{-9k=Mg|{N&{fw;o+kH zmpee%4(M!UYHSaXH+HoI+I>VcHMRw)I+$7kUA_K`f|}pb)zy)giOIvmgVET|h0(#; z;u8%$z{ASb5}*Qf0Xn+_%>aKDe+*DCwgdiM86zA8K+V$1|xA+WVF z1=_oOc(~b{0i6LK!2zn$asWj~p#9&*a(^4p1OD9`01G3_f9U@0{TGpy{lA=zO-&u_ z9F6V0tn4iS=2o^qfTDyPqpPPYJ;2!B>@P!OTNj59e`9xJD_die4}*VYe{Kwr5LN~l ze-!xd@?1=vtsGrl7+tJv|0o3x${A?-(RMGUzfac2 
z-oeA(=ik8G%HGWUuOiIc9GTSZt(@F|(qjKM`4GYVky!v;0UXTC%v?Nd0H6~9=xJ)n z^cTIFmm~0BN|wLG9~Jode>gfg0?a>(0Qy;(13!M?d|Zs(fdE%$H=v)-KNbIt;8<7y zW>%)I0282vl|9@a_75@8{BQhm_|8_I0A1#f*<%4P|MmILlm5qenK{_odi^o~*ZDFD zYe*}}h|v9A@qct8A`YGa9|m@200S!rGl1o;aZ@cVBXC1b09f2Z*eUuk=D2LR9C z*?zR?e=>Ie_W)4;dpT$T|IMY~@G-SO0QH|C*JI{jHvRZw`F|euf4ThsP556d|F$+1ma~P5m$Z|3__XXJza4Z-bAab#wh#0C|UxHL(BRrkcRNmsTEVX60u0zqQh? z#vh9yY;R%vpEg>#e@Iw)0?m}HTum+io|eDm>VIvSt(84c$-%|yuWJRs@NqQ%OZTx_ zrZyi}hs(!E{!0b?*qr|kDQ<7-VD{IFv2t(%jGdj0z2H6$`2%qPd{{nq(G2MMuOS97 zG1@!0ez*WW%JT!5J2=Drb)uXcA9M4U=x@XcU=sO*xByI|e}51UfJyv6h>HclB>4w@ zu*m*FAAa)xLEOv$CZ#`!4Zx)O2eAW~)c&9k7WMxio(~q|KZqN^Wby}nL^p9ZHnjnM z+}Y-?f8=cck^en+|51D-Z~7m^@j+(lVEfVY|B|0S`r`xEf5!(cM|2mA;L{KN3k zDByoZ;QUMPf8_Qt!2hxNNXFdC{f`sNUxI(=SwEnIoAWB~#1AZiB`49Nf9;<)A zk0NaU0Y56V`vX620H#0XaeVaB{$rm0q5nwN;ZOM=&JO>j{>aVokNShm@#8gN|6k2! zXZg4Ezgx}z(Hh5(#di49Job;e9BtiP{=xrI+CK_sfB7iwkCx*@>GE;={m1#}cf_ghgqDe4eNI{%pd;Fnl-qy% zz8HQ{nc8cITb#rim?%6EuJ0#=C16k!+V_2R^3@2k0dEEUDMOLx;GEVSa5y3wmEyjg^4kmaspU|pI(X$%zlW%m^B7f9nUexKkgk!L)X?LuZ| z9d5{8&{2YOjy&J1IKQgKONAesv#1NUe;#Q1A@3oa3hyRA(=IOyaV^=zjk3e3^RvXe zqw?3|sg(KZfQ_>ug@>}1Jh1fF-sx{6;s=*y+WD5#2-%09`8Bme?qge}SHj-c0^sBLf1Cks`-PHZVJ9&2W%*3p>v^FYRY!p^ zjg8;GX9*7D8VTNTrJi8H8;<5d>Zbg-;r3MGXJ?rsyb*Anc$5NJMn>f0%xG=#bCz{u zIb!>rsV)ivX_bl9Amv(BH#OsT?}bA&|K@K>M4`9gbP=f6*0#^Cf5`4mqMK#-Al>%u z`$c`t3+BMSX$&6o24Tguyv&n)t~PV!XuP`*=LSylyxwCk0L(epX*iRV4Oid+Jo&5I zw6-kcsw~d&Fikf-w&aMck8z}^j)-JVCma!>*9tXKv@BJ4FRLQ+j7C~Dr?e6|x$W52M|gimWJZU*>k8~dr+1!oXmW$$&vJaEaf@I5g4LM z+@&6U?T~)4e^hvGnD^OLL<1nDvw+LFo6gx`LRvKzcTo#}Q47*u0xFRdCI%!OB$V4nWziuKf3>?$4V!w`Q1cSZMEEU0*4IHE z-Vl9$CQTD{e?SG#wv$FtOUY;vAkoFZ=Ion` ze_33XJ(BmAZW!pQf`dZdc>=i{Z(RL2)XQ>b_~@c<Lz!g(=X>*jg3Zm+qr z2)k94e?@Q-577&f9TG4 zmPns(KJi`B2Y3%ROi{o&wAXCpd4dlyr@I8B&JW?DX|hOoPMpskrfmQ$-t4}_HhOA! 
z-8C+wPJJcSC-e5lNh@)5zwSo-N*Mkn<9@Nfl+js)CyE@?zC~NKN|fcf&$Zl4i0q-1 z8)VW(A!0U0_B&|a3|~Rl@obW9e~`p8H5B~fXN8;SVo<)#ts=9x%AVh6povu|FB`NS z(fk9&`6)a7=%MK4;4WRoJp7H4@ra`0MqwI9-z+yfhx;fnm6OILiX475i`o%N&$n>n5}@kh-%fAdK3~qrhIGv#OMRxQYy1SN zqbxLzEWNbpN>Q-;o|@gITS~*#NBQF=&vM-UD?%;qkv7%rig8bqTJIH=mUF=DMh26; zp?wOtj9fkzJPdWJ5M13s-ml3_xT;14LLXU)#*LLr1Dh<9^&b0Ue}AkqBe|LaeV7&F zLiK#m)_$LtNHq8D3%QNew*7DHNw+NNiX86^x5$X!N-DD&$7|fYRSg)`el4{Y&{I ze(Ofb=O<#6ngoGE<-7CBTCLt#7zO=x7NE);r!^R|uI} zYoTKd&x-mGacXAC5x(&@%oSxPu(-0pWE(ZA*2^3ez8wApCw%+GUAG`tTRBmdB&|LG z#~5B3f1J~og+8b0q9DQDm`QIIQB}Gwf&wL$UVkFsFqkUq7(82rOsf3yNN9>J_0>!x zISf%7a9DYd$<9%MLzW9v^A#BFOVhiuxNaORm0Go_(>hbb?iQ==_p`t6p!c>E3PT+* zMe-|EJLU8k=y#TY$kM2^K7ODo6w3pE>}IA7 z)e>n`S2CFW6tz`OV9i10b)YEm<}<$w$Nk5lf&CQ8tCG4T2RDCd3h#xVAAx zjR*Ib{pAz<*WQrkVKFJ|h;Jx6E||`k!JezBaQlospOz=*YijZ*Y|^TiZTt8rQ)|fh ze-D;3zh46rXw0W}CA_nYng^Q&zwJnyf(WTfQKs_0Lv|#dLjc|FLAcR_zDfiW=I`U_wFm9q$-@Ru$jh{ z4S@_88Nvak(cwq74w|41^3>PyP|hIDC1fi`3zPQRKGrKnY`(T&?u6AwZB){>e|fI~ zo3$i+4KMk3GfF)pr#{Kwpu}s^F9gONjSmgrgBC4M6m7tbzBZp{?KnO@BUEv_d99K= zU#G6!q`1ME`Cb(A#((TOMN+4)0{!8dN3(CmD|MasLouvxg}T}h zs2FraD53G=&1n(}>rWT(NKROml#U)ny!x5@ViyPz;-vSE0nX+x zkytN^Kn1n!mL@g zDSOYyQ2|xxF?iAru`BKDbRk^IW-c7Op^a+gRvFl{w%dE%IdyS5ji&94azrS0ch4Ii zyKF`Ttx^QO21pWiMMXGae<)9&$DrA6GcGS2ZnNB;;L)8eaeIeRJ;33SO~egF?NgI2 zGu9&C^{^ODYElf*-3IfF!FL)ADe!`jrcQnDutwpKD=C-mGHvwpwIu2y|KcsSjjle? 
zli<;=9ZT2wTB*Z3esRz@QP`i&^7FdUHedW;`0jX!^&fcuql&2VOVisWA~gLcaID)wP^)C^nj( z^~m>wVWHmz(R+UV0iMYF7n)6w&>4I75iW7*%=WZMK1emTf;wg1H!&$AXfdB!4!F;V z9U0`ND2EDugI~xRe|-WNHI!2H7^QwYU;8=iQZCr${;Pe!K<_1rHW73FRe5BqRKdhN zjBcgl$9iav11W6mQi(cjhxgQt#j(Yh@orp#TZ2U2R0e-9y4JJTT1$E5B-k@;2iTCs z{X4s$7OQ?w0(+@c=GeEah3ia%?nx&(phI1Dy879TKf*&%e`4Y!*QZmOfx?>%Df@&R zu(r}FVJ^}mAF*&1ekNAeSNZCl&owZuNmMbO>*b^9q(;keICG#|Ifr zI@2M0KSMVDMJ=xxN#fyI?*Q-h!C*ECoDdHjo-YHo%lB9XAg0$Bj40(c%lWR{|2mmY z^;MZ|$b`?6tR;?^?S@!&l(*d41odo%qQzC_3pCTGk$lDX^K3$AjrE`&Tu=S)txz4G zXtpnMf4dv?18u)Pfpa+gK+!$OpOoN3p;w?Rr9!tVi`J`EQ0|t#mYN;Aoh0rORO3Ap zT9mlJK2UgvhgQ|hwv1n4T&!0EqcKp0oMIpfv-j|O zrH7ByQD_mzoD>DUt$BdghL2T&1U7p2%-&zJe|1dj!8I+qp1r+kZ*dsKV@voOW_fnU zhJdT*_{LCaB{mL5gh~n_tIwlwO2g|BXM`TC-hX=G9o8`X4L1B~u9$~bn%NI)rcsa7 zXAu+Tf=zwdd*Eu=ZC4`O9DH=CFzilgqjZEuJRy=BIrv4rDErxiruxzXJ^B}>(=6|% zf0;E|QESaAW~3#w(;$3Nc87}tB15ZCBAhI^Bvtn!UxW8(U{d;e^b?OxT7-eJ@m4~zM!zz4Pg^08tVxO*>LGE zS<0aex~uE_>fKtr1Q5Z?D}ttU_Y|A8uxASGt0Jl#JmHTa-Swcc`muaQSzqqAk%@o2|_WEpT5)-;P$_mpDBs)frXsY3d&`c|K@ z>nEp2#V=SFC#96Qy0L4B&a`(mhZ*W5hS9|3zbxoy4v9o<=aQp==wGo?@GH0OZ~O0q zTN-Q;?HrAseh(-ZBwHTzF)McTf0zm-1hIOyIwp32;c-qHQ}?m={^iH1s(&M(WVmyN z6%^H|2Ix>$b5t%;P|+=keo%BMeAQIsEN3(u*s8)4A~{${I}|p+tHwT%g|A)}ZRVqJ zT^*edYiEMoSEXyw95B(vs~53}j3=|xTw&5tst?%Eq&E>L#QPm+l|`zie`;A{dohnC z5rER)CpxV_DE*__DC?It6RvcO+K4KUH5YO{wTmZ>aq+-2=P4?Amu0P>37Hiodbeg5 z+`mHUrSTiP0@LCc$oHLNRj8j9DEP!m$R`6vkog&zDUB_K414Y9p%EdTR|B-M(~-rl ze85j^CM<=vS-K>u-_u43f5=ccwy{=Wt>np}$PGeoXPqR*8i!b$8gr6aO|JJZ>d5N! 
zur9wLQQi4~F(u3dFY9~{P+L^f=+aDk@nlR zIZ93ZKITdc1cn-$J05RxWkvMtaA_@OAjTcySA$nM*0^e6k5F&9e@vH@aWclOp$dyb zP$};hEx)FrM#_BkhNJOnpO--9VFSW`F2R1nJSA-u6nK;HcbICVA^LS+Tj@>F1-5=N zJc=w6OF|rFzJy3JbuieNI1jlXJ%Z93dd2Mi)omUZ?EM$}vitg+M*dz##Acz1w#;vI z+9!Q_@yws6uZ{`0e_j9v-d37=+EZA`snZs^gx?TXuJ&aK0!uy7Fep5l?XQ$s^he!3 zrUY0te6^SJYwHZANJcA7=P&urjPS&G?l^s085T&+eN%83DZix|3@_3ShaO1OG5A(5 zHqy~#%S+tEKnqYM`Dy&40Z8HGDB`kD9HKf^*iuoGSQpo@e{l;Kp@(vyHGL9(OD>e# zo-UX_&?NYEb$qyX{8T{hN<%fW2vtUt&NWTIULimNp?P8$8qfJO{MLL^)OU#UgzsYa zxcAMzR3(zK)Img8(?!zZzEZktPbTs~Ar+q&4U1(JD4n%Jb7>d4x9HhxGCu|$gQnA$U`j}p%BQ6GofRjKt*h!7)+ACu`PNwj_$`ut@B zOcmQ8f3hff>s%M;D6=a0sBW-f@>rUDc(M%Mgk+gg*QwUdMdZ;ltN0s5v(M4>@I}1B zd$-wwuFZM}kkC)P+i9sv~4 zJbgK@pN@hapEPw`x$RCm_N&dc1@}uE*qK{=e?qMa-onhpD@!8aDl=Jse_Rf|f&8ZW z1P2g3>k<{WRgl)UGP>#e^y3CGJbQ4n@CQTDMo(3l9M!B@D@_{)Bz zCa2yvcq85JQ-mj+pk+x-X1E4!mnHU*45&oMlhW5pE7Twjj`u0?$mn4jR$3Syzh8|9 zf5(ywd*`8egbwPaLP$+R?m5We`0X_Uyn)SK+Wx$R7)e+eqg6RC62~N@MZb~F6niAx zI0-j}ly;7(vaS(cUL3ii(FIpj$`#7Ua4sMj{Q5LSA8|DL98s(c)--Rwuh>|aWwrDbs|H4upg*{ZOGPEUU$f02iat1{23&%~9cp+soJ5Vk@CG$5=`mFvA% ztSq%=>&LS0E3qBSed_+WBiSq5dJ#{9+LfaNKaVv#Gt#&xzL; zE|Y8hqBca86pXX8xF*s(*3CJ zl_Tl7=;t)#OE?=%iM8!&_@ouNfrnyJG`i2^v-rMuAhCF7@>O#?N{TQDxD3b<3|!5O zwm+&{in0;h!-PuPF=p_U&kT=;GG}87 ze_^UTsG!)0<~mB$nnuT0_a}Rr{ta^`lI~FO1_SO}`E%0|99$?lR(QW4VebTahMoNe zj9w|ZWapti-r73>H*NY8D#qe>JvWC0F?I+mzMG)yluepmPTJ!EqOO<6T$4bL6AnCntl4 zq8jeQt+_=w+8SFwK|B}JlZ;+_{!HdNx^YV@kHKClTp+7TdA=9hipb{5-yym;Zi$uz zpIwy4_nFvzhMipLr}2zK>TpckWpyuY532oH3RR%GLyGU?6&ePve{^6bnQ5N5l4u%0 z+=Sq<)_?~g-cl0P2$|T>s32NV-BTZO1SuL0Qd{c1Lj>fHQ)IAoYEN}%`|Q3EwkPDk zPXc0I4Vu1!nSTlr?FaWM+JV8uL@^%2d1YIFa#*Tn@7h|NrguxZuS+F|bWS3)Me|Aw zLIC-kv;9E|TgoSVe>7U0cGt8U<)|!o?4I9SrT6#^bo`*?(Ef)Q!ImI~fCe}23Emo& zu*Z;ZpPSU!Pd|Yi!niMWAu&48s=nJ3`sKAM%Ci$0iz|{bo523aFQKI;D+#q&H!k_x@i_ z>16Q_2O)6folCM*+VO|mpOdENPQL}Nlw}Pk$&sCE49O|rg}b`&AV=!_5(+Db^+EPb zOF`Qk&T9@wnO?v|F*4-3j%OX!A>&tAbsBj1PLFq=Du^aE88Kb8fE~#Lt?2UfY7E1* z;^@t7{tHU*fB0_3oLI%ySWauc%H6trXR7DLi1=8r2W)82voc~Hp-wEf@~h%V1=i3P 
z8ZMB|6$K`}ck{#1$zuI>E-;IwiO}zb-2Cp%d%6uWLG9QB)c0!m@vqUjku{27O4#Qu zV6ao{ZZu9G{{)4+KS{7k&Mn1hY?LAsAdY-bp2$_*e}EOrIk#kd|84>0(Ixtf)w!dH(nAl(&smlqN2Wvh9~fo3-9M zQ*nb~36A?`BafjuM;^KnjT<3@@=ufg6?mL=auGJm`_bzqv>A3J(um;+lM+ zx^5h&TZ^smD)CXsb@a7HLd#^F-9Qu+ozYl1W6@wQrSyRim@0r(?jN-WSHx%Kva%rv zpFv(7>)i4yQOVc`e>s3}ZS%0AjFd=q`0Abkg3CYE@tMAcDXNEuZb65fJ?qfm*$v{W ze!ZZa=%RSp!1L3~3Y)X+397|elPBAu5)Br_ z*ak*F!V=`n$%ijB_Z36G^d!%~iOIyu*$=ufN0cRyj?B6kL{iAEyKzB1QB(&)a8R6J=cG`cL{ajl z^d&O_?;60?s4K6UkEd#b-(2>PM2cGBeWn~Eds)oxLv4%ri5rSEGohUkSX8zmEAU(K z*D8Kgjg@@l)YhU}Pqpl`ZYB!of8g2fpQ*l|560*4#p+_Ia?;FsSF8EpoLb}6k&66HggZ z1SyZ&dh?S>U|eq!c-t{AMF`LV&NVIxnlo1YB8$!D&htxd%M`N)+rLU^f2(&40YT}P zg$krs;Mki_`_9v5JffpG9*L)EuqN?zjiZniOFD#TCq96Pb5-l>}tVQa#(L1 z4_dhMM)FC?{tQuB5T>))@`sc0GH=|A_{!JZsYC%OXpVaG=yi*Xe{787V{;rs`dfzZ zCy`M)sS{!mF$ST~k%u_Cow!r)g#V*k%?mM;)-?RauwQnrlonM53fs=+wG%G`vP22S z^6Z8DvBI%)xT4}d>U|Fdij;r7vssW0tH@N>ikGQ8J~$AGassA55eAih3&AW%^jBn^ z1;rf?R(K#oB`mNOe`4*`{L5#Zx~7`fQn+J-HwuUoR1HirwirP;k(r7}>n~}v9IkGZ z?@sa~`2me5rcc}bAbFbzZ0KAnKIDAKI0HUvm}kdbdRLOrQ%GI4O=4gmwjCkxM+Bmq zVc4tqP?KpijBTo>4Wva&k8xn~uEO)>S5sr+Tihe5H)ryQf8Q{KVRO*Hm+y+Ibsjxx z)Zo>9_Gt_ zEUHP6$RCxCHLpiIn|C?H7ZCV`GBlYPQ~r`>>!2D{bu2rAbTqz&OH#-kK(ZG z@tsK4oW$H4e<{{(prrwm#dCP~o+IzKmK-Pa+TEkH8u|i~;ChKqe(D7=T6S`3*R{}9 z%f>yaXaOReeE$Lhh{Oned^?ijv!e~pM-es`}1uUA;TKK{D z(A-vQw9T#D;2D;BULnk;i6??_%C5imXYlWbvwk4#f3)RAQQuBkxZ3UVQ6BI8bXuv7 zQ%uA{@pc+?FX03>w76sbTW2uC7Bk=nE86h}PSD-0{^mLpbbTbYZIOdd;>;f!bzpaymYvR8WR7WBZBANjtW5MCl$2m{RbWd-fAzq5kD} z1Jskiwu^6YdN?I1R}J%CTD`(QyP4Kx#77LE$`6g>bFYmRIS3FHLqrL}1a6G+3nW4Z zc}P}_2O?wG)Q9D0w3j_e^hb6|a=z_|JicvYf1y)I<7uMZFMIvamEMo&DkdyRP`9ru z>pp#=9JC%+39Q?4&1C? 
zVM91q>bJG*4$}PgeGO`?K;cz2jC$r2R#;1N1%C+@w3}${Lzf>C(Ouj7YfK^=;l+y$ z(J7p_D)~?FknGt4q>3@UiTAG3G$G%pf8KdFBXW|aBHNE7*uonfG3Q03uwhnKt+*L5 zZ8Nt-`2>0=?}P*UjRU|vJC+ZmhS*9{$tNq3n4n{W$_idaH1yCL(#YbLPVu{9Q8YVE zoi5D{v3m+%Qq^60h0j+C#?-ensl{J&s)+_3W8OAye<<)x z$`ZhGuW^3U#Vf3+Ack1tq``??14uyKO*PGZ+L{3=SS{#nJ(tcT&!Fqt; zBFn-y8Oki&#nd{)ev;kMiRQ9Cax}2GAUHYZE6vf`K2$CkcB}Ja^vpdA9(doK+jNO* zubIjhifk4N&xN`YDWn76Jmo}!e_xj(H{Kpxs}TwxQTvK=bnVPSR4#i*#|Tj%cs1xI zmgL;)pD&=Zw^+g))QSx{K#j@An=^wUYS~2j*7iu?Sv`P2>_;TVx*vJ$uoSFWFL;uB zlVkU^g?ka41>wOfztr~yq~NM}gtB_JVSQbmc-JBU@+Uq(n6+ZoOe?`Se@d1#8)HJ& zxuj;KPSG)x+OH12C!<;O>p*Y!w_s^eP}7qupUJ|7Oqgl*fw*Th1h^ITmpbE1<4(HpCSl(ghVIs-) zWo*Rd1#Ev`oSiszm37r#e>oczfw-fy>m|Dd4|XTKxNw|cj^;wdMM!pSXjc$Hj^}{= z(Oi;wjLHCR(?aIE*1#1Rq@6)eG#%a~mGy>|0=7yb)6s6}7IK(6}L-afv>=(Vm@b2<5w@)SXyKLH480MuaN3e{pBYc$KQ&$KR75 z50uG#8yNMttQ&g{QJ1g^PU+?YyB!>zv!3S4_WU4zSIWBT8 znvFNALdX)9j*gfEsUyoNzF}~>kv~AZAsfa=buJFm@4sqqFon=u*miY+^vF;z!lAX3CO<9h$|S$2=Q=ZNzGG1RgmpoZ^OLMs$!H99wBsRb4G zMqXNESwuFcUu+XKtNnP$hBe-l8?aOYUXC37oOd*NLqrcX;X zmngN0YU9)s-GG5CvAOg%Uy9<1BW6oce;vG<-X8;JFV?S12Y(V#Id=PU*4T59T=wO~ z@I>CZlI$sxD%$5AtHCaW3+X9h>uq%`)sh{oeuRBouj=xQP;*)xCCk(~ZuIEu_~f_; zP65~oe+9wZVX`>IPC6YlbyVAG^&LeW*DFEyw#S0oLknJ%E(1os;!^TzZWlyv9}x_S z{bx{&2z=%sn>J;_e8Ex#?i>-)5aJVCzvQuu=>R0;ZFivr8uK>A1RcfazG2e7c-#Ik z9@3r?YXUGiIL4WBnDA7-N__qqGh2kVufMMKf2jM01!9PDeRU&Y)&tcan5xBt zsT_FJAi$_i9F@u7`jvGue>z_3$x#t0+kR@7*JoJ{5Y)R1Lin`#!=_VDG?!ZpA(_U6 zV})h{?miy&2jK%?N239ePCL#l>SoRU>G{rWzm0U>TtvJIe(Dt4DZ?T*1@UxqxKaBk zf0)#bO%8yg;}cc@-j;62re$)PoZ$C}JAC9NiEeyuMgW#YNeOe^ib^0;EkaB$qb7wO z8!XnlCNh*;dkiS#@PbpOvF3YByLR^EawP^-(GrO#vP}OrpL~*PY(^T1f9+-wlN&b- zm>z%EY!!x}eqG;!n`;WQRexojj_EZ8f6PmunKEu&ecB0?Kk~Ye(HO5uVqLd9CBJ^! 
z`wG#@ax5cMwDH)so?J!R^G^EG?d`E0ia%9%F)K~MSi_(V)$|>UitV7S*-}0@E%F>I z_&W7hzcym6g$?Q5W%a|FxT3Rn`XJvL4nu_hG27O~UH%UgMS|hN-4;D+M~>Rme{|~Y zFOgGRYGeAucwi~{{&jVzgCKDu@pZ^+(Cg@VqEppEM7BA{spM25on=jdk;#G9<(&n1 z*`1quC8gzY#H&x>=09U63GL^c-|e}D4Cmk+E|q6$2tvn8X4!n|gXN#(qc{{nwLn6u zUBfEqRa7*{ocmCaRb%Hz4n!91f3q%;}1S;%ORr(cIKO6PJgq9-`8m!kGQ8Q1q^a^SkG$^Uu z9vPbBlhjn)V{@QQ*r@B+wr$(CCbn(ccQP?2m?RTtV%xTD+fF9tdiHwvs=ansb@jKa z`UhO+0rj=p@*qnU(fy#9*R$nS{l{5qi0dU-e!v^ygffTu_vRr*xLD~lpHVd~-qRjSLW=s)6^Q1nlAW>`-fAyIaSnINi-;IIGo%_En7 z@^&@IQ1s-VMER5pha1%e>h|bZ+Tuj=>^db~`CQcd zF%(JD)f;t#+Cu-pWw4wJ{LrPAoa^sX!fC>rWQY)-L~3CZ9r16qq2DS?A3F7g`l{0A zEuQ5_x$^|lWr=>^q^~)N43+z`S#yo8vAP6$3^#e=o(R3^^5pbP{Wk) zvO_+BsVfT{ve4&#k(nEkP`SfI+nG9GXq~@MP2)uH;N|QK9(M@>h2S~EB-t`B7jRR_ zs4MAp@P~s+wq%~`zqYCpO9w=Wtv1!!z#tnMcpaSl3X*e(eS5v+gMJJWi$_bUqROUU z-L*K`=e3cVEl)oYzJ3gBw4Qx6?-&$JWjSn(o|@ z>$7-h2l~2Y7D0Ey%~bY5^V;7M4m?U$0$ntg?G_m};2#9Hyv+yZn@022+l3OeVPEVG zJuvh+8=mbMQRGko#BoP?OR|gPU-)g9YE@MP_FzrAZ(M1>CnqzN^siC5Vz@Iq?rbv3ai|mPAmRLQ#o4^V65dWP2 zsbNcnNFbn2vJ(_A13CB$%9lrj9x^+q; zlbp;S9yqljVP=J+nH-#&IZH zk9gjE-gG)Z=^%K%3hzAWSLyjP(?CuZoOhU;rhbc%N3i-E$$r|z8gezYae;8?11|v1 zHyyHLkvh(+XN+q%W1z%&nCT_2Mf;hQ8ePvGI9uEEjmegyj+W}#Voz=mj``%lt5p+| zDdprW zIG02glu}G;J!tZH`{VbP5KQGu#n?ybNu#tK#4#{E$W9ge7Q*aJaBZre9;{QUx~xC= z_b1%3%U5KHK0eA`k?MR*yk7FAr88msj9Kc%0sO4?(vSZNYAq+XTJ$xh*G0oQ6XS-V zR+f{IPzB)<>(CUnz>6#dxU-rVD%O4+AUi0RZ7ov&eBV*KEVUAbTKVoc=uMgTFw!?h zVHdK`*wghH+{|F5k&c9-x7RI=(J9+bNZKVm7ES`*E;lp7f5Mc&Zm*I$C16nZl1Epv zK34>3_5`uekJyIP9Hge{R6TTRaMkV&86`uhE@-oVye=zNaH!Av+Xm#Br~n-5GeK@AC{#tRYh?x?HvE}-SSl>+Z|{o z`9o~3*){w@9G~WCb*Z&*6SuRDcbL74FpbX5hF4hI=g16X?(g7LRkCoVAlM)rCp((s zi`9gMJHvII%fO7$Dg!^Qu*M&9fS_}+E~VH6F$pr&e89Ymh&=hQpXU#?lb4aJmu)+4 zU^w=edrnCwk8~u6WHLH3i77`bsRNecv~K^Mv2cqq+J$qn)wt9MilF{fVY7xab2Yz^ zUrdtPxZy>n!&?7y;d5bjbt;A%4R7pw(e5?jvsx2qNNInAwN?hBAQVO!Aald@yCDPi zhlrD1j~WJ6y=+TwBTmtUST2kX>yA@=mwYcsH-BZYw|(e7mdmn-|N8y&esd>Ce~tb1E;UW`r- z*0S^SXLY~s8jp1-LiW%Yps-=5HqMtI$FCVN0~UDW_;MJNRrQO&gM#It$gI;O{cAjG zy}yZp>X4fL(apM3_ 
zOQsI8$qzo(nFErsKnU^Y&vC6yJH~@7wPxsHMvYyqu9MTOQK;kx06okniUXVBwp#2>p1GQKV2ipYQReSv#)2Zo7s_p~ z*>JRd=s#QW@7 zyw_@Jn6V`&+%0$>1T_tR@eVhj1Z_IVN=gY0ehJaq`P1Fjy}l2f@WPYkG=b->?yFRW z!(lW2pkh)0@cip9yFuVO&7lfMilFzKnwx|}=o8Avkocc#WU=vjd*(|E^-ICD3-KTX zU!z-z2V1(XM2X-pzSX?V^^z6t8^em*hK;_02d_yEw{brW3g0R;n#v+;P}l6~VnVRH zKQ}KqzuR~);#m%dC9%~k3lVmWmF9iE?y7GcP&o zVa^9$5*d*36cLz%S-}mh~W7ZcV`;k-u2upOWtQ??r({_4IAF7`DKcS zT4o&52{VVEm$*tVZDgN(9&B5PcNb}@?9aFMgE=|_@Jwa`uXkw&WU>o1n)4#^(L5K3 zGxG+==Q2IhGudSfELD}{GiX^T*n)3W&0%{#0Pr5ICkqYbmByAhH1yki4V}MRDzyl| z6T=5!B=-WkYJv*VeB8(yoENME+sZzEd14h!#4dT7AYuJ+L*jghI2$Hl)@Z99&?GEa zANx%!hCzC#h9)||Nw!?fTJ$c>B!oi509mS-LluqAh)s?w1@>?uCTTK`yoin~7{i+Q z2cTc^of9SRxie0{JL~&v62F;GauvHCI0Qs*y%B8~dNK`dv~Gy1bBd7^QWq?D4zjJ$ zBvwf)jZkf@$Zks3qS^hYl238gt!{1YLEILmoC2-dd$Pg&8|? zc1KiVjX@GM<;GJ~b9OmyTfg|Io8LT?_POBR#BaJPKkuDjs@lT+O)2wjOMEJl0gBoQ zt@q3`qwY>^8%|M{^l8SUVw`cX%^Z7Sx3%^Ld8XPS5H+bZg0za-bO(fuktWx~sMUnOxz9h<^n zVV9qEdTTH;orqyIaCQct^x`uY1bBglela^jj6wRrp%RGEaDN!{cqM^#H72U4xaBzX@H(h-vNVi^LE%sT?x$a&q*1-~mIk&24NrI}Rge)4!Sia=|d)MdKvfoO??1h-j*F@I zaBE4SyCE$9QS#j3YUg~R8YoM8F0KVZ-}I?^RZ_T1tpxe)uBA{gdF^d0G)X>e8>hpS z>T^m2c}+dg^_)jzDm-^gs-f43L{ZMr^uwhaecHYJ#kRG^p4cW_xr|jKg8MF+xa_Dk zK`_s#Tb^suA)zH|wIXEhnMvoVvJeqz)r&;Zn+`4@>M1GEt{SXB8W?$0&vpHK*zUH2 z&OwZMnzX)!udOy*ysjedux3K_!>zYkzM=(ckRvKqMNHFMaV*rM81)m%j&^VXS5B`* z^m2X{#3V~hkan~n$^Ks-c>E0YG5)L2`aqUna!NZJb@0WijQ-DQ(S5@s?@DTs+%27k zpRrw^tflf0Nt}p22IuvX?7*d3w8EsMawN~!Kjj^pp9VcW;(&#lRJeTg>&F-5AkIKO z)}Rv7ILgwqCbLiC^H-A_y;ErN&YNUnot+Z8N0FF{jv}Awz1N4k`_){3V_~sSB*J(= zoEp3YJg1q_WS1pnZ3Cs2)FSwfP{WCtCXcE#-S)^1IWTwbx3=j+#zRIfAw?Y6M=UeO zfu}W19UJv;q(KTM6_Tm;cIezg#@qdtkv{*Y9SgwC!ut$P*l$wMQjUx*FISIpfuSdBCSM&v* z{n|!11C%@Z46q)cJv`~x=?+I}ROihN8y@(BJ|jiY6REVVdP)V1}{nn7B%XB z29snV_Jq{No{7t!a8);_ewBd}dD00@x4dI*;3QfL#ZkZ}%!0}Az{S;^ zKuxSSd!CVltQ7i1PMGHKAGp~k?@ci5CcrG)(du}wPoY$_yph=Oy_x8r z11*!3B23u6jG2c2bQv_fIF{%2YApNVz~iYawx{T=HUDbiBYZ$3GiAh6sp=Y^-O}_j z_5|fynuEKC#e?>d-36J|uF+x}D8Cf8o=&B~7Oiw>+dH18d)J{S(i;B}ZZmN9>=%7F 
znOW0mYq4oZtVjwZ6ic1YE6kD`5}YXgl@KK zGMgZ|-9XMry&~rU0bS1+PlDYH`IRt%*I|6+liYb=`<@>v^Scr}=_HfO3o* zmSKI?i3V;S%8F4LYdhGazM>LY~SPBv124r)-0OHCx!eZI=+(5ju}D zeT5?0w`dsFC>C9#t>&^C(>a{fgAEbz%~U%6C=tlE439Vptve*y2BqJeJ9#Qe4gUc| z_Zvp$Azk^aC0v}FsYS89}PV?m&ou}Uf zR{xHtv(-sUb@B;H87S&;EO=bam?x0-Wj)LZayQR?nOM)J)<|lf_XqIOu1(;v(DGg) zOh+o;5xM(iuUW5%^Lz=!Hgd3kbhI1&L0&kRv$jQS8r7I%$1; zy%_jlQH|`|Cl4{wTXPjKfYhM;Whhn7c?P$dt(8}de4S&R?!}bmij{kSTq!p_zk8I&TY6#=?!A~zukBn#j)xfA&<<(t8{Fq#QBnOnY z`-n&mEa8nUAz9pRS={bfSV6V_{AvBh|Id&D|38Kl$p08phyR};b@B0xDqpTNMaA|-pt0?3_Xh9p$m2;r1^gvQY8?WxkVKfZd%A@RwTOv#&#BPjb5)ZE)Cq^eBeTw(4}${pcsU~ zPv5Yq?@AzqyERjwEbJ`b2G=#0!f-lk-|6=Dw&%tt*7i3R7EYinE%lIK7?g)N`ho^$ zAj40V00iC!(9Yw3h~I&)w}zML5Ah+pg+oGGiNNx8dRz1Le+()8{|u?q#JgW?#2XBb zPE_Z{R?v-J9>MP+xqnjFXCKUVx^TYv^zr?mk^OSN$=vAu*#8Nyv$(mLtGBecx`NBa zd}97H`T0I#3F;16%g)N`QdbWWqzIB%Qbu_I5W|=H@Pd82vwfR{jh_g47V9~CKe1#y-1r<&An&*(@VZ!rJUHIN5}%1RKx zQ&SNHnj5Iz|Gy{!aK@E&4|D{FVn*2jZYPcL6(E9{*D;n7lXX zV_$_!g<@i($?x2uFRSry;ka*=qi>Dl@3h3w%&M#0!18a1kMEbbt4jN>ujrTAs?3WK zgi(Hfg`c0_h)WP(R?bJkmYR;9-zxz9mF)*BeE$L@x(AcDJ5>1eT2jNq4r9I+@#F7{8eN!+Ny3deR(Ae|u zyk7!UJ*dyn*ayID#LOY_gUZ5#)5AzNYsM#hpfmW_z&rBc;S*^U0HzHRd;I;A_>%D} z+@3g#p^Jb~n5i2e6SCL*27VHoe;Hhd#BB-Rnmyrsr@XUX#Ug#He@A|L6Msy6_6^+; zKSN_m>v5+6uWKN&)Bnv;_|Hh=5nAcl8}Vmt*X0BMHuy5b-ui+R{?qXdt|L_H`vo6( z==TZxEp7In-!x5(eU-b?8v*hi^WC`=ETWW0yV-&Xt+uUvF+6(>M9lPi6Ft1Mf5aTV zXQVGa2<>F`vA@ba`b!A2yS~PM zwwDzCTeV9BATA-Qz*eu5Ff>nUi4_Msl2ZZ^5Ai&U>U8&OiPU6ySENXmfIsoV8qqy2 zXqZC#ZoTZ%sTkg$kB|?q2oVPfjN#%`+9P-U%@opguBJoOG1QYWBQXV1&&jE9$iF7< zKV3ji9Tz@je(@2f=Ndg(QxU}~V}}CDO*0*LC@i?iw?I zJqgXHFDq;dg6j57oEV?r4^};$Rb8I>98^|6hgQEbF>Ep&OJ$B0uiAws6Nd|6d`*FB z$hH{Gu00szw{|Fq z7(f1J7kZ4itWYlYu=5xYcGhKM|2Q*u9TW2N^WJUoUMrqXm2b$*YeLn@q{_Z9Uj9jH zA-PcGjP<)ZR%;?~&xZsv>D5t_nPZ321&WW6*u{ItN!#GTK3vXLbmPT+e8-c*84t0p zFd5r~V0Jt8ZxnHra||luQ1w*nSF%!xOW{K)&`4v-1Bn5zjTs@~#js|4&z|U{*nBYn zOymN81q#aQb%p#JZlPaB2(Mvx3A;#Em0-+#f_8zTMkXhMJ&3t?h~dcbyqeI4UVp^J zQ3JEWk3^V$KW&PNKNeCj4d^BiGK*pr&M-d(to?r^zI9lJv 
zwO6)wgfJbkkAFjH#x&|)mqiFz|IiQz5>~S2Vp_VhIHxmcNH3VjEG-x?Y$YArEMHyA0WJZ3XgNVkfA1;h*lrk_L zvcRPIwpG~tjNa6YftkVD>I$=bZRvgT>~m}N-8={P`ww4voC*KMy_a%qt5q$R#fkRg zC#+v#s0~O@mk091^@>F)rdn))1hQplYg$AZ`Lakdn9XP8=4oKflT;4yQYI+zFD1V` zF8R;z&*X~x6FIjS_ItyTd@G>}dNN5dI=S5P(4%2lieDMrOW+#DNR(--?@NaQ+m7fH zmkK+<#Cu&526>~dMFXb#R`a~=pb|2`aG0VGD8|QGi`RJLV7oJgvShAL)MTmXK{mlu zUEWEAj^WePrYa@>UJ)y>E*7^0dovZ4bV#7akPkyZt_jXb(F#WC#S^WREBt*aT=rwW zM@wa_42uyxjK$g*IHKx8725fWNr1^q$-jwHB!nvGcvzMx8kl(b&Cf;9>in2Hxi{Vovz9; z5pA65A_~MX6-~#}3pG(S4}??8Ur#nTUixd4S~o=_NfJ^vQoP1kFX78|4Md(kp6&L1 z&LIc_DN^m~P#k;%)g+fBGN6^ZkuK6enXUW%4lcEk3Z)DVV`~dL z&-tXPBC^r7Y`R#pFt&2{91t9n!tW~s9g01tR6o%D{&|h_)`T+rid@Id``pu!P9Mu8f;q!2Wo_^BPVU6|&)eo-h)A^!n9FS<*++VHXUdEUhmp2~a6g8xOvp0)jH>7$h zM@xP_Za(M<5HqK<{tRIi;m>+sKnVCDy>WIansW+i$|I!E+{^r#^kU)#>at7mh=j_pAgP$t=7dA4ehPX_1E+^=`Y}DIF~Z zEkB4`+9nt3W)Nr1Zb_inon6?!xIzjQ?rnxci1OYIKrZ4gEGlIZmJd*iiKwx+${pzO zQC4+JqRq1eWAhn6cm~4SqwpYT^b%-95LyzF;>oWbOAx^wYPvftSvb9&csVF+R&g*Goq*aGc}I zEN6TG3bb1KXP*B3kx1j}<^tBV2*f9;N+DSX$cue`h_OOW*PNss)Fvi6%7U>j0Mt%`yRv3*7SUn4R(9cQrR zCf#kvn-UdGKLd2N5FE|Oa*mUz_L&igGCF4f6CEV?fZA0HGFL9HdPbqy&0`iUEN1t- z=8D~ka>jY*o?pywqEMKoGYB|zhbI9?c-%A)!G-5L1qgIxNEu6(1(BVRzfJG` zfd+@w#?pipDSD9{8)A7t%z-A5!jou&LLs`~u}B3(1$_XQ zWuSP+#P=~rj?J`+@dADdJ9cRVya64|x;}1rvb|0m{5p*LJ@A-=nbxFzoaUT?*RgN1Z5Ke^< zL-EeNE7lNAtej`35b}kqF&ZPo_C^ze=m!mUrnHuwl4K1VqcM#&C2~#65sL-3#xT}g z&!Xq4X1EE!tA`-YKT~{SVnYxbrfzs;lAU7Ke0OZqk`HcEDkHAoQ6Ajf(06O)9CN1w zie;vLypH}2vHF+sJ^m|CveTzN>Ad88@Q?#tT+T^c;2*NBcY{^sAC&nW^m*llPJ>{% zw^eJS5xr`@T6Gb>oq5CfdA=f`?@U#87{cvd_2+e9MkL1MbZl3p>#L)GTGrHmma6=( z7Bx7~=|FSn@!yoL6@AGvc2IBy3F+DiR`_nvP2>T@kP|7tot^RJ6FSYlgr>+ibephC zFR~3lvY=r9fh*YSs9Fg`Ws?T!EeMD8g(79shP$@z_;#dF!KH`x4kHWzx;|W)@%DJU zfOLWLHUFTrM3NAHu$X-B5A7Qk43RKcF>_5p70!@O(7v@5sty?Td?v!uj%?z7oulB` z;r7!X;!6>Lvgkr?azeFM!(HcOT{FvAnjCA0N#1_&#m=Wja?A`0KXj_7;rZ#JUI2Lz zrZT;;^W~Ln%e7g8fI~TOlhrV>r@Vu;J@2B&@l>#^K_}@T{@~$>nNU2+f6|CwIP)kI zzPGeZg?2`A5{q*>;I#dp%pnDv;EnTC(1u6~6h?`P`IQH{a=OtDkE23r+;TZwjaPdk 
zS*AVYB5C7W&p?AIYMa8OPTJnii4*|mnXTSxPw{t`GkA7z+**JYZ0+7`f7j2Hh7O0j z36ydJ;iV!ONzV)j(G(b-KzQW%6ky@GgK$s*~=a1 z66&e8#_>QFi{Xvuu)>=4u2Vaq_9bLt zH`Y+4e%^~p{aW!<(KWmv*8?7%oRf$1{_!N+8wH+-x8GbQuhM}xS;hfoKsK4L*un3$ z45Kz*7z2aL4kyj2iZ@xETiBTm=$3=U!Xo>*MX51}VSob095KG^bfR}x{I!vLTj^|w zZ(muWDFJxGNwKY$Y-j7=9cLfzCr3?~Z`dM%yHlX4{wl7V!}ifeeth3#Z;owxByHbx zsJMr5D5yfh+s)|aa>!bU^H-()p_cAb~_w-O#Q(|3X5OvWNyJVN@mR~KKlj@v(m-vq=;Ed{tz%#t9$D(8pWQj;eLDJTZ% zBAnA>y{7`B4JVO|eh`+A%qH{F@~*o44T;26WFP8F3W-vtp8Df_P-wRq9>;g-PlRpn zYV9S4hvw5wOVh7VbFgG!=Ll&dI68^?#4~a9!`!#)_vwfruUInWWMDfKyriET)8t)R z-8sE0??0HONQX)76?!rYJs4nk|H1i%( ztZX_q6dRs&dx}#LQom#o12Yo*70d&Al>egqRY6IjNU|OrH78FG@!g+(e>ri9wl-G;jHD_lrpx zW*M&4v;>qi*5c&PbhBdy9gN>zj9mt{Dfsj4#cbGh%J4NLul5>3A8$5z^>1w~WySM) zQBpqjbmATw7CW)wr?Z-S1b&Qt5L>%<4=cR709iAi)X>GMJcR5$=W_tKPufKhFVlXnYEv)G<7ECXb7Cj` z4{cMIfZt(df}SBdhe2YR z2hb%i=&By__*+W+g<#(YNHYxD-{6)vH?q`aS7D=lhAdF&5#>DTWz(L7D4h(i5012WER+&_ ztRiw%mBtf)=v;4WMeq)gruB|GY1zX{r4*?3(j~u7D)!+ajzgKv$Sa72f@WmfFs#mz z%`$!CyauSuW$u>(Z;N`xkZ33N0%=*e8gp;pEMLKpHL#&pTSX_7;6kQvu9o#Pm#$^P zhbk&XXOP>qscJ>cB7evyW!&b>TbXI<3(BN+Evbbr{iODK!l&|k1{Q*c~O z^&fp&YBX-uEBx-=_<6C}fcygzk@3)B)q^0MfQ$9%mL5I>5W|{;AL#q6Yni5mhDbGB z)>SMaKZ8?-vttKzS7R?iqTK$r!2Yzts>H>Wrene571JOgp1ve%sMQYE5^7WIS7kj% zg8h6r!esn!Gm+c#_@%AVBL=5@O%ZyT5f2>^EI-l%yT~7_$U)u5n!=1`zV?T;PTirq zN9=T%$Rz(X;4H`=y&})PNKXwI@xNWLj1+P==^J8k53G(1igHQ6^#&RWF8(E)F`vLH^wh0a!$P!KeaYl#pB!A^=x$Gf#-D|rRd%sjzg#-`PYHn!n) zx4CR{N$}xl`n)W>6{;$9CPJokb}^FlytO4I(S0ygs5WI^z-osxLLu8;{nE%{ld)Zf z{aJE=oI~o4e7zf*k3*!(t^09s=ke8UZ0l1+hXpna{P6TF^`y4#B?`7&md3ZZnpmc% z%;_GEm&>KwyQ9|JEvH5&lOH}cn-aHzeM%wvMZcy}?0-5d@rr-;+Q5>Hal(2!Ol6Ex z{Nu~1Fr=O50Y9}XK0v5sV1WH?|78v~Bi)4vkf_pO$~rC-lFp(M5|2e`P713a!i5T2 z*zzIib1C*-Bjo6uU4*V{A%6A?C3`B}jNoD-G;sWUjO!MTfa);NZ4~N@@Tnx%=izgc zx3WgwlUeHdB}md6^<=HOfy=y($TFLTlIh8FomCz{7_9y`d@Sl2<6ZVa#MrZeYhqy^ zz(B}M*D$2vl<*l}ZN%`5AHDP&8M6+oTboyR?osF zc|2RWTjz0kQ9RTA_6ezxLU3X-C3m6>e6?9bY#oxM26Y;nuHU(yM~l+wCWNwg#3*su 
zLKi1*`XG+!QmD>A_-Zuj5)tMZgKbxzQ{NO^mYCm#bTGR2^b;?(FQs^cCfxg=o6+0s zn^cpJ{!&L`@|>qNu2+Vn+CxL%Qmh=F(g~|t@LCJ&ERt}%WyRodBp4;Vvcb{>EM?oy zk~(!E+As3M({Aw$P}WFT3p?2(n}4F0SiC@Vp7X$?6;QGok@KP;h}Bzy@ybcN+M>%q zlr&QNpph}=ON@8Fx7+&;7~m5B>-tG;rMZ~ISBqsvGO*-}Y&Uf}z6d zuZe{t)hS$VY<^ z_G*##($gyYTaRR_EKfSiWb&wGOI|SKm9>EAqNVcW^uz$4S?|m84k42{KpKVg+)Ujm zK;$!OtGF&=2lj<0|GtR}mxVxD%=!>yoL z=w6I556L6^h_3UBZL)5MH~$O3Ee4H51>5m4q@fP?VhFfk;zwl^w(O_<`F8Yu328C> zxVG<;{g5F0r)hFff@@&DABWH_%Ow5t5OT%Q)N3wrck8Hd$6H(sNS!QjN~A4WzCLIY zPeAG}*=7bJ^<1PCL5p*VuG}DWHeh*ZIuL|9piy!Y7RjF87Muqha^-$i zn;L0S-2T&wx;q%m!ELficJo@KZ`%qvgB$rdtS9)#V#Uq=hhyDYJnTV~p+-2qcZLHY zKClpc-~yt*c}X?YB)#5gJbSuz$0oL$0E<8{}NPpUC?sq{TjD=Xi44N8SQsIpaTSjwJ=x;Lmv z^&U|QfupUxAtG(UmVvS(O)5K@q!@s1>u(p^_}5}XnD@b3hRDlG18qs86ywGmm74Mb>5yh+v9ATq?4 z2Xqhiw`C5QX5_*?6k5y$pLCeHBo?d5{{z@Sh%0y_G9aJj2&z|Z?UMx*9e6Al-yWKP zV$v?17!+Zjk3AJ{-8o0n=O6Ob+6mZjgwUiG{Q`gliWbNYGi%fKM-wy~mT=h=$|63a zHu79->lzkU=Yq;bw&vQGh|im|dG%ZoCR4=*+EuoVA!H`cOB7-K#N2)q_NT*sR!0bj zN3m5t2}{qm71c&Z*qUpCnb#kskdusaJ&Y0VzY%5+k{fZu4Kqaj>1yj1KD1JI$49*L zE^7e#23APIdtMjrGT!WuaZSbz0Z-q{kwn_2QK6Ha+WTRn93PM>4 zEBS_#A-_nDwjf#Osr)pQQsL8MLXq^az<9tSk-)5T9n*Qaq76H5O8G0!)x>%k@6rgx zMw9>o)Z(9nV6~%)@m(3&n5Jt+yH#sO6BXXQREf^r_#g{LAvuS5(=@W@bVvf_8X=j| z-ILs+nFe9`csQlDesq5f|4pHJ={pbV{{CQbtMd70x(i_DbK?yZFZgYn{qovI8w4c( zX*-=P!}^)nT;`i;!MF!}XRz;}Zf7MK_kj%5;%K;~SJdDV(`!@hrtngVZSU(GIEV^} z?$FY%9f($wSU`X9F`3^T{c?29(y+^dOiX_gp)yX+lg^M<2-=CBhDib$nN1m~d zK;1BTLv&zIK8VT(NscB>Xdd!+evPCK+2PbMa)>T}IFv*t6hYHc z!iN>kM++Nh55HPSm2^fjI@!D32{HFF%iR%=_6J2O)BW<7*?H=I1FrJne3!|Zy_Imj zlOa%_x4jrGSY{gr@e-dE0Z*oMwL^`Ut*}B0?|pG@SaT6bPLe{E4+!k6jyOy04TI=* z5b&Q5FhXA;;)P34eNJA%Tde;=ss#O|b=i@s{ePrX#vP0# z8D8zFe3yescw$Q|?4y_@h_)*ev*1i{O!aGeRQOkuc|i{%xCUZ zt0$)gt4W?#0U?6JKvFN=*b9l+Y|*wA8a@B=c?<96rz2)GwCx9uOXCmlYs12`qFe4| zW~~!t!E7FUU4CVyFuy`>lD;PsVgMU8lj{|ZlbRAzmGPeYIKf!)4pb(A`jE<`QO1J< zDJ1m0ejEYsYSI1|JUAi$yR8LDG}!0JYeq1LXbHlG$ctE$+hUzZ#1xaz?AVV604-J7M0SPD=EC(+B^xf%Y`MsRvxqn#U) 
z#a4?Zi=Oiizwi+*Xi*%ZU#U=%h!4lP2wrH$7B>d>7G8M6N|^N((R$PUVzT^@nLmCt_Zlh5g> z(FDKw&p{_ycSY=z820)~WhBfFdc5$o%kFG@IFqm_AoAO;n7B%=l6rV;$ZHeba+|j5 z0&)C69Bap`FdS@-EVSQUXOF>RbJ2t@K5dkTcIjkp!78tPlI&Jf`{1xfZ?l4{er+;t zLw-ofm97G8WkC|5K$b8`n$1d|U@td>2gmjw1*Q@%iwC#jIC9yU2V3&`>mX(NiG0j# z9EQRqaPpEjK8Th2Y=Hb3#6s`+1SKtd!g3TG4h*%!7{A}r*&^a5sjNw36=>MTbK}725P2wXt2*N~^Z*S_^5$N5P;sLVkzJ-b4xWmlU(968$)N zj;d)b)ShzR_Cq+FK#!v^TKpA*H)hQV>L-f@U~s?OQG`T$pV0j5p_6PNM4-A64^St0 zft+j_-_4&dh9r29os8h`k$u(QG%AFGD*QQXIEX=iy`kJTLzIs0P_&=!oM<7UzQK%J zvpiwv^;Y%3>5HxZUmwwhSz^0!ivhE=L4^Z#ldI<|Fm?7rMDyWolO+G_p7Gh$Akbe8 z$T7vXl8x43bv1cL6Oe<35=?gF3UQtu2Vqux(1CdVZ->sZ#e1nJ9*gQzI@t@Y5TeTBs?5M(@v;mU{MJ0H7J2 zm7J9YM8mL=kz*G1+e_VG1F%6B!#XVt@(BG#sF5l2|32GCLEG~=39SDWi79gcj`S7- z9?9-ck@c}Xevl5%tCJmQ``JL5NIbQ!^I%O7xior;Ti&cn1febOi2erFfnN2|d40LE z8D<{rwjhBn2{SszWw(bGJz{9QL*u72f?ZJiVq5st9#nZ>`AE}!y>5jz_g^t5Z8v>e zV?P#hdh_uvD*O2rS@}D`zUTjS9tKD9pRk^JSaQ{!p!qGm|mr++XWX z@rG+n(V=mfyAlgljm8=vMdLZ z7P78>&4J1?4>=wEEz#5cmvNQpR5l##_NC}?mchQIA^MYrzg2MVQ0IFJtyBBa@FqE) z!D5+)vBiEiV4@2{dF>K@=wevM=9^FJhoWzO6Nq15bF*2fG!YTTLhmyL`ykKdh~_y;@uE`Dpzy zX}wrz?cF>XsM2}NP){%2oR?|ByIyNdxswFg{XitEbtgs6tSL9CK|T)28~SA~@?aRO zzaII`;i7*(hm+~(K!%og9*By%D~!MGnIp8DyBlpRcwq~LJ>i`KC<``ZK`cP(4CI*~ zUme!d6q99_P9-CN(@F%kDNuIx{n^l^0u*0;6tC3K;b08#q;@9+nS^yzL=QXmC3!}? zz=0%Dp2)-a-O4p}Z9iPm(g^;XqwmG) zqvu%fWTxm_hNbFjPB#@^OidH)zJG)(!-&f?kmj}HD?EPz>Gln4*CUNRbn@~z7F>7J zMO{;Vo>Xik$INcAGf_u0){ZoBuKT+OGS7r>sh$QEMN<=xEKtMWcW0gb6pLa*CQb9l z3@3pCA&0V@;B9?k!X{C|+yw3S*0!sCb)Zw1x=setrEn`+M9r-u7q{&qm9u-_7 zh#U|T1u#{_wr1t##^c`F)(kDg!j5V3vkuf>|0_ukw(!ZZ_>4;H7&qDnSC>EIC^%z)EZp#BudgODGRCHtWRsrd0gGS{T32Chi>ySOglyqesDLrZ|LM`DZzso(? 
z@u3Hke1Ev?vO)LfzHMI-hB4TxeJ1grCd`qOje(e#xUL+(6;L=0U;L!9hfDG*JC`kHJL4p<8D&r93Hxg@sI@-%mgBEe6 z5r2@m7$a{1Cqi*)EmmG0(b%k?Y}c7auvSm9F=}xU*_U$Jrk zuO}Uq3xn8e8{E^#(TVzZ=~ae!VhTwRh-E!^E<5A>VV79NyLQxXw=&h*YtzZr0b1cG z4z&n;>4gT>6YJoo6C??28m2_kU0sgJ`+t%WB@qSeA)g4U?u;N&G0W}2M}c^x%t6jJ+nMQFbTm<_qLGgkP9-XwHDngYiwFjLK$KJLbQMs+e1UO^{1eS!+~rDf6!e4KDU~ z9(C0d35rd5DjHB{r`hpX9)D4(eWTIm=(QGkYp#Ae2HqeTI3kA?k~6$_*9*mzm{3A; z)wHdz;<(xBrm#?hx;JgNNLXQWW=g;?AW&n%hz8~!?ggw$3zHs|YCBOcg+9IlYjctYcDP~?GBkxH8Td2`OwN&3rp|j$Jl|dH_ssJuJk0N;cfC0JdT+<}*4RUjx}3yQ_(jP3{$IbqdMaHS)ajeuej~o*!cv zrEU+(0<4(bIRX+|NShQRjkwII(I*DZ&=76`zD4My`<_GnFn?$CR~4V1$z8H4<5oS= zq+)W=wLWcBN?XFWPMoJgg`MeeJaK8IDWJ4u)fsVA2ujLE99p)s23U)~&^6 zsGD26P2tA5(DYlsijg?_5w-6}lS{?b<~TnNNXdbjN;j6l#jJ*@KQ5OtvWQrN?0LEH zcbA|7(bM5NLVwwO1g7g43MsSXUpZvEv2^xdX3BVUd=ML|cD;l*Ixksh>n-D6UGB#b z)K8u+lJHYq$PL(izZ!@>>f>0M8F#@ZS=Q?vJ07Y7ez(i=e>>JJ`DMf`XiCz4Yn_QE zyG*Ua+TIHTYrJ-Jz4Myv74)!-;I!jJ$`I1 znZZNyVoL#HBB4@I0k>XR!-3xj8hu#yW{VkzG1*Zv&O{L~JG-E9cEwpYl-LA%WRYp{ zaVP6dygi%dz#|O&tU|CjZPFGoJzyD`i&V1S()WJA{Q$gjkRtn+K?jfDW8X3F&Eof9 z@SGS(zkejiqMH<4X2b9G4wS3}6$SmI?N_nj0ZHC@)AH}k>h-aWaCJc#O-N(@-X(qw zwvhGC$e8Q>GE$ksm=1Q|_Lioc807zB99}i!g=M8~^ZW_UqACN7ixPDD6&l|;Q!j+D zCf3%A$(8~C*hG~VV60X@W7SbmJ@cJ|egjv-`+u|4nJ#?`so?l3OfF7(+F1WOnu^?= ztm4=~{+@>6edXSW4RPHvW zsy1zc@HoWVATIln6ZK4S;8bO?%q-!Sb?fa9x6`osigZdN{V9kYoF8AmJL2y4HS6wt z1AolDmG2;nO2pr9_{$tU=yJ!q-HKN@T_Qc){(?E&r=z7eVr>-`iUcHl`EL0sCIyKx z@Z4*dz8M@Lu8u8#7kus5&-~6LQt{ip$Ef<^FxgvMIz9!0biE~|YdOZ}NuuBI(Z8vk zjpr|zp?}@6qFGeuV)A;>X()qaNe&guOMlU+-1K>5ypp2Bh*@4hJ-=>|F}1E89jMJ} zqyxN1pFs_1_(B+A%w$VQhdS!(Pk*SmqB4Ksq>%=X!?vG)r7+_)iQ(JU;_wDF67+v$ z9u^@+0=xi}rOF5&lmv)gQQk?vT1b4}ktG%%EoOr5Jlj+6QR)>n9NVPTR1Bxo;C~{d zHv9e4NkRR!(BddnE#9MIB3=ovXJy9~tk5WZV>J6>jwrUkY2q8fB?zcVNlJsP);mi~ zmSGSJ)&r@V8=_$$ndOWm7%$%|N0+*sV><@5oGX{ZeNTp+v~Bv4O%0cDm|XK@Au^QM zZ(LCjFp`Zf!sUTu*&+km23DKST(y?j&pG+6{8T9?=v*TJx{YV(c>jZwxChq%2QTI4 z0JH0oe+DixFfuSQF*G$aGBYbMF()uECn*XqO>bmGVRU66C`39kFfuYQGB7eRG&M9b 
zGn1H^$N@ByVVX66iy|74`UZjmv0bNWH*Gd{vIPtu6q&Jw+LBhHQ)hqu&X9^DI}M8N zxd+nZaQJ5CJ$a-MJP06-L4XuKNSYI%U+IG&QzSr02;&q)K!%9?A`oGi0Im$aeZIB-r+9h^Sa3h*jhDhG#Esa2_4h$!B*gV(Bmt+q|BS_*CqmugXDgK7se zj|_2?xPb`4m#G`abB9nGy3S$P``tSZ0ED`(nn|7Iic+Qnj;6J0N~*(b-|zJKL}yx2 zd2DcSOo_nhDd1RA80$5bDI{h|zFBCUpLD;nC+wD zNIev}x;`I&%o%DH$`F2!|25{sUKqL&B);$Z-7MqSrKi(ld+2#DMjX<^90YD)cT&0% zVsnT0sV8YAJmI@(XO_L0U2K@=Ro}}`61rz+1K&UErT3%6UK{d778W&#IdwZ*3zrsq z3Fl@ra5jQP|8IyHLho_fZ5ps9Su=%fe_y{(6z3M$1<4W1RGcWMi;YctP#?TG*X zBh}_jQR}L){or%9p_8-Qi>r@=x5LY8;gNlvEvY$KhkUIn3%0s#vaQoA?fL6h1o!m}5@;u%vmj$#8)R4Kk{Kw%Ho* ziwA}G^hS3EwEm1TWqk!w%B32&e|1f z)T>4*)ZZ)I;sbui9X{f}s;S;s=EeopEBs#9{;3Whhojpse~}K;>8nDJ4n92`eoPcb zJsE^Jkb$>*8KjiIE(41&GBRA?XZ%7!+~8LuLv9(i+%ln$^-7g{R`-NHk@iFt(?leH z9q<$Wm>yi~_U(`qh;ByjN0>}ewj>hEHqS1rHY zjq9|!DT};XP&VD>yh!^Q3FVo>~ik3erw){=t;HP5I!pyKZKlB`C zBzU!S?3Ld4!+&jAd_L5DU<1*9aYj+5Str_S%H|1k?Spp&(0X~0Yq~!cDFJx_U!<%c^D_IU|^iK yfB^tDjtV;pWo~41baG{3Z3<;>WN%_>3UhQ}a&&ldWo8O9I5jpf3MC~)Peux6mg8Ii delta 38783 zcmV)7K*zu4uL9e?0+1vFGcX{NF;yp#P6&VJSIi>`d0KtHJO~IR%ZwA)v1A$!kmO-S zuBZjbB}EdF<*(1_+f4UN&n`t0NFYEWIkjhM>Qq%%SM@CI&O6#2AJ9F&8Gh1^b;1&5 zO~)uFoT~2h^5A0!&~X=#2b2hBRrk5`V*Gnk{A2_w(*N@B)5{EcGrOm{Kbg=c8~lH7 zPK~!=kLrA<`ggN+@ImeT8qi=t`YklN;E{GIN1Pg+IIl_$N%ThSz~K(AuiFJJ1DVif-b}*T2KWgMwuF2h%2<^P@gp6WAjp5AxnqNOKk-${WUMO6|{6$qte#c z(p!y6TZ4BN;3DIO!f;uN>)e0O&uUAn+S1Yt@rPi<$9gAc;De8QDT&r$C&%te4eQdc zzP_An>*LlvC^U)~U!+l}3=0Zho*X>?fw|5Rr@8J<-ggO;ASY+tzYf_^FN8S!`^%&M zoc!(J^~s>H#zNhj@nOmM(e`qoXAmN$6w*;z!mN7&h76jtlb|S&xT=3mF@q8&a#Tws zC+M1GpBRh;S~MA|w-O%B;3@V5JUM3sZtbLr>rdcraj80~M-L`5Og zZ`4u|E!7jy6jTz9y*gyBaBZF$lrUw={xqXG0Z({l1|xwcPpb$uyz`}(J;}lp#$eC% zD-?sn(#p)BgsIZDL9~BHPl+Z9BZC%EfxR9j40+431fGDWRM>d@maXHOYVypWgh@R4 z6442KLNrqt7_@jwiOq|LLTi|Q0-j2s>?s#2t=6{9GlLSQ%-iD`%L#bGG7%W#KZf1k7g~}5^855|{dBv$fHKSzdU}&3;AymkY8-fYoVRU>OMU~$l(JktQeKCPD zCeTqcJ`>c0Pc*e3ZEZ3_QKCpx2FUF`) z)!Z zp{hk{w07sn)t;i-6W79chS}=(?qHj!dn0bnlQnPsahiYe8(x$iAcKfPg&2<-I0~8d zLpQ3Pa6qlQ11%tQLeD^kD8B)F43m^dYmJ&~vP3ckfroL-Nf5KSE+(+mac3e7GqZ#; 
z)q+ttQv{em`|fNQfF>}EXvQBOd6blx&8!wJl(f!8U5b0DGILk>sZa7~|I2E09?}xN zjDNfhhvR?A$HBwaJJKI4PP9^NC(=X<(aT|u*K7ttS3tQ)4wl5ahC|RmP`~2_zo8%q zC+AGDK`BSAOx`t6CFV9^?182fb%9kXoWPQpJrbjn<0^R<{it=k5m2~5`=%y(*FYI_ zwxvudgB`Z%UL=+BuEKw&LLyg!kkI}J4L0~6#;AYjbpZ>F5>2n`6mhIgi(FC-uWRjF z?C5rX-o*GE#kF6PDLNc&f~8c?J+CB6v@pr1l>5^wI@hMc_@Y&Ey(PxA;^@07DKty2 zSJ2jFo?Z z;wF-NEz}uhTfujXOPk!Qe1GE19EL39T(@?;c}s?D6w>%Z2gTC)Q0&C|vK`xVl!^KR zjy=OD47v+PnGuvRp(T5{+Ma#hoZWqhGxlcde@o2T&u?dEoAvE=tCmRgBa7x?7zHKj zI_}$rcxLv1_S5-sAF)A-pinQ(J_&!7L4oa0>&vT~FOll$)%6iKhw19ys9f9C-R9E~ zw}Fe$NU+1%M9HyCXRfo6v=lAS_(RxbhlaGJ+7jk8T z>r5_y63wFUeZUlizTc#+VYX0)QxR+#hIcesHd#O1*Fyi28J*?>$25a=Eu9}}&;q8z zqQ={fgvy|hQYRm7)~mDI7qiTmqU+2^Mh`Z>=_Z`}MA{D-!kI}>_BPJ{PeBon4GI>7 zm3v|mF;z(TVn_ys?TCnbM8tnim&pTr?Qd!Ia2H;x?v7LjRiTngp(<29eD%y?RrluK zRt&7fkfZj*Ac3kF*u@x(DxWY0_w48Y_rcLb)5it}jUo-lfY@l|(#!OO%Ak6C(evok z@JV!{)em1GyC#5dD<=M|Ub;OoNuVkwf-lBo)cJKWf&a9QtV?-r?D~JAvm5T-);iKR zGyCzYnWbCf_dUn2d(OWbTE5Z^FFG{>Prt5LpMKS~moN$s>?Ja1;YDsl1VHfe@v~nq zuVN#P4$8l4|1S39uQx~R@EkolJKV<41rbD!ef;syzr_oX9qxMd>{(JqF>;IV;>G#J z=G_r$=Hco1F~{i6q;G%r^R2%meLrsf9B{Y3@K1Rg#(^OJ=lW_}iHdt(tKOd_{T{Dq z{PZu`Zbpy)`e(AG98MTlc@XnC*7;!&%Sfe?{p#xSdULV9$()&5oriht`48NduT!zq zq|c{tSN{(YZ|J}O`TN%|e>;wk=-*#mueX=i>9fn#&FP01x0rtmzr4HIoSm-+`KLef zpSQPbo^JMki$acFp}Hz?1#o-SLmjU#731$VcdLu@K)FxGSHw%`=h@w-XZc}+=2h>6!VTF6Oswx0~5BKV4s4pKk9e9zp%$Woiq)e4{rbOAcM1I+<{5e6ul*a81$#t26NP`9#n`CG2$VBzXv;tT|Q2yCs* zfc7pQ9&YyLKxe>5a)6qQJV412X#cmd{ND!jfPeP}z{1G#pK$;7{wt8R{lA<|%*-6@ z98K)KtnDoU7S^^vfRdy16j1?~ zd=UIMJr^@)Ye!cXMi*<_zbG>O73QPM687d|4t91xdsi2@zv>gWb_SY#wB3v8@00yx z@8DtY^KW2bZEtSz7YTDWMYbQ6LjQGDzK16VTWR^fz00%QOGZzmV0O$k&dYV}= z{S{t+-OCa9FD1)g;tvLXK8_BK0E-V2KtF2>;KvV~kBf;r5a8oapOn|*w-{6CNSzg+(RCj74`|Fq>t+hQ+*}=v7uWJRs@NqQ%OZTx_ zW}iN;4wsLS{Fe&&u{r;pQo`QM!ThfkW98rkm^eF|c)@)f@(1Do_^^EJqB+p>UqcLF zVzhT~{cr(%(DMUWI5@-ob)uY{04C9YzeIl{E&!9*AH)M-lK2neVgWEo{Xwh%Ch0$j z4ZtMx2XO$HKeZ*A#gFa%a{Xy&iCiOq)BZI~t^pQd9 
z5BkWU{U5~hp)>iz=p%}W>mTKZsp%i|LD%d*i2cL#;{vn$<3Gv%(mT0*jP!qOJ}6mOyZ>=w`AhH*{YL{V9Ne7$k>kU_@*nV{D62muessag z%hBrNg8pOiA+!Dm{9tJN5BNdR?hpL9CYb)=`*ANZIsA$KVf*pg`iJ_XI>$fi4`0WR zSBJeV(Bcngc9wrjo&T#9>>tg4ar{_zhd=FO|6t~5>*n&0_#fo{L7nA;$saApUkK>_ z52_p=<}M$16JKu|>jP;{z>`74H39k{q z`TY9U3?0#)q|))?`)ZV+I(yIow=#n@JYD=tq-lr{mViNB_|W&w$yYPz6ZkjK9$AWf zCpT^+ey1^PB|f@4tg%mv-ka6xtS-H{54T7>`-A>a10x?OO*L+$UDMS@!dPQvjh<_`~S?NevaP z7)H*dN|<#WM$zc54Q!u)gvZ?$qMM!fntc{FX#{ILb4~An1wg7s@9uups$n> zr=TiXRTz>x;g9fLHCPcZ8NjU%6vp~|*BZ#$*roJH4T6x3vKdQ}eYg%zX4Ey-s(JMr zPZVEG{jrdL%zFo9V0mtGk7RsykKD($M!$@GunoY+@g)P?Hi434c`q>YW?O*zOVn?R zsJ1N)50U0If`+VxVc_=Z=mdl)8FUvLGlt;QsWhOtf+~AUzJw0reJLr|X4gAM_?bAa zO`z6Azi|uWjz6qQK)+W9lzdxUC{p7m?9AaSXG-3GoTpTQiv36xer6QzzI7eNQgSZI z@+_P@D+qRjUT!Ap?OsU$4apaaFzsL%vt!g~7?3^wMJkNZ#<uU12et97MG+K@sn<%nJT^` zNiJJ|dXP$TB*%GNsYR0e*1ZQgsPIo6O{KxZ&l>xG<8Nw$yhyACpKiT~VprE;u4WuXSpqC^@R|0ahh{ z=6TJuT22{da&p_3ljrI-h9vAAy-=*JN}7+krDWlB~G%Vg6!?}!K43ET{N zHwS-(YSuv4zWsmzH_1 zHIzaXd$H4lw=eck>c2d?k`6n`^UW21^~Kt&qxjFzHRD(+f~1}dryCn41j_~v#orpBJG!d6|4n9tZ0hH zX^9f}9cj|ULu{yxu3O{M7hD8wm0hT!VYOkEbahC^jDS~mDd#XAcgaJ{26{e1l;Rla zO=)XJqGFHS+^Xc)A*w6C-?@k6KfwXN(9^ABjP$i&LN*F?9?K=kwXD zoPZ6gQDMV6*3{%%Dd?PxoJT@Lt5srD%H6T1@8ssLS|SQL9S9Gz%oETT*byd`+%z^ zX)Bv30VKK@emX03F^|o%NAgbSgMqFoIx6N}B9PDZ#x;mVy{vSGk18pw3~^T!?~YcG z;^HBfk+=R$nIy7eiT}gimeUI~LKZeoQmN&dUk>)*m-1w>Qmlvw&Ug3lq=2nC+d|5t zmPeg~FlCK^GW(Z;ubRH>GSAD1VA)I=f(wyo&1 z=Pbq>+-{6FZgl5+tEA62f_#_s0p4THvlMU+9d+CJp5UX*=`O*jOQX1GS}c;D)8`Av zY1;tH-*yQxEuNZQcP(qEvpJ*&WZwQbX=RS?*L{dNgy9Jp_bWr?jLxDwpUE-p+jPWg z#8{pOT`SFhg~=Yuxj|+=DMlo#p?6#sB(LCW3`dMJ7&xJz#-4}XhP9HN+nahT>wq18^;*dPU_O45{M ziGy<2;O;DHNC0_dd>82~8(ZgtGS4#4scwu5PXs7`mGNE;5#p)yn${DqVsgppsLTA_ zNOCK&FqB0ILbf5H)@B$bE4v3;g{?RIA^8HU)&+W_$7a~@sS=Ord(i=@QfVIg_ere* zT1hm6O{P*~ugWhf4X}}8$@(fO{rHl!l^fRTSXVK}hKhiDbE6^!5p}0EMHe-0JCmVtWf~^ZOG>}63u=0LVkOr{jiWd>6Rs(JOJI?wbgn4|39BRQ zrDouN6_y8lhbN(CD!pHOMXa z0p%}DHpO8(L#qWOep|-L=fA`$wFm-7EBBXFw7-!m zEBkIiuD0@GEJ@mf0FFt#G&rY(0e>Rk7?>LCBs^P{ 
zY^uWA_s|qu>Z|z{au}j^;F!t*lbxd?ha4BE)*CR&mu6snWy>T=I<;m;_uG6OyIYKg zU(e9s(ZFpv6ov+1mSjACRVU^2C&&}Y0;mkkg3dwtd>*bG#j#n5L|xk{(My(q=<0-w z0e+ww6w3pE>}I|l)e31sPbFO5s#1sHAlP5c`L~w|2S7ntN2+3RhL4B`B8I>KqGA?X z2LvlROqe4uadUf?8V~LuqOYH)I3Gfk?S0~s(fL;}pB!cS}+v_P8`sITLooIzTD%gEM^muKvCd~DW@*?eum z+zD%q+o@!1^WOq?8b}VBUkmQ$mHWS+`Xm=ZNz|oZ2uwMe92>$1tyuk1`UGyA(|Vq@ z=lJ-HP{Zx!wL$KDow|9G;s$5oXXZ3O$I<8IPx7{Du$c3V|JZwqq(NT;+U=T8b7;*g zeVyN}6gIejPF-sRR0=vFl+^70?KA_0C0Oltyferk#T)tKYS23*H!Mq9SDzwI`qz?zt%~) zaTrNuK|i*vvXs6cyE1)Qfd9P{GqHOKx5NIqfu+Vi(&$B8(9aQy?$DyrlE;di@VR^l z)Ij(`;mLT#aPFIw+&v#h6;!du$gd8FeHmw`3z2d*3z6V$9aL+#>cIZZ{ekPQ*^ART zG#zJuloLYnyL;Zam^E`EXw?$%O+b=}D=NY-hRS$)44U0`lgi?;cB|cK9=(M!w|5w| zBOD&NMBGr+L3Ozb6K(Q+56iKnuSy|$yI`Kt_)a4sMP3jx)T!^CHYgnOWtB3$W-Wfc zRz$tz3Etwn=o-`g@g5yIF?3yT)w;Y>7e|AC)5SyCEIrpPwgIcaUi=YIW4A|PFg#y~ z>QiFOrH#nr=FDNkHBS6Fi?P-yGkm_uObSu1n zZ=zGaqeTn09dVx#J2J@6QjQkAfM3WN3j&Or%PIPeQwz^Gd&XQUh1%W6JBAJQUq90( zVlKU@eE(6dXlfBgx8B*k6`Jcn3LCRprUBdOJ$qyM({j>eKQ`X2Su%e%gMR>B``K%= zt+ILs?3uO`Y}E4pon1(q)u2C~y<9qfbFwgN`8w0EZ^lU;=+Ky*u5otbkMK~Em^i~F zcuF%|e3KzUS1=@MS9{R9SO z!^oa*9WZEXXaaYsiUkMcobHWkc5i~mFsr407w;G{wN*GZ4?v9FgIawVLau&#(j}_)4i&F(6-Hot0HS4Pjbab0kBW>Q!&~6I zfu$EECM)~#K}M0zcgo$*lT9V4=eHtBK0NCm;k`W=E+l~y;(^2SWx)3O{!|5s>kkGa zO1mv^zN-wq&7@Q1sIZNi@_CZA#S*jK5UWk_R@#`No~=`~xymL$GYNiwFHm|v&n9%% z+zRT)^)#se2GuD@vwM-(*J2Q8oAU(D;n0nucT_MV$%jI(NLfyWZe0m zFnK#e+$*HcdnUXhd4YYT_zn-Prro#6+`wN=(_yuvwFDK^#cuIUQoPkFZk=(ZNga&F zPz`dHfhf%0!|#nAK2leIu}uPVMhx_}?g3s0K1LN1*y7#4aDU0xIj0Zzb;b4U_ivpa z9L90jlKw_no_#SP;2OEU(Nx-rEh7=3Qo_g@ODLQ&@cP6Vp+_6{g0H+|nno{RV}gsN zJhU>*epvG@`lLQ9m@pS?8f)IeS7UDblGzsE6RX8xcgox4-)SU&;v>0{gI_gDvY$O@ zYA-F(qsB3v7I=5eZOBT#)oox#T0uLFz?Woqx;P**d=pNDlLMFHirXWt@l~0L06VOF zJ^<)md%O6C1^X2Z$}cvnp+}YQ23k}u+I5!oi?Ch9EnTX?vknU2CM@YSnYXSay~<|m zZ!}>M)eu}`nVqhG;RL=uA^Sp&yLQ{K%noOX96A4v!mRC`t^JKVA?w~xFGuVP3VYog zHl3om6(64sm;Rci656b{vBj^^r_DZ<+{ut8N1R7JB zOFFPLnB8P~tbWqDxo>D%p)~{pwNBkgOAh$r0ag8YOoJJJ^(AR9%n(D7#JOfb$Ge2T 
zB%Bi4IgL-mTt*Dvm?<*I501eG0LtmDKW=-CJnWI~gR$fV;e@RibvF96ebp zQYl3hGKe*}@r>OxGdCe|!MZXdt<2SjT}O1Lv#&MA&?q^ECZUk9Y>+uB`gu2x91Xu2kq#11e#&PkIRKK9<@ zew=CsHv-B=duLcdpIg)cohs^%DkX}ldSy`$N)E+uT1uRijON2XYVd?fj+WDoMGWz3 zv5(~7Yd6GN`6ygBCYHiFm>>_;=-RY~P4)1aL_bA;#*x`+tuyH=HwA2K(VGesc!qpI90B*RU~UK}KduOItC+K?izh zM2P2q)i7<$Tx6*$A5f6Zl%?1xgcdSe!HJeQWNVZU5SCfP!kKspF3QCSrPqvT-qxch_T1`wcs_5b*|dj->HAN z%#~GeGRAJAibz0EsT`K9y``Z>%I0{((Rg(%Nh0&G0b#$CVLxG>lC}s5{Fd~0m~EjU z8ozI-_NM3s+xj&&fh-$CLj2ic6_I52Xrv`^336HHJIX-l6|;Me+Y&C=`#AfW`_`g= zX2C&5#7?oPj_eCM?UMn$L}t(Fn`1n#7l48H8%-1KDXi4&X&YVq3&fSHeMP*$YJU_A z3XfLD8)X*#NuQ4y0TvBk!{yTE7K0g*@%q>E*8*oocw#(voWUO%mPpQnvv3zFFER{9 z7iq_%4a?r>1cA5Wp3i2MW|AL{51Yi0Hkno6bZQ}4l&&tZ0XN4SQpo@vC9~t z$MT?cgOYx$E|j~TE|}eDlKgtQK3sc#svvjep_*BQsuM})TBcxc5FmljJkbm-=X{!e zo4zR;d&K!7cQFUthvo@2iKHxz5D_+Xk#x9kl(cc;W0{WzpsPWYoujf1=sWpR^A_f5vaFzRlN~A|~EQII2P$-tL5E z%~+aDXUS2Wk*j^}Sr?IGe2@lJe^4iuVrIN*U@#Z?_4@`=w0ZSZCW8F8*k2v+`sPR* z<@@=g>EoYO8SaPj>^jl|nsDL)eR0cj!4d&M;$~Z3*J`T5k8`4{$5Fy4C z-81RGl4$)h4EQUCnW}a{1??*gNIVT7`d`Bgm zx~6-p+$nno2hXR>g1>}>@9Ac;6D7Q*M^pE8?9t@9U);CAoxOL57Snimx7G0B3||2~ zP#iiv-9hDuTPUIiK3BCwRAVEi+s@Ms z4J>p1GWtU^bU0)+@w39}$XZ|zPv%~L5DGv@*0u=NQ{97bNOQf@g1^q`Z>HEe zUKIBFJn=@Gs$Wl_h_tIBR*g&Qp=NoT?%FUl18A4C1CW>MvZy(Kc_^vc4&~7rF$ngQyz0oz=C1ppLDQB~~8I z8u;Ya&0%NrOf|U7axDkPzbj*8#5v!1o}Vr%e2vPneJ_xEa-b3*)sUU1Diz75zuze1 zdVEe2iSmUhx{%C&yqG$z#A#65ajN!mAv?&0y=BVY)|cWW(cSK_t6=cOT?)Z^t4%rt zOPSx^^aFkWRFmaODMk3EsF12*hI_g`dVV^4A76LZSXbbt0MU)bmz%0&Y1(z&1viEA zEf?q3FYIJBRFpw97$jK0QVu8Sk}WjWc8r749#yd2qe6avgm0pdEI@>|+j#MX)=H}P z;M*_Q?(rAwEM*(^6nma2Uv6iML5q-byxsPYUVm|`*_Eer9yR#N;47W2;Y#q(zM@g0 zl?F`n#3d%+$T|V=IS>++W@qnmTwD%eP{ih9z^-%NLUT z9={%K-|CS;k!@y(pcWiA_dNY8DfNCFwu~E{BWW??V4pm&$nFtT&nGur%5(`GM_QcI#2ZL z*2ye$ySN*wX0n(suaA^>|82w(Sd+A2w%m~nT2ykuR`74F4i4_0i=#LOtx36$6*_B6 zflD%f52dzGOPK z9#(j$8{NX|oTawwlu*YibI@1g&&^AxFai#UQ7h8X&vD5Uw%~rYL6F(DFRIDcjD^*G zGAK2re&A|#5{0!-VvUoY(&JVosNh*MfxV`GJCWv{GI2&`5;}@9vctUJH93O5N1s_= 
zP#>*IQ8ZWX(xwq%&IwdoWw0j^`Du4IPj%hGS3;O89F{m;MFdT4R$?dYce&g3{%wZCo8Nl0Uavw#S9)W1) z^rCEsq4LRuhSyvU&T{7|6zWegIhA)<&dO<0YwGu47a2^|0jjfHQ~1z)c))z-^g-_0VD&aqyQkl8Tc}bF^!7i1^_fVx=GT?Dv}V zTwn1XS+Oh(CNa3bIGLZ=+#KQWNLFL$9TE~deKt~ZekD-m{3RhDXW4LZ-nj9L6(>9r z8i>P+Rq9fZjL|j7TZ*ZC^hlzslV;xr2Sj2zVYQy-7}iiza|r(ZDnw`Slaz&j9-AaS zJu`%`y>bb68b|044GWk~F0eY&X7I2yo8};id{+WyFTryPL%>RUkVxrPIuh zLEHHXa@TZmBaP?bcO13`>mkH zDI`~tPp7o}`7;oEy6#2oKE^b_I9amoBey{)8TJ=7c#fI`jSKJ@*D6B~zYck1EyVg( z%iEKtt$MmNf3${#I9%UPGaC@9kQ&FNTAl-YliOGF<0cc)RGA@cjf<>*c0b)FJDQPd zegh&$zCI1;OD*v{@C{6b!E<)8AmZW3hjgJf`@TrcFe46sv&(*gd?7psdygRd@(y%= za@-*kyQTNu-Glx*M8W_kSaxe;ok!lBA?M8O_xyGV!%P+gG9n!SQ z4zK}AZ@5@-RqMYJA{>!_#n=O}H4)}sq_e|h%#?9^T%-z%0#W;&oq`}^Q}Nx}27Le1 z@M%JTQ4ad5D#1^gpJGyy^F$pNDJUWAY5s8(pX9VQ48aAf2YZfj2Tll1kPALr)t`KV z{wXc2R*@Dk)hoW!G4noM|FessWZY>J)A)#pcwleF+j)NtLngX^5>2YRI!bQlIQnv6 zDo`elE2G?@TX%Xn?}&u%7#)(P$r#j#*%;q783Lvu!I0wuwP(Ip!kVMN$!F2gF4eed z$PqgDp-Ge%n{3|Kr=p@o|3x}k-K#ex@WykmhfxK&@4Sh6Cu;k%i>^*_adT9UT!?`X zv%134H*Vf$HjZ?E(|~0sf3$hdc2}b$*!iqO{k&;imHsrhj^PF)gC?dOF|w0dsEb27 zop+69$X`ekG9gs9J5Yut6+gb0oXEjwPMIITZ|ejZiDY2lMNs>>`9e5EO?Ks0^G!=# z3Y%T7=`l2^R*tSx#9!;~W8NBjf?_=F?0jx&7ixLfrQQX9#+^r>*l=$*tSgef<#FEf ze~mw0l~6#|De)|sf1ZxF@apnypUr6rd?8AZW_Lp^I5C`uy5$R@!oXA(FqabYbIwg0pVC=!rRWLrUg?9P^_!gSs;sg7b$a1nk>q{$A7aMX^J>cpQT4 z>GCV&DPvH77#zH4U#MdlgVS=gHc*?NzE1nSyrjiNS5Mj#6Ij!WkxQ$fZ(2QA)AS{= zS`=e(wcP^LCTdgTZ8=^>8o>qQ`_ZVjuwBShc>&sN%{*y?hhNQN1oGdnaMa~ji!mw# z>p;!4<%uunPJ`(5W8WiobN|e(rYFCc$EZUZIkl20Kyo+Vv7LL&VwUf|j z>mdMtKih2A(Xy;P1TEn$j5t#o3HxUg*w2nJxD`T&X6T@%j^zaErkF|D$$Uh%Hk}-T zUsi-dyRIpp4Xf5#&PCzv&>c4P(lDCj07%1ef)2tWX*4v3RXK4e+(EEm)<(#Y@!C)h z_)rU!cVERL1K!)ZmhBu`%Y1=*E^oHFar2&kwb`hxOsX=h4_qKHS5wEiXek$8xNwI@ zwuyJ~vbt!45J!bzD>>o@t?{lSdG7kni_xsNVK-Hk0dSa3Dnk5|?5M8K(VXYtCX;UD zUlYSjhDxL+W%h#QSCvi%S}4&)^!EdnjrVLejILekt#0`~zowuJ4#*3wQtCZxtKbBG zbfe!|s(yuEtvBn#RS_n^_wMnd6WX90T2!N#sp?_Z(W_mlcp?XjUH00v9$L{8groy4 z15AU}7!;rjwZ6tc4C*b+SBC3-X+`htzQ)G-%~q8cxbfZ&+goj$Ti*(oF@tU9+K`G>P zon1_ogu?8o|u$lGVfG8H(E0)-;D 
z;bO_M78`pc1T=ku)l+ufjxbPvE+avqsth9rF)5LjeAGtX$W?wL5Uk7Q+@{v`7?Dm1;YDPak=H>xr;T`g z*I0N?jK{M$d|^~rG@}&Ix+j5%6jb(gx1R&IJ_pjE7$WAoh|ohjn_-HXJI3PZa3p1D zEFMs^gZqcBUNN{Vok-!au&~D9>9xfx>AJZugTtt+y1CyRjr%Nr`pr;Vfu0R-#w^(_ zq(zyJzV)osA>z#e%fWN?WY@4=7Hvt9c7xE(dggoisHs$NYnJ7w2r`=uY2PWaQJ+?UDpf8O1;$~DUc08qH zwv_Hkhc7uLOuf~A5SBLSUwa%p{a!1lsaI{lqK}6C>N7IhO+(lRoD89ywI?VmZqfT+ z$O|FxRrDl2i8^3QHP=}rpR6WCy!=?5NxS65r}IWOOEAXPG(*|f;;oVG=E2b2zsQ8TU!vD5Hb6lNOI?z za<2EXP}@;lYeir#Z1B4i#h1N*YAj(DpA*GXP8ze#OC|g1-pgMcoS6Hp=vho2<~6&s zaBe{RPQaYFD)iIoqF>}GG_How0m~En(ax-~M{x1LY@_gAk%tIWcduG6*jKo>pIEo$ zu-CK@v~wqaD|mUt3qe2QsvE=#ak+zuBR80JNXLJ-1XAekh6_M3@9EU@WBicx{?(Yq zFimU5U^A*I1+A*Psu=mp5JB#Z#;t4p_d1ev8P@6HZlv$?)HW-o#m2nbys@qlUQkW9 z72&{+Tef|X=Y7dh1SQ^sl;dC&4Gk-)m;8>FF%0Z~&Mi(z)21g|P(SdjmrQ;Z^YOV6 z-{%sG19PEtglx9bY<7r*v zpg>T6wji5N2pgO1+W|xkqJYk)G4-}kJ^A`keLCwZth9z7!odJwZtAZNT{#-|kw?;$aDb z)qCJGU}V9H_>O27N08Y&>~BJFJO*x ze(E-SA?~V0siPJgnV!l(Gi25MrJZ=vYgP{Y};*n4VF}`-i zz)z^}SAlUpH0Bg&;5`@g7+|P>8VxDoQ#51cqkgyDNIBZ2K;#In#`vW0tpS-LuDj6+ zUNZ8Q;&#e3I~~q}HTy=DK8UEf(calZF&gd7iv00afyPy)l-DpnEPSEVO?j?%VZwHp zQgkM;7&Aj!Ll!c|max3gg&%3Xu=aDoP?Moh!P9rjJ45S>$#E2}_W`GWlh26jCk0ng zs~S27oP?4hdOI6HJo`BRMXtFeai_NFClYU?dnkAlPgMd9zk}{TPk(n}7Xd$~!DCTJ z{2wHxd!wuzl0{YuE5$22q60#!w{eY5Y&28TnWZ7&wqVJjc=C*S&UY)QD?AEJ%eyh@ z;qs1;`O-S2)w(guVBHt zY_aS8&iP5j%_z>U8`Llx_6`=n9QV#$)ifk(ax=iS!J@p<;rg3@?wQDB5G=Ac17KH$ zxNg=cPuIBH81HW3!^Vb6Bc$!^Hvx4*%*$i(Qhv}y;=a%@N)^aw>RN$Z$AbfDogH@O zFBS{`Bs74ip9fP!MQMTz%sGciM|8!{_mlv|bD@tn4iya`rNMcgH2lryxHen+lxG^G zUQIZ@_T-I2W2L{FDbW22WXe!;IYQad&#+JxiBxbQ z_XQ)0q2%Bd`y7IV*jj!=?QnZY2)27~EX(T8 zsyMZ>83-@F`~nM8bDRSJj6ie0kZb=;31#3c^7#UGbzeNfJg8+U93dL9j(->0tH<6q z14WyXz!`l?TpHF4eSfGb){*Y{roZe_qfhg|5;ZfYiD~F{f0p!4A`B6+M!yod!@E^A z5y1w+yr)rsA#U`+g3Cd=;mFckgGX%y#j>V+bAKaNaBtO)u#4FRpFDAq(PAL(wIu&| zC7>nQ*HT((OaHhilRL&?*Wi=?GfZ6_y>uV+y{KoiJ$QW_J2e&qXVFxzy9D06T1oma z*#XN>Dxm9kf4r$C5;GBZ*5hyJjcWARMT7){EP)w3y+t!UU;%|Em9zZC43EIw6&RN{Uz4uy((>aMNJlr{(u3KhTa 
zhG9=&q=JyE_$3a8Q2CWpRystV9L7lLGeIAxV--cre`K7ETN3`|Zf5|OKzDa0nTv8} zNe+|Lq6J31XpRq!MFAdzg=ZA@4|sa9vtNC0OCB)18hb*Ry{u`~y4U_2Epc}(pL~3> zJ&#ns)22~A{-mcs`03U)0BwxK<6&O~zt#MbQ@`$w^|ET~~Z`Qs* zhkuZSR$HM~)a7DR@1&EJ-}lr*9Gf2r|$25;UXH z{ihHsVkJTG0<0u5H?oqBzMy}tUNDjX40OGw{Jr?G7`y8h2hWU z1ZsueO#6S@PwW^qEkHIcepNS)yGnqBy#N;+QaeVwL@HhLCO~V?7r6)=>gr4Bf2Buw ze<)($@~AWuDlil#NV$+Q5{fIdxbjt0|ZOP^gpw5Jx1g*8>UVY+FF^xwNck+snjHKb*|_F z$1LIY`%L}YI|K4uM`B%u1-;~1dFKN$wx$l~*7_dLczYpw#&ZQYPqvG`@qSkMe=?&5 z@}y#l2?Bpyq?9}7O5SN0r4cJpD`;mPaT2SA!)2x^*Yxy`l;MT_2_3-mD_lV!TR*<~cNJqUIrdam$fpiWZc> z8AJ@luGGMT4EOe@H-@pGTTobT_eVLf2OeOlc+IWC4~;Kc0+wT}?11^T%yF%(x&VXX zHDXY+l;k%8CU~zj)@)o^>RB8Z#;Z{bn$C`pu#>Mi^H)?jq7}L$4DHtbf4wpUNWE#D zcqLCw3e)V~YxCtkysenoHRA_8oc@yyplwH;nhN+$4ZV{xZxjHp(X!R*=N@VU`G_0_ z5Si4-ePBt*_U*x)q00l7I zMOH&3LIoV(a!}2*>cFybe|*(IgW+|(ls|_QU2n(44MMtU?lG#N?FZEA6L*v_&x1px zLuC2;MJ~~eg9No{XJ-t&BrCqe>^#fhTKd!enxlI)Ew7zlfr6{fs8#-EvSFmurPw9q z`*UoCY8n5O0l$X?Ra0*Ch9)L7sOFp}K|h4QDnC}{WxFr~!%$Zcf2h#`9}PG9&H`^W ztX4+a`xs5m&Ac+sOk-*&?q4B36o}^LoJ=;Tzt?zyYbEp<5bR(pHg;&OS73E_c+MATb(%K1tv_nX76sLBCUrkfV6=~eL2bCz7kv>aQuw$~ znT0sJMZ@_cBY@`af3oDDjq!fdOMfy%+uzX;hs8%(48}Sbh9D8jh;IIz(8$j6Jss!T zNg33H`Gp}eA(L5#-J87yjW=rTvsOT6w}*Zr_sL4pT!|E~yh$tu>Xe3WGtB&=Z;vwH zB1)H1oApNgt0&%#RPgrZ9kX`lT=2avn~e9!0ZgNX>>{T}f5ka{P^HYm(!!U_@j<2A z!`yqOVGgeZp)rf8n7n5-gNDw_=sZdD5@0yPTolvMN@n0P_tcwpXhIYZS=8b+!l0;q zak66@wE|o+LZR{PC>n=8^pB$0J65&gYae$iq3y0sj?2vFRh{~J#{}W`*(}yZfLEd- z7JIFw8#__+f6Vuu{vycQEzF}3WDC&3H=F82!Btj_^#Hip&8XA&@wY7xVnYx3p`I`k z5dsXWW%+I4!n?!Jf@aGYYN-4X!TjQl%x|5pDqEq8p|9f}`ATAp$t&eiMcr_EtK21s z1J9K-70HfnG|B?4`p+z2X0oUz@;_5=D#l5G9OdlMe=B5V2V1$V$&QB*e0gF#ao9X@ zB^!(hv%#7ON!7@Wox(S9SI97-Tmo)4o|y?RS)#yGBY$ukIMN- zfQA;kBZQrp3EmQgw1H1O`EjT0v>=KU|3|p1fVpkR%vZ z9w8(Kd()xK;ga{8oUeaOtc-gyYlRwn3Egv_e|MA6I_nxz7^;NfX8@s}Q9H7Y7ZV~x z>S%Dhd`Nm@-CImn0H<$J@Qn=!vp>J;lBhFO)8J%;N5!rzy6Fw#+f{olo*7QGcgSQK zskL?wV`k(1^d?o@HyZ{Iy~tlS4Ac+obKJW-WKR?O1sv&M6?}9jx?qY+S+-5gUE8fg zf8~2wKjDzZ))g1wgx(awTn7Df#}~o*eBVC)2mUSH`|6_q~K 
z@#8R=2DBV+pK=&mA19-Ye#gdfw{R_*7&f2Z>jV)9)f33*ht|jGaL?q(g;)YKUd~<{ zV4F-exVHBtMz^SoiN%0GwiS!>a`DH=f8Ji)eRWKo5yzv+mu?@}{Z5eq#$ zaAlQ5)2psKqw^`$vD+$0`gJ!GCRi8^-r7e~r@{NO{iw+3V!yd%5094;EVWBSi^gu6 z7PvuPSL>2vLX0724crDOyfz{He-tUO=fH#jea9+OBU1Znv)kK^gKt%EV5FnWG`qI7 z3A)r5@ULg$QaxTmob| z?G#l^El6r)1UwRWJhM234oboMH?ekzV=eP!~GUtDKL~LMO*J6eX zquC(f*~v><;psvWNg+QY27iQN=UMDGwhb3smQKX>V@Wk8*B*AR!u$o4wA~VMu*Qsh z8ovtBl(d5~nEfFny9;li>vJbV7*UYCF-v6Ct~p_pa45 znFgi{beHVw2i4S)=5ej!X@HJLkv{kI_~DTl*T38F=)u;0C)MI>4f>nU*ys1mPcWmR zBI>gG5vOr1_wZ>JZT6sL78XY54hL*IDBrSg=7KLKeVL5V@e;6)e-3@QB!$IA`DtZY zdB$KuhBh%dQIb4N8$(DchzOz{BoSNu;(VA$yI;zB=PavXFe+yW<4v}gKJ(Rvw$~Pb z^j{RUoHeB;xCgJbfh>J_#^ES*J^JQ#9FvX5=lh&g7wb$KLKnElk?wEJtxE^90om@{f$AQhN-U+AbFaJ3&l8zhP*#+Ljjx7FC`o<);n{f3(kLKQ zQE`Aj+<**4bP7~DAFqm`&bNv}9Qoz@)ull4GIUs9*Z4h0i5G(#DCc*DO5K=+*z6jh zGuNtO?SzwBe+f7Y|Fj-7MNjSC8O`q-fm{$UdNsk>Ii#qg0n2HEv7>*rF7Ce3-dlc# zG)PXE=HCQ;VCMeno+yn;<9mUJ*#4tptuyI*kbA$ce|Hnk8jo~V=o2R8276AyJ=-lz zk6-ekGZ@xz>lT^H@t7c_j_D?4mr z({S8?wT4tebR0}qMcBc|f%aaI%tU}Jn5%Y4;ZhuH8m;qv;9I5a@;CKROB0v$NPM64 zx^STcPKq(ZZ9Qks`KU&XvTRz6M7f>xGUr5cS{-j`V6#Tf*hc5Hb38sRH(gHn>YTd62_ znBhX3opSwB3&J)~wrqUgxN4!@A3L?6i+D$%f8Qp19+z9hg*oK5(x?nMPP(}4(tFh( z;XiO%qjrOO9DbyCp5-Kv$8_}&*Q3{_v>twnPSDHRhz+!|eozH;QukJBOh>f5G2x@5 zT!HHwj3TBc4v!uaBaJbb_M7w@d22DhuYyz zd4>x&%#goVI_pYr+3GQV)NDAMI7a>&L^jTwMw1GXV7q7}eAkUd!CX4D%L;BQfA^^w~_P^Tb%JWt4U zXVK}vc)>nOE=3Div{((hpU_D70(`j!f!IB#LcWAr_$k$fpDb_?Wy}Ugf7Zx?&>Q18 z-M|t}Czt=T>kXM1tJ{UH5qul$SHQSA6yYZU^fXu~zaW!L{;j|G50COJc`7-@vKV+Q zX%W=;5a%W28GSYW)6H4 zUek(xr)T;8lR?LFJ#miV@wvssW=G$^qXPmt`gM?e05B|G!G|PR&ZB6NcAM!}C}At+ zc%z4dB$C%O^n9#@f3ZFcUS3GMYcQEKYv6ffUcS^YT}%1m({w#w1BIeJkjmi8Awv)? 
z9o2m3K|$>U2FM{rC62W91A6k}lVO%uCCR&!C8^RpISKzwOZ7}124-St#Wdol+@KWj z-D$+6M1l_V{SS}d?d2G5B_9!JdC(@Hma0)2G!tD}+qx0Hf498cE1z7GZ)*X#=Stew+&XmJqx zdl?U@EKg$gy?W>(4U6KCO@H!#?gXk0sCR_hQL8-sWk(6yEwz(@Yc>0mjAF3!np>Y$ z=hT=*4|7i$P_rleIn5uj2AJBeGS+hk~_8)&T&7=%BvQZ}q zM0_|E(e|8aS}@~`*JJ;3+4yRaA5H0+%k+v>y#@4kLuSI#9;gt*|99hH$|-buxO7L< zq|aeWBeo|OC|&0-TgkGew=+e)dReYv?n5eyzt@~Xf4=&*1gF?JLCV-agIq6xd!%Rm zsiXznNEbaIFA0(?UI5MSLKffF7!d*_kdSpC4&~db7XDMJSkp{hc`4fbZVP?3SCTBW z5a+X6{@p@Cl0>6%Aj2t9JVg{iDKLhuls_)?DOvzcr&5NC7a0}*^i*BTX~P`xo>O?q%I#`t1X{VN3T}@j78}!!;d`oChh3;Y&f; z;f>ARtvW`%ZO|)oyhrdYc|GRW+5nVfzZW&r!k{r*MAsqG{mbv%fzJLOeqrS2Lul{# zuTf<|79uqQeqS%`EM%6Ph?ailfcgp3GGZ-}e@P{1^(jw9d8>@atEP&HsQ(xGYr1#m zM^GcD>n2>4(~ed4fsmyh*>%>vqUqr}1iD0YXEWTkhft=9sC11!(B@?zb%(cuZHOw& zxbjS2_Fn}GA#G$=Z+E{`uLYnzVQ_%I^&6bc=i_u6a}+-3liz=)`%VNJ ze|EZI7!VLk5av74Xah$oY2dd4%;*r?BSHzK_&P-MZTV^p`TO8Q?|d$Cb-}K;2>=X7 zKYvPs_+*`T$Gy^@sag**=sbYGVCaHp$mD-sLR0_F?R1KlSW{SEe`Z3>ckCaRFXbs+ zcJw2npf*9>EfEx$Y#0caz!gzPht-wne>Xc_;(=#e-Ka;%>$9d6&F6%{H$;l60Dmrp zI-6-@p58sWW(GLGj4w8?s|bQ=ibgfO(Y@<#a?&j5Y-u_T>igrgw6#aS0h@{;(EnJ$ zgvm$s;S=-~Y&wKFjRA*Er7ZB!aZrSc3hF3B4?ofb5pXT$)$~OFMS-BUKk7%`e~hyR zaE@wgf(c_lhaCyJNS*m@@`%{keQulM^!sBac^aHlBgA#_X4S3fWMK4-R^7f8e9feW z*)sO~?{|L<;n0d$(E&c(<9#Igc<*bz!oW$oP-^)2eKRez34ib8msEV`whLdt(_+4m zi2}E#uS)*(C5sUqR@WYL1x73rf7^HY6!E*eMgeQv*P1kusse4A;rSCwAG*eBYTruicU3jK zB14-Wo*0e=TtI4AUmtpKE7I?v#ER#2W=Ia2-Wqtj)UxSW*q2fTt&g_3f4f=p;-t44 z7N$MU&x6B4R6R$qNjJya(^av;1)(&xI=Ya6Yfw)Y`G*L%r4n!D!=9Qy(=!o-JSL zW7m)^X1r5OJAq{X!&|nJf4|Wg4XhtVq-d2uCRA|{^l>a`%lJ<`o>EQ;kkL7Croqn< zm+@BOh<{O}`Qc8v0Kzj~FEjhMoh_MoxY+t;M$ccnDi7CN!tACf^u`4*yA$dLqh!Vy>gp`a0A^-3 zMrLMK1PTgukc$oQKXwEPO`wxA$li|k9}F=kpoz=7P29wP<(;TtZwHWdwE?iO0a!SB zS-5zanE|ZK%sl@g+B@+A#7*2l<^TmofULb8&>4Y3%-+G%31s=n<$az1d<9UO(EwO@ zc(~~Q4hM+X0-Zo+CUyV?6PHgw+xHdCOl$yZ_GTcUi|7BOpyvPN;^M%|#N_Vo&S+xm z%xLdqDM&+q4{!&$d;+KfoqtzI+)mbg6u2-79blSKuJ=b(Z$1s9$;c;{uj}I#KzhFJ>JC41Y~1k`cC*;x(Ps1 zLzdWth7hF=dj0pC!Fz|9+uPZA{-OVU 
z#7tV6x-!ygbpI~-zfMt6dk=s&13Q3$m5mL5z`@1^;N;;1`2KGeWfRc9Rs17T#?HbX z!1FJ$@4NIL!EXQV0P25_2MyqVb1B-t_bm`W{in_KnK_uv-rrdMKR5f|A^-n+{I4wk zuWkN+6-m0<*!=CM{ukl@$8TZ_vhn;k@02myOAJgNstH7Tp8qI_UZ55_?KDZuQ{^;*#VX9ok4%yEC2=;X6FCn zdmk<{>-XEk`Mo9ob^+gK=YLm}ursqa|7*xtIXD3(PEIDC2+Z%D#LB?|@Md|RMsuLY z-%Si)Vzje&c@F`+SLX|`uy;cE>qt3&IRQ+ff0_P;xByIIe-Jl-N&FAu0WeAY7jdxw zn56z7HUN{%AM_rl_y@hmDg765GXt2E{~!(klg57$&->~of6#jt(?94vi`jqCUpvKQ z{tx)B2KYzmU&zt*y_xka@)H){UhL=V)+lq4q*C!^k2yS zF8h9MV@ekwO2KonlSO3rT-{sr>f$!tV^pDv03xUc0Pxg22`y=BY?ss(#f86gU z=I}l__J8VUe;4augsilmJ7qpj~rP~Qlz zlDv4Iwl%_rwI```Jo!8uC8*BscOWd!;0#X}pNcdLeSjxoP!~S%d2#fA(G0YPYy zfGtkRKt;nW&TQtQ7)!XR!(`WoujT151%ub`qtP3~UzOBYV|2VccxC@tFrk~yE%)T* z?oQO?X_Gu%BEw;d(L!Qnmhe*-y9xp&Ba%LeE3bHQtU|7i_x_`>?3#Iv@N9?V8Th0^ z*x_BDQANR%Ir>L`OS<~UQ-PKv!P7{ndEJ~a6aUthG%cb?#T%?hX$~Ex)jfXQ`r+cE z;)1ZDByb!;iODZ|-=-+@V`LEf*^d_PZjqyZz@(z$eR8n2XbD%;tInpt5q%yTzp9n& z7qV2%kSlL6h?tD$SDRA~YVCbPjjQ@1a8t3XH$gOzv|{jo^`l?&T2w`mgiS$QWabmz499!3q7sC5)raC92O$!7nSKO*(r!r1K>QD=W{uKsC31!Gn;`+XUs3KU%2X>A-n5z z4Wwq~?&q*}>bcB(9~)0yEsAc;7W#CZ5!q^*z4J=D@&ST80Kt6ti*Y15K>0J$=d1|c zKs2(T{L6Bh*m56@+4JG@(`XA^n1@PC@up&zd$f7;wEoh3fvT)*j>+Gh{&a-R#lNh$ z`Y29+LBeloIJ=mm=rs7?v*F*=nZRZo!wEFjXC#m7?&8~6l3gUGgTA+H6bd8H76#z5 z;13klOB$*Qr84BCoH z@j97hAv&gsXkpyWxcD6yCE*dIL)AW7ad*Oh5iV)wFJNcY&AQvQ0JCU@brkQ;f)w+7 zKXJqsV45dm5SAU;=h<4%8$0m>*&rXf^tU)nE! 
z-I6>Q#M6~Gs+9QZP!w<_?^((8hyjzGEUnSNr~7F>j*aYMWm}Q?QM!E1rE#x~@uI{m zGaQmIJYzOPbrgg=Wj6#$g0Elq`cA464DM7_vns^Ch(Q_Fa=h?-ycYSnLnkyMdP;t{ zK0yVSC76evEwQbvNYATq z>d7X^q@v7!ePTti zM#EvU)VR9(dN)&nrw-FY5}N99hSC#gWJl8e2t#jsxqcCrAKIW1PWuyGa~O0YCO3iUPiGq z(4-tsVULIO5Cf*$uEg^^yN3}6ark~*{!06EWac3Kb#AHot_=s3={M62NBUIhTSkCV z($^#Ip&fWdMY}uQL8&WfSVz}M;+B!RMTKOD?(06wr{j5ZCP)ZISo@!Uezb#KsTe!$ z-KgU|KrDy*Z%qg+zv_MMQ3K?_&Ztw*y+sISoE7#y7z+W=k#d{9`QoGbU|eNmfMJvw zeqSx6ep+Seam!Nx*M43Wv5F_vg!&z9%^`3yOqr!qa0Vh?asM6q%?Gy(FDKMZtTsv+ z=(Q&aRks;)Y%CePD+{ZCMA}>V>NQ+2GEgMJNW)zulI>zlmo{a8vT~9PS5u7~5bT9z zd_d~+>R)X_TQg~XsDT&{1xv%|B181aF=6R7oS2pwGT-r6Kzm(4t>9sZBKC*|{1DqP z$!Od*7wTXDkCIEX9-B|h1&0$ee3M{)Uop{~wY@AaR?^;k)x z=I+UaK1(6UQnGJeOfA_*(p25LVB_Q^njd#oMuQ!@6h_;j!SwaQs|b-nH@$ zo9F{~nlu!c7&}MS^tT4M)Mbw#$DVFHW!xF8{;mTKk*s_&3&2^;mbVXMfI-a*x zRCX@(C{HO=>e>F+tu5rTK;-`3NV00L@m&;qrGBGipDHumS0XRb;)7t}7To|5l8x?% zZDErlphxOHp{Y>(p&;25b_)307_3h3=ks6Li?|WT>Lr$6sX0& z#bUJ&+U4Rrd?D{mdo$cyG<3SyQ&=7xAAUV30(SqXp(6BRz|(2HvVmr znK7w8a7@C2pVfHL2;`&5d(?zIJIO1iRrK7&F%Jty&Kf?I{?nd=^q6-fiq1 z^tRM}G#rpr`|&%WE7dR6Bvv*{J7ONBSCbyKx7uR z6E$<_YcRZh9r9nFjcJPet8Zop@s@a@BXNH9q@Wuh-m795%t*@As9SqnWLa67=1M1j znZq$s^CI-^o>yiQF-Ffoq)`+*IQCRr3*z`!2^t=IdC;$V8zxSg88~I_a@yClgf_L^ z6pU_Gxkx9euZz7*F9A}N_+}h;0}Z$z$gwW0wk1poUg*aJa;EG;m%^jJ4rB6W|Qht;E%l)mnK)?xW)Q5TVN+JyK$UOs7 zXD_OrQQrM&6D^U(iVQ(^JUWCIh!PIhEdg0#^{dC~B#( zK-A9rN4Q*`?g^#WG{6zFTZ7RH=gyf^u55Cf0UJH!4bk#XdD&7Bpb>y zEuB#5Lm@ivZnH3JnVHLTYV4|iMdY`%@ryZ6OW@F{m75j_RNKEaI5kHQ4lfQcg5_9{ zcHRzpx>?5_K4H4rZI7C;)+E|+J+S=YpZrn~wfGComx|pcWhQxF_@TDPNJ(a^jSD|( zn#nuf5X#Gw!Q5#Q&jZp81BA?jFD#rcz2Dv>@)VX<<&v@>}^m}D3mZy+@Ss2maS_}=oIIZuhKET1f#lXlP>x~n_nin5nH5Y zVdF3Se3(W;cGY9DxDPnR^)pQ;7dt2@$mRB-vT3neeUv@eZ9ZDM)DIyiGt%<#W5Nev zQ(Wds14RepA&y#6p(Y7e__gcRxVgN}xD>E%#I$4z{1Y(5>7K`bOS@J)iZc4p>BAf` zpB6Sy8T6APAAZOao6ckR(brlk*->$0k3El8RM(K=5=hL4Qt^Fw{mg;`2-^$I9`~oh z|81f>46r#QDx0gi<$541m@*Ae|1O+Lrb3wP?l5JQg`KqUNt@-afsg8@YzDf~u*4CS zf13LIxBp`whg2_rVhE5M%_!bv$`Fx|#0cIzjn1Z0oy?mU#QD|8%}G;f4W}>^4m{oh 
zCiO5csNz%bmL*e$M@PX|#o3?3a~iBdSC(SGuv7vou|7H=`0B{O%IO6?429Eo&)tcn zpQ|cQkiaTigZS^X)WP?YisO;DPmRM}rj)`Hm4`EJCgN^?)xBx;nbi>|i77@!SNHQ3 z^O5R>pqZ1?9Pr(fEmY}R%mkD{5O2!1k10+<*JfSh|1swYOs!wZi5Q<))tSA!83PdRKN8qQ;4W^ALG( z0WAL2*F(w%9snn*qi%_o4h6FcQ|F7LUZ8H0J{2Amo>w^yVS$}BZ<$xpG5>m@U1CJ% zlFU-TN+Ys};=NUNf4*lHy!16cp|Z?NnO5and79dPlIp%WEymUV)p{r&!x#q0?QYKF zNP?H-jt={+7VNSbk@R@F>`rO!F^%6F7DaZcsa_AdlF-2A5Fd7)s(aX0+lWLpN0AU& zuGYR!0N+B54Ck*Hh-7UDDt+_FBe{obXJ6K6d~X#I8g~NO6Bk(R?O1FOn`^Wx1{;Kb zdX^(cf~gO&-|1r_|1s-RFBiV4i0|jXybOXb^4jxCoQF?LXjBl!yd=}}mt7EzJ&g%v z#O>hNm)w@oGT`JlQuHZr6AH$jzEUS8*|_6EG-iqG(3`(gGvBy&pm;3t;%lC3~EIaoNZO zu-x=6yy0KCI5N?~72?7A?bCi#Z>JDF1^p>x)hnpn*N zgh2my&vB$Dz4@nkLw-_*5*G5W9P-JV3h3xE41%vX3&}FSz*xe}KW1ad^8BLAkcor* z{IL!&J!BcX)oe!FA&L%8S5}RGy)LO`XCA8HJFTkYnqjI8*%DOkmU(CQe4R#p&X*`v!hcxooSLE$|gKCzN5Far}O&bA@;3=?hBS5uGkX$7Giu7RJ$EN2%%q8QazTV9)PPoF51g1 zYU&Tdt{H$9)h$P(V)TrEwsf0}&a6U-YKmTyIf`q`s1&)~Q5C`|VCf`y-J%g9$H_t< zTjwgy%l&tfOiDa(_@;G`VvQSYk?!iIhJi6x=jY{Qrq0_SM?v~Q0$scD4+CXcYzO_( zz?li^FLjo~%l1zt*qs0Y@-x+kmbbj7*Y&DL=-NCg$X!`pGph=Jwvr$VRs_YwCSEo% zv3Ncuv~f2wet(?N1`#11fu8v=lf*f^QE#lrM}==4krfOFgE*+Eso6fnaw`edjF{8=g)FGyIXJF!kO?|ybj}kM4SY> zOdXvm+5qF3aDv%?#A^y9%Qg*iY0el=C-*#_saPz~JOdbLrQ-E3exDaC+Sp@;%tgQP zCs8gIO>EJk{h%v0Hk=iylzXBU7yy6D+;AMQU=oMh9yGPs+YYm@7E;GCm2$`6%M}p> zH$RNC!qAsRi`yzARukvVGTIg@5}CfGJJj$4wg%rxDS=vl3J#ILE**n&j|8hTu8;?` z$FXIKF8-o;S5HDr** z+uSLo7i97I=O8v^Qn_-kM{6Uc3?bg@FFdKW$J$70+-DfxISD8WIXYFc z?AF-MZ5LGJPIOr=nJ-}nTr?pxTeg%@@>&xr^T>PhTuC(54VJe7ErFp+zY``Cn#mZ$ zdpqAO0O4p7SueD^2&i9i6~=Fac)5sbZ>$Jxv(h1dkt#@=s7j!Eu-Mu>um%bF^_&2| zepEvJipa-5wTs^{%y4cW5Z!a(U)B0}+MwmU2G5r0 z+m9A_;Ab$cDDpDvL0W?J(<1qT;`MvR;K=<~_x=?ke$pcWUmPh%}zbY6(RV}!w{ZWWqtI7rv5o; z#6b7wgajhLA`C}Im&wzY!y}|{VseDvqm#g6TwykP5 z$Za(z4_@}H67EOA7+q|}4i!Ic!QNAH(;_@oX5b(H-D3&+gU&oMHJ%a}r0it}_dIu< zI(%h(Fn@}71caN0^yn^Qo_MYSSlHBm+-eHv(cZPth?E^oBf+Bo+QIGmL(#z0VhBCQ zCv6>hv!CrWKej;@gM~suGA@w7BU&#S9&T}DD&TOe7H7h`1DdivYBoYBYlPpoRBFg;u=lHmEF%ARL|XCAem-qu7w96euD`jHjR3kdlhy+~hWYT(br{g>LimnwOZJ@3h#qwFc( 
zx?<&R!X`ihXUb*ptjkHaBW(z}#@UyTt2X&XznZnIS`K$;71l}xWS0$WApM3gGxr~S z`u(YeA35!b$%03x7#~hE7ebPMbhRx(ZX4g5cko_5i1FdiE;IAw=^Q4U3q*-n3N~&}Cd^0G zb&^|oO_X4jTYYqy6|R|;B%CqjFs6(wvm4$UNLni*62Nu+%|5z(QBU#?ut#O>nsI+nclmx zU8gE@N9|FW?{5Bo0BMdf;`MQKfK#D5R@fQeojwadB-22m`MDi4&zA@NI+hT~yOrB# zx#+prsD+E2X(unK?3=?~c3c*AJS^DO!0=G{W+H^`ZQEq2k=by1BNuqh02ygB$}dUO z1h;$!pFq@=+ZyB}O9U%k{lZC~FL#@a;YR$pyAV|dX>Wpm*tF3@H@vu|Ww(ViQ=neG zAHeEVJR^9GD&>J2{~2foC)DiM*)%c`f6e(#9^Ur*mR#!lFY!5Z&lP-rpVVmmW}C0w zt@}Q~Xm#r;r`l>jlnP2-Vv^sH2lg6B`p(PcHF(oY4tw&YQf4mYd^s0Fo3*Sp+F*wz z4Nx0y+mSMVG;f{Z8$oaLCIEB=q2rHA@yCVILV8&0l=i=hqC7vQcwx^H_JLoxaCwgn zs$gNl7ileaI1{CH4>3Q(Z2QfOdoQCHTROM3%ch-boQdaJy+6H>M0weVrhm6>tQlyo zR#7#yG>3Dj7F>uXd(2Nj25c5U=K)u=^FqxUN&32fA^c-Rq6phYAa(dPd0r-u9F2PD zm1?I1zSp&u3qaDtsS-`U?(|3F8=g)F%D(|XeuS1iJ)G2;GHOjsYNUSFtS#U`F40@i zjii<JvTVFeWjt>s!G^o{P4W$yl~kNw$2Ci37Rpcf~#`9dQWQ9*3`B87FdiV zX$-TYEiUmJA@2U}bW3?%#{XC`WSy0{9(=N1nWe$7O`W z<_TU+wTMO1EV~XQ;Xdi~9<>5ohY@O0Fb~Cl^X9;A(Fv5%1BPrbRrL1ek34Iz=2^bu ztPFX3M`o;B8N~D1>U*J$c~HWX>on7xyNDB%DvHWjSMUuPVbOdBr(Fvia!B-PFVTx5 z&*)(#z$aVcA+oOi}xN(2B%fp zMWIElmF^3)bX~|Bck&{{XASEke8js3w!DYv>8O+15s#>B86Sc?LESO^+zVZ2E4g1J zqs)pLeg{Z!lH9EE$fit+(vl&6V?1)@EK!m*^0=|w2b=~f-pBCvNtR{@t&_rIHa-vL zCMS$E`)=hJ&Qv;_=R;dy=c9Qf1z0FV%SIM)kk0H3pM?VAhkqRcmt>PN9`hUHvt9dD zHqaxb@~ft?r3WinGy17sH6M$Vv@q;`vbD}Q14u0Rq@NBB4qIDIIF50S$WBt|(fT6ZzM--7BX8YD;#=0J@ zywHu6CMXNn7S|PlPjAx8+HPuBm11j_$K^zx>;f0{Vp`Qwi#gUo7*u$GO&iw7n1fgO z7-adarY_CiGem6!{xR;&ilon$HJ z-JyfOk$-t#RN&Y+_gqQT%tiBpm}L=7q3FnHUPg z{m1ZJx=gt#6vl7W;dq0d-$yfxZN3}}bQUcp%m^5Mn=0ulfC(Oddr9n~7>MV-^c~R% z5urv#cQW8$rA^c5*pvdR(`_EB(k-s`fru$W>#S}K|7A1M@q%D_LOW}++M53cW!{AZ zP#9Lf6M%eJNK{2`(E4-;8Dybs!O~iBo_ER3_7qq_wP7cXT_cWc6SMtR8Gee`&N{C~o35upI#!|s z{5k`y$a@02l4nYg_e+sUc9i}E{F$`;TP_nn!Je^Ip>owlKmq9b|SiN7bT;a{Pr<W<3VD*4h`CY`E8v%tPX1&%*!`+NNi&HiAA|NT= zpQ@(yaZCwXnZ@H-ViiFY=5}D?>Y6S-l!j>~SyB3@(XhQP;xx`|A`|qt zFt15%obs^Dn68qrL1=;}z&Qiemm)0EF0WFs31_M`(mRucfYuT1qc%E%S2ZC@v>6e4NceY!H}4Cmix59Gn5}Mn{&0eEsNHn 
zbg--2=`xV9S09I`XATr~aVJz}me{K2JU^85c#OHVhhzZdJ_Wa4vBj2i{q#K=gVcfXWHijq5r9W2^Tc^T`IW^}A zw{fLu!3~ih*y+#xCT}rM&DDNh;9nx_;Nr1*ME`|)Nr)`xl%8osRoGhSL=(p5A}LAc zJ5W)MiT61;A)#bBk;Ec0a^}lnO`7Ujzy8*kf0K9!7G4ys5*deyrXO`cO67^noq@sE zz)d#`C)q)r$r+>b73BO7ZVe8NlXC?UqL69VL)t4SD+=XcU>mE$krE(eL^sFC? zSO;T`p!wJM0>E^yoK)!6MjIJ}TF!3byUD6f1on)YNw&TekC5g3MpCP*>EhCCf1@h& ze?%8w2RveE`U&&Ztj;YbVFoaGYxwM>bPs)=wc0CuZvP6IKyf5oC@7{h`9*AiM_*f` z%Um~1Zc1@%UvXT$fJToGkeZsJgJxRqF1i&+kVTScut6<5PWAIs>=vC(>x4&7vrZ&N zqMGKp>H%w|dZT0g#C^Mx#S$ruwKmw%e@Na@-*HrxD@T%o2_5E_P zE4d+X!GOjhD{PLamq)v|#jsAn#|(0a;VC)GK9q=~k3G8eXZzvEfg!9)x5cXef6~ba z&!{)}Ou~JJ;+EQ2-Afow>Z{D^^j&FHHA}9Z^Yn4=R8_XvT7=j^PpKcwnKaFRYn{}1 zHh`}o3a*e8;M^I~Q;O(f5@f49_Ppsv12^(zV3W%3<4sgDIVqlyn*tvAkZM;{ zlX#QYbR~_Ac9Cboo-vZ>cMoIbe@~e%nopznx>Yn=)pPP+`gmKwgMXwkw&d6-K=|D~ zPo&}n%Z^kQTn9&w46Ai~I^wkiW@05cJE`o74%}HjW#7cRA$UG(S>x=Q_K=DjD+P;U zl8?<^F45o~E$V`5y<5hX<2zw9iM9vwQ@tv2n*yWsW(usi!4!7J+_$sZyG&3tER zM}`0({Q6YeSm7_;32hO8@Ee|gjRhwC5m6}>t&=0%(_{j7Ram7_vRG~BzP#zm%pQN3RtkdeOM>G0_a=uf zIGCRF=Z<6**^0U<^+zbf#J68Dta7>YIa3x7QuIedrPbVl9ubWp_lUcJ#*4yyVdW=)~@wZS0jbI6t?yWAte}2DY2lMaN4{k1v zO4o(fLMMTL5LdxW6Na#oi7s{W3Bh=?)suNz&?g!~Qu)$aj>P(Tt#duLCTFUPN~qAF z@gUe|-_AJRwSIOGWjg~7RBH(@$5@D+Ld;LCpGqF7kW?ibysAV{@5#mBB-H<}07 z1=9Q!l`xr#mMRkyA||w~V4NnqT5$MWU-b(jf;mkBQWns~?Y?U~O07)5l3*)K_w@Ps zy&V?P+D)}PFIFVJo-$KN0a_8UOFe$coiH7Zpm)s4=2_BQ!aI$KqRo0#yYSVX`x%^9 ze^}WQKk6LGfAC7Ov!a&(B&|Kua*Fd`4{Cjp9N)Zb{w6Y}pu2bN`;Z`KJSG0zS={i0 z8>3Wv1>yI$#{#rib5g~MGoc27(wm+ora!fZ?|e@+w`-}H?@}GVp|Cz!ob~TR6T8aV z#0cJS#FnIPgS2d@>jwjCKH^b&4hB1c5|Jw0*8Y-Of3_89Z@L>sp#Z+HNGWmYdgFv1 zJ2-==vKeR-(Pvln9n66wAG;w^X1|bUe+%uqfGIZ$r}gxChE3oWrTSw3s^pdUA^h!y z;YT8(cq*son=7s}XuZqN&&w@-EYi_8netu=(y`UY?rQD}?p|ay_?SSELcPKibpF{Y zqJ?Zwe~#S{G^X!z#%o$^<_!Or4TkIi;0-()utZc~L= zu>JCmPdA?b@gWjDj7fTle(;y@{@Fwz(RB0y`~bh7x@}+C9MoZtJTHMmk0XxXnW00t zPg0RAy9cfvYMgOsH08tA*Jx$};u}+ue+}X0`5mNsXaDW%!ae4lIg>_`(c&}xahvL; zCe_7()S4j%mkvv*Cp4mR=)NjC{vRU==mTDG@kg^NX7nmtl(*m$jN%MPX+8&sf;<2f 
z=0+dYmkw1SlHCR%4w&Ow(fLs!UcNV3CKF}Gn*KE0&Nn|tOc&~ioxK|lzV1Wie`m8H zQDbR`=fg@^0NQ;u?#+YMmFtc%gHzPX_uEX9T;CQS9B+N7x$Q-Ttsp4bs72y8kB82M z!;YLuVNU+(g>Lh;gh*;+41%BFo{Vi*=v0SbZuC?-!?_?G?ZeNysu<)4pdh66+PMN~ zi6VcatHXf+UhnH6($`aD(xh!8f4YR3QtiO)otjmxR53`z?YdXXUblRH16B$vY_t}Y z`i#fE#G0m8WnHA~G#Yl^ocr9o#oGoCBDX;uVmCaWa&`ACJR{v<1&&aA)MKw>_T+Bs zjDXe^I0Ptn5hH9;sWf5Qa7vJ_omKXctsr*pmGU#Kb)YBrA#UR-CjMP0f8l4^+=6XE ziz{@-j=-GmVO`4Rk@O7B_u!W~mL!=mmQr zM`7BldWNPbSqNgWYV~>Z+>2A>=zA=w(AL9Y_XGkxv^}JT=#ZEP^_jj{@V2RoT*ee&o!Ck7@F9Ha%iwOUmNxKmTJ+BXm><^q}RzgE#08j zwG}IdTki`WiLLnwPEB`wKf(DuRe;FmYlhXyKTcLWU z|G>zQrImyJS4IbmSR;Lp5By!R>r6!>?9LEUN{vP$tBc4qXXhbI$h1d8?c-N}v0)H_ z&Dm!f>4FBAbxd0eK5yT;OzD2g;I73$xbTUkdb{5(=|&8WnH@chRUh?!{^E>=gUdqq zkFt|&n)Y_Is25bnf5n&D{7J%VRB!|>wZO$75jyypd5HA3?QLPI5|GMF9L$u2R8NZf1~^q0TnDP59&BMkWvb) z#>zVy=KQ8VcL2fy@p(91_e=wrY1c?gS;A|?xp-q-oG&n0JhMSLH1)zy7+cAVl5nbm zeT6Lu4+|AFkEtG6ynN>(v$RE!fxSeio4XwqckbxEl5NjPwX}|o z=IHu_%%i`Ae>vkzYow~7y=v)J&9O*51WigWA@aT)tIYY4!s(&;2iyKk_V0?%pV)i# z%X|iQ&k$CmNG%*+-zrYHY24Jem2Qx2x;Ds-Vd)Ul7&UCcsX87D08c+HLqSI9W?`58 zVku27JBNCjY+_by4JzNe7*cq@+0IWAugvG2ONyuse`&sml|fbL-6Y8jA}j+!%btoF zMhP7POE|^0qS~5*+c}ZKeb$0??>T2i6N2_vN#Gn`h`&B9ewj#q)ewb#fcG9i+Z+34 zSaGYuP9t1Uqn&XaLx*#{x)?!?lDIJ8H7Y{qH)<8RR(V zxN0PPf$DTL>r8D|cs(xhOuXX_3zBj;#jvKk(RCJBGtSYq*n%^7NwmZ1qK9w+Amld1 zn6#w~r3|@gO+Ow~4~L|c>T?Fx=p}_Ik8r2Nf4yRnw{kZKEsjYWfFV^24}INjE`HQZ zNCLvLY?5*{!$E8>7aK6Z4~EFWY-!ZQ`rlDCGE9R>R6d{MDHbdfAXs0&=vzzBzM-?m zQNM(2nj)GLo|5kotXVboweu!P#7CNKN1$#_k)#VR{chevwjnTBxCFksjXP!UH9=yx ze=afeA=(e3IE)9ogPHMTR|&Mi{U(Gp{-!5(_Is1o8>3z}h3BZxFi0 z=Ypf>Epi`{`xO;qIDR=J28ma(@<$b;SR%M9(zf z_p4c_q+nZg;xg`LPOhd#4ZDE80rhm~?l6psw|H;2izv5Q7Sle}?_Y z`i4haJ%f!MI4l9lAahz<`T9j)qDw-_G1lc^PFZ!>80_m3ToUJY$q4qXqKTHtIEtXq z{65NeN6`k(M{z-XOOO;y(5~U8yY&3nk>x?N1E!2!_*0HdC$`tvXmuXXb+ae#QfZjy zx8w1y)%97m{|DrxeAWtOZe(+Gvw4eo1`9DUGB7zYH!w9aIFtR67z;5mGB7zYH!w9a zIFlff$OADTAd@jwCx7i$O>f&c5WVYH%yAcsXhiB82ny78odjvpz;>Eq4IdPlu?5JM zD$%LaU%xY?HkO?h#rC`hQRHyu&6`JJq!2s^AWcAk6ah#YQ}98`00fyL2|_}ccp%si 
z(-46O!vJCsNfbH>NYA6WM}Z-Er3@gQ^K%DhFSP=^%T~(4VSlyNYO7m_DBr}viE67| zldG1J8^euS7TKWM!`vf79Q$tI3GloCI!c{D1sNxB|jnoe_zeim2P~Y$mot z_alxlf_M0#l~Z`~H9b9%VCX}kr?Z{}2CO@c-n@5u#(H$7@_^ADMR@FOJ^WNqDSm43 z?1A)lZ2@*}N0WKc>q^`OMyk!5qSjSoyTWa@p`Fp~$A{7NU&EV;@W@_fE9z0!VPN{x zS>VB$G=Iu?DQiOA>3y4XcZyQ-9A@>UDpqS+CccB5h4;<`Z(ixFEb{l;RjGjBr&?`3 zQ<#Hax|qNh1hLKkq5f9kGk(Jt{EpvMQ~hbx8W&Wf@O%CGm)bvk{`=~3 zO4`q6KNX*}_vu>rG07J7q!;2ydS1RtFQxoF=|@KVE0X;a34c#=v(`pst8b_9ujK71 z`G3`Dd8*wWUyfu8e=Hl(-aTrwWFzq%WUVrvw9|`wBr<=!&1ZI8G_B?u>0})InEkt8 zfgXybUbp*4g#DR`@LnnDlhyOBK{>|9&PlAKj z#j=IDQE_qXIn3xDR4d1Z^nNJ*0iuREE0d0vY(q3MLNGWuF+@Z~K`}uwLpd=yLpVe= zHaIymIYKu#GcZ0NJTx*wFgQ3dL_|eFF+njyIWajyI7BrzI5{&pLN_-vFq1u(R4tY+ zh|LYetU%1h$jt%bvol6kgJ=%MNq-p_rx}3QJd86H7#Qb?G5`Sa><43$zLy*fG%z$a LISM5uMNdWwtJk}4 diff --git a/doc/figs/performance_benchmark_memcopy2.pdf b/doc/figs/performance_benchmark_memcopy2.pdf index 6ad081352faee2b5f2dfc688eb3e33f98d715bea..ecc4473ebb057f1283d37c5e1f1d5c9da103b16d 100644 GIT binary patch delta 36971 zcmV(>K-j;Xy#lGQ0+1sFH#RsRlaC4~f8ASIa~!u3e&1g)k7OwW<33WQhe%dbamtBl z(MhTlJ;)?Yv5J=|+KT^uJ^(!fU>38hW1pnrwY}e9`|ECWHxBXPa^Sab_%=}GUHFn%zODR1}cp1+EeGiRbL;;)yR(I{q&G{YziMpa4<_%P!Gv)ns95T7R^+=Ic;NtZfTuUqPr)89W(jIkS ziVGe-B54=w-lScidy{s7f9_A(l?MBhb_L#_v}mW?)6F-3m3#=$ zTiW6Dmw_+@IXxds&yL@mez$pXIuF=lG2BV{FjBtgUeEivNtH~Ra6pCv zM%BXYrZ|CWj&_aX0^E`uiNJ1_&SjlY+Jp~>U}-foh}=;-=Jg6m!emI|D9tL#2E5=% z>KE;M9EFrwCD_Rce{lekk_SYWO2MIo@3WjhgvsH``5Mm!d_i>LP$+bnIsu;_l&SHs z@IIyls+sB?Ok4PpDRJ1%R7xaN9utsZ$Z7{*s)S-9CLrYAwZJ6|LFxqb{T9m>yC6BK zU$pO%beOwd9g?I))zV@QO9It2?JCU{yyQs?cJoxoV@WG?e-GjyxfUg#g}7KjYwg!^ z?F1rBlZUp6F3`(lOB4!&E>YyNq?VD)9wOUn3?$;>293T_VTC7w2-7B_4$&6ABuW%! 
z20fmb-EjNiXd2GZZ28?ZCs0jOuhU$>7c_~%ZkFgVl+r0TI$;mN)3~@o$t?mhWS=L2 z2vcXFm)k5`fAEqeF(?FjJWXu4g*hNki@6k=;hj7uP|ee>@mzoxJSPUbd5#tfU=#Nh zJo+j15xaujg2@CosWY+GuGPzgAxI*|w~J(rY59l+ zU_r>WJ|}@D45gRt_NGBC2xV2rqU$b3C^h0s2_cIMe-m~SB22CetAxr(9YV9JyZB(; zr=0sjLY^~JsNIAJlL)C*LS>{5p`z=~m_uk<`Gr1`0=dvD{*PV>wVMb1eUqH#D&CRXdIwiv76uwHSjMSr4 zl-;!rf3mW(v9b@+-!yqQZ2moLZi#tR;^C6C`7s_TxLD}2bTeY*$(8ff4whuW$1EnK zXF+jihlh({qdC@ZX@_GQ7~iu#hy;YDSRk1m&LBNnK{%lthL8MuBTAIPAYDd!ivZ^? zbd|Ks1#V43N@e2qT~J-5W}X|KKRr4tc*VQKf05&5Fc>8w@eWE@+AP{ay1DvdSy%1SJBjaJ7?=nGVP>u$C)@;Ph z^9fOq|M)Q-a8h)n*gy*zoaq`9eTp!an4>L$ExVziLOjbbIfjCYmUR8t!1VAyW{*ez ze}0n=SN-j5>W7-=PiMu0iJ=$Z+3HzA&J=FmW2;)Ck*MyPW#s6qsD ze6;HVE#`y@o;1ZQ|1ZL&d&FP0r^Wy8z^iL>8cwF98{g~6f=$)$@>-(e=cq~ zb5B8I8GzFFgT{06;dabSS0>Vt)5b*W1-lXpL1{k)DU4^kKGta;&1I7yIf9OpYbnG@ zN%JFTsJ>BRG@uZ;DH=jFo{=ljV0EBm6Pp&;&#vVVE%_8T!A1M|J|h$7v7^+WULg@kK*W?_gg#n?GSBWM?}WLq`RSwT8}1YkP_B1pV_IRit`vBe+qvDx7K|V zY62y~-(TF^J^Ybbp55IaOLs)C{)Ir4+(mc0wdtB>*3Yrq_;mCH#yKQG^#%S`TXv;pRX_e zH2LQI;(yy(XmmaS^X~v!e`pL5j9Y4fppgFPk<@tq?(X5SqXxQq%@1OXw1pMyiW+aW z)*1^pFApCm-o_GeC*|_kCB;|s0x6%xs_CeldXxarL#9CjiR%lbTRqL~_8OW+qioO& z35&Kno8cp`p$T-BdO-gad-drS+U3WsSWrEK)-hYNc)_9S#$gdIe{-Qg8->hiwm}9U z=lZ#-y5Hx#95`vVNMSOz1n~+G>XQ7tDeaJ2cp4+b?7x!2Ce#E

g^S^Yi@uQ?Yem zyg;{+K}yd$p9~AmBat43O!3<~W$#kqt8xljYaRCZBx4$v$W@oj10K#JpV^#$U5;{i zAjmzQB2eQhz3Qqmf8^);%59~zrYZiqd|*6sx!L0b0yRGHt3D`0ewq*BQpaY}wR|d5 zV^yc-r*GRa=zjEAE=adljkm7LrR(h*+S9GO*yvh=MZthN%1nS>5|4SQ^NQ_NdqTz1 zQo+Q-SMf;>Szb+#H^QBdiwTEC#!3<+3_+4pUOstsbC-J_e~E>YWIT01`sl^6IJ&yL zh#fMu%gAMnmoD-z_xINq&#&L!US^q444TF(koYG94fFiT6B?Zh!43Gn=fpYrqRQ|Kskdb93seTLX4Vhe>H+-d?6| zA9e}#i5;evV~zooo*MtD@&n_y|M>RBv!7n(j~M@RR>+RO+}&T?-rUoxo3s1hp8k%O z5xr}cod0qA_U7s=5lv6Oe>{H*x;X9%fD+CMEz26BGCU z7&FVmp!^r62;lW^R=fdFNY#~5R2k@|R zf3*at09}C2?m#oZUxWb)#&*EJnK8hT1Jo?7T>h4;I+(k97&`+29|Bt|Q=q-ehliWJ z8PFN?Ecdqo9pK-+0WdQ#|0mqPz5fbiW&bZ{V^dQHJ4a)CFDrWs zfVq_|5TGa_$KdMeN(V5uH~Y)b*w)42f5YF{-Pp?3*yO|DUzHmJB!ra##vcU#P0z*D z*~-z?g~7$j_AiQze}(zzvbepOsDqsy(B9Pr?yvg9tek>WJpef|y1 zt?bRr{~}@L=E$gSZ{_3$lotE9$%hE;kIVw-3SehqV&ddx1pu7@Ku=Rk#=pX=e|b3q z|D|O9OZ>sW&&Sch5n%p70_bOD4*d9m^KmhD2LfE3-GF{R|5W@pf@5X|m|2;+0!)Aw zR`zgzqJM~i=6~al!*{mw1n4q-%pNm<>95a!p7cM)%gn*v*6WY?zs{FYMMOkRM~3!q z#{bcYh&XrxeCXMj0Q4;EOaSJ;e};_n)4j)qs1W^4May=$?Cex2E=KtqW|Ch`E--Q1a<^MM1|GOXwH(T3( zsj2>@|Np3s?W}CQ{%!Cvv~I2+3n1_Cu?F`4+f)Y;tv$e7ZDml1V{dKJX=s%9;f9XC}%hcxM>TvlO z$$zPUADi>vDaGwg9nAh(F&1_XfU&c)u@~IOA%7rtfDiM>E}8*7|24z_Mh1Ha*AExK z2R%Q4xq~y@Unk1J4qz1ie@pZ?;s7v;{6U-mM$tcr8^9?3AH>NFV3hoWKBCC}K_7nd z|3O?#07j)hh!w!7`UkNA7}fruk0|PY&_^vA|3TazF^&HqE&!v+AM{a+iLk!`KM_7k z1^!nCj=%IyZXe_PADfS2%&px2I5Gbv_=le513I`l|0BnTfyF=IM^TplfFI4W`Um_V zVfzpG!O-px{J0Mo|IlOq=%xL~bp1pBQLe)u`X9~?|E2z@&GC==BbejIOT+%Zn$5=i zZ|Q%xn(d=Cjvs68fAFVyY#+QFZQWe{5&whQKZrAbQ2V20|4_Po+=c&f|ES)@*4X7A zyf!JYhmRO;|9~HaJ^nzJ4|C6dz>j!df6{+6+8gNn zcd-9jMN>ED5BjeEx-38T^ndWbUVT8IC(smbdBMSyH`KZ=f3*FvMi|e7et&|0mV93` zjhfzP*}22*83{U>x;iU()A?RFd88L*ZC9M?PH2Vb)#tRO9y+2mQMv86?~7r)%JhC4 z+|m^0;AFw6@Q(okSbTalp#$GnCtr;q8}Js;E*bJ~PHtRCO5Z;{dh|pr&J$-MV184yZ|H~48Uv9`1cf8EfTL=mI9%`p0tYb~n{kl?7( zTx7ldUZcm{Iu(C~d*;VGU=C5+@-(oy$LDvw)B4EhApJvlYU|@j)97xGpJuV(fFQnw z2Q!VhrEb*PDo-XsDrA}KI9RtPP#PVDZpA&B>jly{jL#?dTI3m*d8dGhNrwxv4|ELw zg8iHCe|4;1Wy6(1=hhtRqOAv-e#m}_IzdH 
zy|ZZg@uP;IVUduqe(rm-qn-QmauEZJV+W0=f2&H5NRg4)Z8gr&yw9B~fw|o-is_QV z(O^XnyYqdpCZy+TBJRX?O>xMYYW&Z;N*r3sX!32P}XC~vkq3lNew$jnl!H8a7FM`)gHf-dT)XBEzXSZ691gu zeu(0|vVTbjw~ePDUfd1LxZUKZ`V#eK9@V^|?jc;if?u0CHwfH38yWkMI2IFkYtlu~AG$Q9C}zb_=_Snqge3q2FXHS<@w=+$kY-|>Z&^XqkMgOY8E z2}Y`)z)l^!a3tr>cuM9e+YgoFWkliZe_2(NFC^sgHsi{@N`JpeFue z9;OuxV|s)d4Fj^vH&2N^To;#et@25URXmS4l%A&+l@fL9ZAutP2DzF6a=E`wRAvH1 z#MKVr9BGioRNqwh>HLD!JYM2$G-K%~lEk8g2Z;oGQk=)NY9yI&fAxEi zgEHU5;Y2Dt{IrpOBVRTlGgGNAO!_-t+p*ncebduY*2akC)yB8b&C26In1+U$noPkF z93#P7&XiM3c*C)8kh;m8w_Kh|d~D3~1h@RIlaEp$E69kv9O*61e$KLP%*U+1Gt@;v zAgwYm>!n;P>!xS@?!B-H7vB6$e+kL;Hk~d5^;%mG*c92^iFGp#AEaBq*IfRne!&<# zFpb7#+9W8yk(YUr&(&tC7>jfF;n>7ZTF`s!1AsZ_I*nwIu;TDPfG1_E&1lOqtjS`Z zj8ON`VM&h4`WQ!w>WE0@biff3c&$<)Mafcz_pvB4&1$4paY!qXk=gz}e}1lLrccD$ z((T4Rodb@KL_ZGU!r;h`EmE+MTO^rOdq+ITjNzoyxjFdDSFrSBdvCZUq<@Q;Tx*I#$|LeCsl-j9BN8&*{ z**7xELK%|u4?yS{AO4ss=NmOLIp4Pk0k5jn;KzB0@U_D^M^Il}py581Z7RX#tv z;i*Vlm$V@5C7=>We_*0P(m+DFd{mYk5>R{k)v&1c3^lL7OoZP8WPKgv;SE16%%*Ch zZoc8icOZK0$sfIQ1eSlRHRyOScFiGz5mID6&11|4k+>#GYGm~yBu(EnMV6OeoRS>S zd0p29x2t$>^`8795dZ~?W?S}DPSPtQPz8io(h!N$6v6j9f7GD<3bCd-ylRC*mvf;!OHM!&He}g*Z%!D*$dqXj3)p7E)Lne-a;k-WUWn-{GmCCG`M;aQuOL^_%k+VpM zXjO^~iaA?WbnRSRs%H~t%f9h?&J{hwqGh@|^`2)(icbqf|f18WP3IPc&1~%v4UCd%L?UB6W zdtjg|^A8Jn7VzbAym9nnQLoCJ;iC$_mxZ`1h;>HGOLB6PNy}NiQ6vg4S>SEk+j4k; zM##X%Nhmhl@X5mNpDK+PD8>p4V~@IrCkAZH*nX!tYEQ!ZzF;<5<8ouTb)&u5T_$fZKDa!oZ?^yyV@1 zPoba6z+JkFxcM3+;}Au~jlwjJzguo~jP#RZC?`%x6gnt%^zTfgh6Iq6eeEEbW@T-E zP~u+XKGTVD;f?^MG}^5sL_Bj|(R|`je@H4k8+Mt!8%k;-5`r=hLC7*7&|D9rU}5t> zE4B58KOmc9(Y*Xr=dm6(c&5m0@}9p>qF9vsX>?q(k46Gbf1R<&$gAv&axHA+NRpm% zaxb0)P1%~28s>G(k%0o>-pnvx-YBFXT$NnyheE3?XIwQ5txEdBnXYqT=F715f8D^R zb8RP}UwiEmK0;yi!;*G{((^C42?k)^&+)-?!#>L?3MAWJWA zxsvDay{BY#>y}V+_EU7ee6yUe&qk=lIo76}TQ%-&RO`E@)N&4(+e~M)H?&U%myyfE zgomL@5rV5b{PuGy1Fo_`fxt&rf1+V?^~%5|(`2L9{=^^i+(@oEUms@GxIjG*v}M5O zB@)ej=TdHSt@YqLTjDR~G_rtC&UV7@7cR-@y@R(m_Bxy3H0f>pr4OSncUKOX=q5rw z@m>q?FvOywMA1H2rZ5be0XmImmxKpQSoZK+CXQhiE=p-;Wc1T|_tg_~f0-0=e2!fZ 
zXwAsBpAdXo{4?vhj-DAt`coQd%_oMuy-M6eh_(&uM7*m;BXYxku?V`)wZlMxo!qwH zEk^Wdfg0x%bK%Sv*ukZ&MqpcbEE7Y+!8~&AOTcOid8z9AUx(#_rB^=LI*2SBS0us` zrDVGkGd&$OMM)$DBdl*ee_xb}ZyM`H2Ae-Bd9npcg?+>DJ%QGIspnmlBwRp8(f&lj zD)Nhc`S#`Daa&4{WVT!v>XA8RBK_QyUEz_t%u#@aQ>IPi^|rx<5&R;m(Sqt%sV9aq z9g?enZq*Lgc%r1sKywfekqQ~NJ_kBQ=MTegi`v;db?oxR)exHp7pv%(QP7`a+bWS8@B&JHGg=pfZ4E5v5&LZzC0(3~ zQE1`^4wvmMC~LKlK}l^EA0m&rN*7eN!v+k`Gc0T4UkfeSW0X6mpcwu-vtJA9g+c68 zeA#ssR)D0JL$ptce@+#IG~B>6FbrF@Ku>7!&%qpwaVG!kfeQ)d|mjY0oXlf7Mn_lsQqWAHY72n+oR? zzu50IQy3(;7d_?8EUHS|ji0Z?+~-dS905~h8HZ;rmr0Rd84XRgrMjMNAcG-n1&%21 zGuk;Su*-6SYQ6%ae5w0Zmo|)}q*5xkbXsPs+1z5({kjJF5Bq+VK%uJxrip*1XeXbY zfIJb;fl9;7e`)WR%;w_Ak{_8CidQ$E5&q8P7g-*Y*2fD}g<^gHklxO=qFN%2=_-fI zS(a(j?+5#Z)Z75S zQkzfjNqA=(H4QZhe&3Zg1rbt}qDbL;hip%{fB?GNgK&Kc%9aSm*W1;oEn^+HHLesU z_VQ}q<++}vMb9xM&iA~6ObIrblz&_Jyv=Ct$GVx4dHSp zNJ~@wf7S2hzM-jEyT2jE06uVTk=%NGh*+xgGFnmU9cE-amdNm_F`D-%qHf|QAbv-t z**y=bE=Yfk6pm~ex(s=hQq)o~h&`PzR2BP`pzZu;P0zJZVkLGqY=&_~eIPx0x^RGL zRQR#2gC=ObJk?Dclru7w zex^=+wCp`z{=~2A`t#Y(c#qv@Na}Qzpq;MYs1K}oq;9@-Du(s1QdJoO6@!imBs4nT zf1IYEFa;{yj<)*gC3zyZulv12a>6pDbo9vM)X&|Qx>m!K%*R2 z-R%`?8(X&ZdLhE*OvL<-pnbO>`lE8df9|-!ps`xgjonZ(6Z)}XnYrjUvMb}4Irz7& zn6aHJxGlENwak_Fk%qs;1pFMKXb;TGEVwPn2%bv@K=p+#<)4g}3}#v+W$$^}%b^N9 zhECfc_N1MiE`>{2&4q(EwNb6yDgt}g_xf%+rZ3NC(6pUVjtRu>?s?*3R?G;Ye^m;> z*8z#buBZs7^krY^(5ZJ?jmru~S}k`bxpn7?-QHnT53#vr6L3ON`_*JijkU=3JS;{M z8x=!zcfdTO@tlT2^1UFWsZ!qCtx?$Jip!+CO&k1tEeX5H;=RRoKB-Uke)VY6j-l;% ztRGblEEUZNy)n^PEtZ?96Fy?qo z|28td%0nv}MvT&}o`^y*n&})9&&lug!0mxNGbY4C$n!q1x{)&u#X|G59<4bX5&E4U zweL3&;E8;2so4k#oxX1$;S!t1WKVe|-^+$LpOh}mG6gZ=|2%t>&a!mDDbxpx3t!;1oQmTtZDF__sQ4aw4auXw8YPfQ&5H(PPU1= zr}{`}WL8x`5(dmAtNM!f z;Pr^xo8_}Fqm*qzd5$tbn>*GMkp;1~76tY;7Ef2u2sPf05*n;AFuiIpcOoDt(owBESyHp7#MdH{LG3VZna+{c`j5s-L0? 
zc>>ML=Iz>xdxcpq;})(|;aU3mapD(rf0#8bCBDdH>8<_1B&@{0!8AQv!47Dp<{e>^>yDSYVs7KKsOJze#NGbZESOD9X@3krKvA2yk+vGMh57F^m(rgCV#?%D>Q zdXE+lK1A@!s-P+DJ^5BG?72eQnusbpclcvS&kxX;vK*4Wh5oD`7DsBw?dy96CZ(DK zFi@*hbu?svFCI`8k4MxPQC|{w!wk?Bh@C6@f3&>|`3l1+u$)tQh0UZz@r)QFgZ$v= ztpT7M-g-Z6Z;;1cqq1b#aB0SaWa)7h*EI`O_m!f@tAxuWDMR`(``4baeoW1biC?lT zO-U(n_Fz>Lo@?)Ej?mXhjG&3j$1mz<42yi;$t6Pr(Z6OP=TmMu*zrFAw=~!$+&vyU zf2#?|A0k~D@-ZuN^q3AM0I_ z6d0GrL27nSRH3>oQ1FP9kWUAVAoJ2Qk{g-}==a+`g+_#UUJug5%tRKs@&W}|e@&PR zY%_Inp|$P7Y%%{fVoHw?2hHsmC+nA{v(){)lfVP1VlqP+6~ zWBfWByrNSRpthu@(XGMa%B;D0{syB(hW*5|cYSG&nXv0B+L!}UEbX^rbDWY;GwwpHo)+pKG-t4q(`U`JUw=bfyV{q2S2 zSl^&GMKW4#ym-lTW`HNcb;s`CPPagE?w^LcO#UrRZ+Mw{H2grUj?TMwxtWG0TUP8Q z2AYp5$w%!U1waZXLlKvKe_|Kasl<}{JcW6A0~@=D9(p7PTHP<!$*8R}!j`NuV;8c%f+m_6h+K2+bW$-*CaJ;kWLatiDV1P53TmpX|3JKnKID_`Os1l>8Gnc60AQ?jU79l@|NO?aylniWG)60HS$MS70bjc0X4w$Xkn zRLy?1XtJr%vcCRIWaH=>QnXpcLra_v23HAR3$ zUaax}Q6$Ar1`*_3f7WomFBCevuziNzUJMe5B+&{LGMECesX~EYdDT)cNxC@1%baW+ z+PxDLB4S52CVw~!Rqc}wKdk0unW6%kAyepiK|PQ-XjcpUik!Y%qdRnzW6yFNrix9O zsgAC1WZ{MM5=kWMj;lSB_$Xoj9`kYdwI;O@3K3#N-Z_=bQ10sv^ zZG-dj6Uv-Q9;zE`m^`K?FRm=THvwse)J=-Db0JyO>>A!?;oNi75BNeJVP1oDaZqJs zQHZYhKB`{|F+|>FTYyYp@g#$N;k@YST9l{eif=nZt+rl%z!OWP9XCG;=r?^iuP#SH z4*^XbS1!BLfA)hab8W$cl6p3#W}i^2{I@VO@rvRIxQYyx-yfGlUm%~UKK>y@?}kMA zuS!U3TN&N71G)(VH4L}p_W(`fP{$EA$j=aV)SvBRu%R&!NP@3%pz&7xNKDSWZ*fO^ z+@}dnIY3Jj8_jSG+^&l4Bk55IPo|`Alvb%g>K*Tsf8&rpg=tu6p?my(H69vID(G8) z;uborn+_o{4Y_A0gX6Q;2=E3rb!+?c5TGYwrjJ$Tyhxl7lNA0&HdE}CaN{7@5>nbd zq0GENczJQ;jQS+Fs#2y{eOO}2N z%YhQ>;k-c4#~sO5?$(ER7SyI375HVm$(e!LJ>kVXzCYB5m76kSyFgr$lc`j$C0=cq zGSgRlc#KPK;o4Z`=Rw*@#gFA7)Q!QA4S ze3;jzLp&B*A1o95ata;!1a}Z^rkKIWYu2B@?oidjio}1g)7*c> zsid#Wg}4ay6>nyKBa@z=;k%!9Poc2Y>)p=TgML@~HBiZmW@@f=)}O`KcyT)vRY_IKzsI3-2Km5o7J1@aSHW}Squ!^cf`>oX--las ziLkdevaG< z?mfd!tqxFo#v-*lrtY!0m$U`de|D8X<*V+J;Q4rkhJh;`nn`AuC#)uz1`stOc&yjs zLWnmPe{O(GsBcgZEwAeR5poPE8V*uh;=M}<>El%CHt=!+Ol0z~t5!$SIe=P>Vk9@(= z_8^5N#C#PATj$?Az}q zHQwdNpFQ;{IbHo)i+MXmfBI-;k$I1@ 
z;H6J}e}7q>tWrtfyN{E`f8%9n5%LJmOjEa(_m#&PGk=0N;+J`ECC$Bm{28q*-q8>Q zj=Xbmrb-*$NZWJb%>3E+z}3>skwiJtGmT+61>A5~7jEQ8ou5Ks`7u7oo~g-b`y=0) z!ck@xF;I*QId9@v#&k&e6xN&uA8P1u?^6WPq^2TfDi^ULxuF$Zf1X~AVK`SEy}8VP zLJ6MS&6*Rb*c!`eEmXQ&m+emXz8Ddm2=;;v4|!HZEFjd0b;vEjZGElw{e15EKi2k6mapmH|^`z%LKJy4N~2!;l;g1ZMtx1~9Rox~bPiDhrB%0+Pb5fTt;lxb)p=x{o0&nf% z6ki9j#M`u^8-2xxXk6frJz3uGoA)CU{U-@gBp4e98aqAweBlKJv%$U^Pc{rD5H z{5Z|85S=AN+c12N!9bd8U1y3Q?K3||0?qSkdXnEZTTmJ~(Mxw;8f@14?o7oEMkLtp zpN%|*=N-9eM>TGR49Wzi{L67U>f|D9Rt};zifPjAh@}z3>317Na+^+_NQkF~W%dev zR=KYjEf0M0e-RvurBX}0@07rQ9byQ7j(2KD8!n*!Nx<^+4$LGP6|UVY(CVgPf_6Qo z+^g6}A=lB@8VN0fVQv#qP;^#f^_*FQt%Sk{f`2+6R(W8|9$XQRh11FgKYSK>ZM9@PZ5!uN)6xQHPIpS)_$-nA#VquU>HHN?SSdfaksPYG$MnO4wE6Fs1( z%+cH$e{E`3Go1r)kL%~qBPy*+>Atuh_w2JOLlKhwSO?~Jy-L7Qx7MXx$J;7ASJ+GF zv?5)DYXqeIE^KWWYv@UFA$+({VQcY}V*Dx$O0yUQWy>j^ptNKZLgP75BG?K>H_9C3 z%)yH%bGdm*3dIE>2|D=&y^fj6*okvJZ3ooaf29>(JHtG?Toa`DZoSgP9Q!u&SI;&5 zfbUk$^m?O$ENP9+IG$ILP_uYh`o+zpV0XQgjuybN4cY-KcW7@JAzIQcX zd(4$b&Bs%<-fuo@SRz@i;66i+fvq%p@1eF?{L~Fanu);92<&r~A`9?)QFbLCs>W&_ ze{xDoVXdcH)_D&jIdt$`PgjcXm&1v9Jh8eM%A8blp0z4oIH#64b)-b>C{COpliNEI;?Iwze1Dn)0M=U#OvX0LBcli>4_%~D}t1L-hT6w z`O2`-$p5x$UV;#y1DtPI7Bpw5{7D*U(ItBLq zh144ER^w3}#feB!+C(C}5TqT~yr-h+3yD(9mxAe#rQZ7TG&_?F0uM z`%VggJSNh0ZoiYGCq)f!IhrM1QmV>5nTg3@iCVEbz)vPnYV2vjRc$qisM`ZIhe|0F4LkgOs-adN$LPj>m_OUsMCiyKx&_!sJM&g86 zNQ6!xbnGGi$xhs&A(P9$BIoeP!-a{zT!# zIb2ck0QJ6?97W3ihqGCb4U5Qh_o|nvJRUd@iDD9FAOQxIZX3ZYNc3l9f1L&S9XDop zAbkZauohzd)ja--PF-X5YYE(m!5cZmDXInrDQmPKoXBi>q;-5M4ZEuw#k-UIXkI|W zsp-?s0LZs31lCWSDn4YqN!WuvY8dAy-FnxO(9=lWwT)t6Ahzux@W=R~TVYshcu-TR z)C{evru8I+%a5^O@~*-Qe`VLx5Z5f0&>c&1CQaSWR}z{)`(CB z*9(KH(UJzf(K^+RTQw+1BKd%e_`5gptdXc_rv)B5O&7$vao-*G+gc1%JDaz`Fa50zMvC3g5?fc5p#_T{l%DjuHgOsphf0+q zeO(8^cbT+eSdkjwp1R+ji^^DDHdrQo8xZaZ#Qcjmsbb6ge`S*>b>1$_d3jM5Gr!!f z>C%R&vgtM1CI)Jo!O7|Pj8Q@vMvosPtR(K*(h{b5EMiE(XYAWgiiP@@-3?Mr0b4J> z!|7ocCtueucxm+scl9u?%ZQH}K$RUC$K~D_E3)GwDu#&ShwOzEv?}ckt?K|So_V6X(Je`#U%$cY3Sh#)e 
zh)#O|;bt>=dyX`P*Re(VwFwp_GnyvZgAPtb-*yS~e};{=pJJjJ!v;o9TG}#yP>I7q zAlx9p&Uy6KMzt@c4i48PTM3+$^0(L`qNlZ>!k6Z6i5eyQX= z!9%iTfANzjM)xJ$yGm1se5ZWp*^0Z;9gBS97)UL7_M z1%NCmfj?IZw*{k*wYK}?&UoOxiSS6MS9_z8?NsEHrM21Nm^1aKbt$X|2oAC=Y@?yf z;$3vDLyUmzu1*xE^|7OY#U=jf32#Y`*3OY~{)k(hAA@J^dGO%--u#wJY+Lnoo={|y ze^7WX)U`+fE%??M2NL{-6q)hP(0a8{_^4Vo%JGdeGhvzR9W4VyzTowcn^>ZApMM^| z&i+y{Q&0;Q=pYpaFHg=ax~OF%#artmzGu}S0+An~7|TKAiNkWR<`2Qs+}j+xr)`|e z;7kY)9{J_|c#!<-qEU*fx%!O_d7?dwf3J{TcmQFR@;NiDug0HcNwUxxIZj0*}s;IOI2VT7fD|{5c$NoiRKk&Lf19Z7W!ae=Cbv z{=V3|vFa)ts(o@cDEzU<=QqoCiymxFxUu2b!R$>1h)a-c+R&~bg6z)$2V=R!3+NRA zT&4v~cP)XdGDy2ao@hEei7FfQtNE-Igr;LX(kHWB4ug6JW1loJ;|`B()V{9Z`iMT!xYqzk7qkND`wlRowJwPY2l>STB$tbk5wIzTsclsK0he<`g|i!2I-VMpu#kiiGMmSAn(U1KH7?S;=^h^V> zqFb+Y1?Y@D<`bkP_(kgY1jZR_>7FCb3q)7TjDQ-sTMaE&oYUyol%*0>*dKjqmSq;% znt8GPT77ka&aNmC z55ik8kY!evzNRZtTyexKDXQ$D>zRXbaJC}-x-{@75tS3S`16L|!=%#q7sFF|=L*uN zNXjUmcg%Xb5Kg40i0!wv@f1roupgsr6MB_b=LDKF>L{6}f6lRE$JrB86CT+4V5{W# z^G8YIo{NWdv-h)-5#28KkL$?=P52Ht>tz@^z{?MpxA!_MUTK^ z3bJWcCdd;kLEy>}AqgQmwe?FHPoD`uLf&x~`burys`yn$@wtD5q(9DfAdH)&x7ZpV zOb(7=whSgbe}%UKk8jq@7NIr!=Zzj!{|J9HVXm*P1aznlEdEBI`U7K?crc{{w;BW( zm5HM=DcpdvPDYpGm7W|Wp|Y(&o4h{rN`Ro=T@Zr6R;Nveo@g$Y7(x=Y3HvJbB;0)* zY$w43U{|9akybm_?DOrq{nPWE+d(VIg1LxzCH(Xmf0k3aMNBf{+15yd_Hi(Y8><`u zTgN9XAG|fqkX6g%Ea|J?BhJXNm*gko3p0GMObQB^n-)}jnJN(?IvF*oPcgw_ed{8_ zxwR*N0(LJrWom2Q$J85VPfk}NKqU>KcmngxZ}X`qsfH$`(YV(hW-+-5!+@EIcg+@I z2&&hOe{Hz=#xPs;SC*ORK2yMg1ez(s_O-xnsQj_lrHsZzbt21#qb%~N$)$!OOLn5PAJ}V-Q}D#IYTwQHdJE`CMD}(Ym=pXaBAcQX7EkQ&jD@3 zS_>PJyQ`{)b#X;!@3bM_b!_?w{}a~j%e%Zze-uUhk%GNuJt{}`+LSb^o%qOUPPK7; zB3!WKJpZ~n)FF`A(YQL~b?A*x-$bXYga~bOPEyDyMLJ3w10$0Ht;;&{akDzM^omQ$ zVu{wCz|Fg2rU>lko!{-bgbe559IljStMNl8isx8;egw-u$$w^71l0lwsd5b~r&Cnj zV{9N{-!Aal+}gIcw%x7Ut!>-wY;9|6+qSp1Z5vzj?DL%WP0pEQGRY+KZ9d$&?(6@% zm1Y5A*DFb3^=M6Oess3&7IUd}iZHduXE8lzpM*rt7{^g8>a%WA!6#_ZN(3M?NT zW72k%bi{8c-r+$$sM6mcXfKpC6QT=pg1sz71_)shm&{{IUw$+l;95bkQP)in;-u>e zS)tjVc(0)shyq&BGuzc6tUQ^*br^m(W6S{3re_*&c0gp*ieeZfG?ol>##c<%`zkB% 
zS7b_MV8VFY&9AW+rCa39AZ0gxi*B@dhknL~J%iL@6*kcu7mGRFlj}|MfZvnYtR1pJ zcNJdQ*ygYM9*215g3U5N7y@y+IBE-ZK@*pu-IF2r@aj$q%GtkAqCP|`6nBC#2)|IEjrOUdw$!TEY2xp62p1O;$FAJiDE~Sp9vCV8E^r*C3V`pn(8~2&vZ06)Bp}rFfna%Chi`@M{!u!o5bBtTbRHGDCF=aDfKAI)} zqrgCw+QI9F--YCoIWL)>IctBu9*DuR`=)-``xg1UCI>U>lj{wtg6}8%0M`NBe(u9) zU*oinZdWc=liunGkYo03WUJrDD1`kScla8&VR$U+|Fa%|wUxHsIJwR?qf3eZ(&oi(aoKtf%t5s=ZUYw z-SK5JNhCw)4nAfkYejmV7>Wk85$G0G;oE>Uso!zrB!4jQNXE|S#P(S`>d)7AUGQcM z8ye|6U0V&WqNJKWZ<6KE)F*il;Ub?7lRw#tYc-7ihRkmskEvpFya8MJdSIC`6PdIR-FKX2!>OH7_JN@{OJXZD+|&`?4nph zt7iPvWud3K^y|20&MB+u4-8Iap{Y66>f=Woo*T9EQ)-NEKf~tjT=S2qfs|`uQpFt) z(5ZYUr90x0yF%R@6OBonErXK{jO*)D9C#yFVb{~wJF^~(*DJu|6IyNZuSQcFyY_AF zpgQ;o7Oq>rI3ri{?*{_HX)JG-BBi#^Oz4(IbOkid`C%2zm4^~sCnQ$%Gfcb1~ zYs&JiIBk38c*&Bz3NA6<0)0JMDrUOV^a!1M#qR*gU!QLDGqVTaN{8C6P006jpvJ0p z;o(hZ3SG%32qM7`@}d)Z^YjOs<%?(b6mMtEQ0e|H#2tmIBFxGZw~~yCT)?3Ac9Z>< z{pJg16#LOV8Fwu5{8ynkvNBAHn0htU6hjA`8d|+bGy@(R*Kf)6l1-AO2P)@XfU?i- zfo8w!y8eOd=w=kaXNeTZNw0c~PYk1PezJFDzJ3@N?S%!`zkhrC5X6fKG0~R^2W3~%_!>T`Dsn0dC?F>VLQEdO z#{9x6wf6_Hl|J$#e6>StG}3k5^twAcIMp>HoA3A6G_zTw>k^0$lWghUTu&ize3)!( zN^X6~A?&BPahj#Fa97QztLE2jxq>1k=22{!6qxQ01Ffk*u9w%rP&!FaLAA04lm5%} zsw++MwsZxA6&&L9iRRxxO*-8iujkrseE+%482JFcY229**0_QYe%m2eYstMX*Kk2J zD!m{T13WAi8Xw}Cz?>|bSXXzk%LW8p$TecO*q!!w_Y?|Y1ioiNBejn|K@gV_=PP~G3whp71QB`g*nmU#ylMD zdi3h-k4+_s!^OkMR%?d$!7)mdMtOtAjZXvanXFH@=6?q))^2K;aO1fv@vnMMaaOUI%C4a&$SI|lBB#RG6m9}!BfjB|XJ`4;FFOZA|noUDP&OoXuLaf)+T8jf7Y!5u*mI(**C5q8&# z*SBuZP_)tSs{^0U^!h#pdh_+60<=2;nVJ_EtA2W}=@CyZVmI?WoGWbFWvs6A{rbkm z0*v=_dm38jIPCQH#;0>sRBnh$n4F0Y81d2ftfny%3YIg^j~>`HJPP{Q!>H|Y1POqb z%@5#v!?%hx8<>T-BwyN6?PWw|VFMf!N6#6*?D}vvr&E(Kl89Zy@t`KUF-A?t>gh04 zeZ#Y0bTj?3VHHOF zm=B+`0~b72BuUUVnwaX*WTgqSOADSgXZg)5F)%k}lZ$m^a}FrKJOgZT3Rd=GvoB@n z!rxZn6^*k5#`Mj>x?~}E)HQ2~&zVMa^VXZaMzL4M`tA#zW243=er^rd7F8F0xz_{K zkoIw6gM2R4$USHObt8U-vx>XOYHOZTv^D6MKFqp@#i6F;MYSvOD^(kZ=@7*` z6BuHK(#~^OqMpU=zD|)o_`cW}?R9}R@CZ-yOT$UEm;U8#Zt%?`3FjoH2NJt$9L3LSXMVUU)4chF&o4P*Im^c2oVCtwvnmydy)g$p~P 
zZFEEA*7HsKhr7w76_L4|A>MeCg+yio6Act)^rb~k#^5ryB>>DpDK`fWbZ*Rz|H~3L zS*VPK*SMG2UsEB5KmJ7qte47LqXW!Rb{pRr^I;y6 zXw|=zgIlf%D8jPq(m2$$oTWene}@bIk@u0ZZlEyexZWkYaC%SbZy+_kiM0PEi6Vxf zf<2%IDM245aRd|xAg3ssJcL&L31(%4)D`mIfpGQWL7*+X_jMxCEo5=0VO;rSUHRLjSgkMjwUx$s`7^-KF>#nA8dPs?gEdi2mo3L|R& z4tPHbj#y8FESrwGG5!o^i_Ri587ad4zPu@4SY8DeEEDi*B>TDp6OvB}BO;I{g+3CM zmqNVAfB}~lY5126s$hA%WugO7Mv8`ZfD~~%fLkf=u0mcl&PR>#_k!zj(mNc4Wwl#| z%H9Mg2iuADnvc3WytJ*tFW!s{@^z2k#UH861pBN2=?3ti={b1`(K8$QvVaN$_ z(v=8DjrFfKIkE=dKRcjll-CWUkSp`LoKGgj?VlkD|%L|P__(jxyP)KFL zDe8qwPY3(*XkxVr2CZXA2h2{}kBb)PxhX%hD;?mx_$F#K1+8+Z#3zqdQ}Fb5_|zoT zLGN%ODMM+5s!&;TswTsS{8+gQhVg2?GSJd&9tWYErKgK=<-1oX4cxtldc+!3_{fRf?ne8HEP8YFX_BV{#xTY;AShJrfdg#*@$|fN7T)r7!ETBYN$b=Aa&v zyQdVU-ym}N{uVs9O^HQn8m%Q09+_EGC^GHAT3osq@BMUtu5IEE+fUJu3wh*4eoT*G z7rEkutPS@G z*sQ+rQ#ssaUmLz!C=<%asZmk{`bJ`)?ggzWz6`6`yT>=|unC!vjMDGjESAm$Tw=Az zJ!gxf_^6B~ze&M_D_&XoT$@=U3&0Tc9iEz`lA` z#F@$nbrxffs}G2m>0$81b)kUm!?OWgoVO8O^W;9we1Y-N4s9jT?@EGbcpNc(-_6YLl%zt}v^r zU74$&NcQP%-^voWh^BRKeSqaYGRVnd-l41QzdL@7O9ft5!H2MG*~-AhI39jPG86o& zRli@>wHChyl!XW?x`=?DlwchG!P(D6F(A(3PHw=fPiLa=NL^VM(`btS?or1+e)&Y- zxWP&}Wp(cFha9fD6Qw$R{r$$WS4W51P10(iAe1Co0-YAUt?xzXbo-c{8JZ1mcXL@P zMNO?1XxH-Oky!A(0v+JCQFYfu!Mvji{muxw6Bsmmzi?_QA$V`pLu!88@CZZy@C$9mjYk1Sb@7^ip)2-q9Nt&0s{>Q6&=QoX9w^iLb>k{=lMO z5>Yc#rAVkwX5+FaEcMSKBB+p^8rVbFE$elQu)q;@^dukl-l0j<^->M+U=M$?v$XqoFOT zc#pQ6`A}^QbhN(M*Bq(n>V(j6`3rTvx{}CSSR8ND09AL`;%5PRJ#bXZVsk|SfWjeF z6`N>8ksK_$06>y>E*Ghi6A+jY=-O2wZDUqfrQMdUKmL&r z>@kWiL4%*le{Sm{x{`NPk!|NGN%Et_EX%+SVae8tW}W*dG7eVT3Jc6ba9nLox*g~e zViD?2ULHrK8<=*XzUm%wRKSfj8v(XvhLhpqMp&|lP27A1GDIDg3Ud@yxq=?Dm~c+c zIDTLgSTISGt4)qj3F$YpI{aKqzCljDXBmSvTp;&Hjs)?c58AKM?)f!C8PSn-f!4tn zz$;pdEs2Osq4}F^P^`IchHm`~{L5tM+-Kn6wHFGHVP_0(?ht;i!||)1DR}X=jA)ue zx+BeMEnVao3vYX#@s?ME7Nr2?JUbqAHG>BmJo+G}vB!GZKn^o#u=eyH>3s>740n$dl-R-a^z$(uY}(2yLnK+)4ZtjmBXnwTak=>#hC0MNZKzQ`xW606Y*0v zn53UAc6#@irW^9P0bOUuG1<}BgjZDGdmZFb!#9?B`?h!W1xlJS{NG&i!86C2kBFzX 
z{N)zixzaYhhjkLWt3pe0KXSgh3M0wn{DH69)3ZvGWAz=PT1{KddviM~v$*F#uovk` z_6Q5IC3$RQYvF8MDear*b^pi&a(3*}hYe>}0&{O6Etv5;^8$`~(GiChaT3upQ(bU= z^WB>9U>jTRXkbkm_Y6n(asAf~*c%!N^kcXfrF&)^IG}(UKVg$LQ%^+nA%ggG9H6Qx zpnjVbwlPTyg@%Yz9w_b2qWd;lleC%1MQ7zOTND2Z@o%>85zis6D9ndx5ue!-&71yN ztf@02wUH^NI*9x?he+#Ax*T7L2nR?fXHH}Uapa-m<8v~?xqrr5rw#( z*IB#M#<;sA1N6Ury>f2PfD68dCosC&kLe&iY?wHTn@!RcCbx5^n!WzP5k!KNro$e;?8g@K;wT zV8Q{~zjj_vx_vv|{JsQ-ybJzqKsm4etL(Qu(l{AWsCgN(LZjsd1$NQ#vx1``3#sb; zh)#p;{qtw+-wh@yUu|}11^-?{5TgTra$$66F#Wd%M#((4?mYCz#L5FernuEdNW5lfb1&IC9Lt6R%=@DWxJ6Z{K*Mu&rZq!_&^b_~#-nI6k+~(S$1m8FHw6%aotnx2l2W=?H5Y<# znyX)FHa1pg2FDgQ*QTcSpv*1xkYMN(2iQ9Nx~CvRk7fV_F0kIt<#T}7fv3HSo8bfT zF1?9OOj3!!{CRRy^Qj}3RfVssi>*^>^K}w`8~dGTo!-u#^31>-y3yU$|1B`*cQWhD zz43Mz_7{&1o;NhIcg`27Go3fYxsx14;| zhZvqfU~c&C3i%s^q4Hw)v+UClD*X!@$n0rIrV>g$74@3{DHap}1Wy6#iT{#PIX8l_ zzm+6Z#ml=>%1RaOEZZtC(N(45e{*FeR8NU6a+{j|BqBP5Wkuegtu zqOWgmU$UU8Kr}S_4q#2g<#U1wllv-puKXO%T-90yPsZ1&{A|?(xu1}e`x<^?(JAI2Y6%?= zCI4~(6*M~rexkQ+wv5M3~QhS(5g(s$uA2r_m9qyjeTz`xffpSo9p(VK!d#*f%v z$!{!|QAl5^Utu2}MDODteS^0|PtX{W+MKDt%PL6J)V~kpJ}*DG1eSYthkaPuwRph4 zbU%%;Ha{T+e|LO=YYLQleZu=4cz?itNg5a1O-&JDUFIzJhJt)WeswPT3n}E%Y_woN zt86Kr56u8qq0{{yg!gZ(?~w;@>1hl10^6BA?ffLdB}$wJQ1;y0uqDyRxi;_UCG8km zR9+9{Aj*8$oY1Gs4n{1|q)l)nZ`exmW*_`2RL!qkIagIp$) zFu6GGizw4d05{kP_cplIp8^=&^))^-y(I8o%3XjEQ3+uMmTH}to=HMWl<5CzPN(6W zF5WF+>dbHQH!Ov$yD$ z7@!MPpd9YJ@+fCpMt7i80j|{ zk0dfiidU?I5{W|i&_5?Zf9BYixyoa#V&w>*ud)t{PlGEg){wF9VcAJJ%oJH_j!>Tz ztyX0o>`S-k&#c}X;I($h2^l;9uVt>oj>{AaJ*-^11f6wRSU*lpoJIw_y*+kXJXVXR zQe+!4a+^>!Gbppp4VD5(OvUGm959orqkfM2?RpYpB)r(EGqG(`I70C-5IK5m+iU3F z+l0ti39mo9jBUG-JK!SL6((XC;?HcQ@~2{bs;N5VN3F7r7 z(vic&+~^hzZ&~A=xXMwI5vtFb`VCxHZiX#%^3RKYtjh5Ywv1+XxwtT!H-@%N`RASZyq=-4FHj_G=7(HR9zk>Gc znO6VdMW0Nqr0!@%F_!(rh@q$&nOfEi2OJ~#y@|W+V!o#X+~QVCmZ_5aZB(HuXp6Ph zZFX3(%u)E5jA{{E_$ZT9u*`l!8Xss=xG<1+#mGU_vsee{F-0<&kp7qr5zt#EF)O0V zl}yWycMwv$_rnEqj8OQ+Ko%G_-?R#voYEN?(=*Xqm|tR)uP(k#oPKPsyqRR2gILzgjje#ZZYV5JNT#YE2C-BU=(m1hf1I 
z+c@#7d6WRMpUe2g{wC*_M<@PH`bey}JC<>dWWCcH&NmmRpd%F*p_R!g4>}x@CXY?$ zTm)A$K%z)pd0RZ-+pav zR84_~R+)?9|b3xdD`e;l0$4GvTP-!KrC5S^x zhl$o0Xvcjzt%AtX#eBbco3;G~ff<-+dPIEC#i7^t&|q8Z=vp+qz!~i2I(!OO(B7W0 zY2$gtr05-j{T&mYlJHLl4vfksqo^>N6@1yh?R+EO9m>9azI6FNub}E=;)J?164#PN zYA7>xNpIj1>nTu*;4oHJuygE>%1T1(T}ws_HS?p(w@-n=5*8Wmbb^83!LQXhL6ee$ zV&b)>RNqn9*Cq&{-Px6b&$|9LB}bV@D8yMg97v#>KMUlka@7y;A>^B@&)07ZcKwSJ z2JTO;I`)`kZL&iyg5uv`gSUC~^BY1>?C$6$+-z5Aeo+44$U2!Ty21vDpvj4C4RJTX zK)<+l4I!@~$;;X(hFzEFsT?T@c-Xkt<|ATCV+jak5#r5!n@8~dA-R5fA)I{zX~ZQU z*WAmZ&-Thq3_TW*sZ{iJhx*BOO!}1w^h%-piiq3(ei|CN7wMK22^FAjSzD_o{LneI zajr*el`X_F;Y33|)>CnW33B#2b9<76V@CS&;eMq2+uCbQGQeKrJ}BKJuFHjGJd zFD9B?#LYApBmB*3t5Y&U0$TPDPHCG=kh5;I1*;jpe0Nr1|H3i}REUQ$HUY{T&<(kO zGrypaK~O$GB_gE4+A6cJ%|lVuE&gqe*&mBX7sAaC)&_+OL9O>+(nMfJK!Pi~awJ9w zccAWKyJ-3l-C^Qx2KKeN%T@pQ*f|8*m0X+_eoaov`QF64Wxl^rib`fWF+qkg?aq#_ zqKKdze=(4AWOIODnA*jt38lTCQwwUf5lI0svo zKx&oepO5M*;{6!i_j()Dy%+Yqao(zXiEI`!0$H8|!Ec}SD0 zOd@yM+LEY4ks#fRCoXsb3Kyi@)JWLHuS@A?ox9$VNrXW#O{WlWXts}hcJMfldO29|IC^a51pbfmK7QV;}6eZ(& z!3#HI_66F0pSpZ*(CqJ}*ER`p5SIvsOA<}hi$2PX=Hj&5lEwQIrrdp)42R#1RJ`jW zLf9oC{oCaaAB|%4AAG?S2oV&IoZDhG;rPlqR&oI^xElRYQY?>e0ucS6{tgrt(v#vW z!K2hhQAUJLso5ex)aEGqs?%x29OX18K6v#Y#Mwu(XJk|$Lc`=Ww^X8i$-EB|OT#voqRGt&CmHly9-r86>0e{9x#FFk^$BMsXMYaZ&_reIMfv_B zTX{5?SN=ws+eVvHobS~2mw8>W&>z;W=BZT`^4^})i<#pAiuz8KrH3G#@00_s{L;hF zC#IshDxF^Jd{Q$fzv^E8Ri;$nK&SjnpvRIZoGSVfrL3Xg3gXf<;>_`!p_|Bji6F<5 zlN=oIWaB!GKLsX9*|eIlO3$-&K{BCW|ANceXeyiYg=dlc(w-L#?h8W7q6u+o-S+B8 zri4oi>K#G=2DCgmGGc6SxdF)nhb!Jcl41!0y#6Axy+1UrnbC!UVMR>T`IXoMJ3;$a zmnl17So0YPN;|TM`ZW*zqlVf~euyrH0*b=(If-%AKO1g4C+eD+M^j~30u6Kb{Li;P z)Dk17QFx(Kg!Rr&7PNiIeqkum88}>AO1E4Y$MM+$<=2@F+H8*nOKP;@ zwxaj0ZWwXJBfQ6rc!kpsQX#vGTa@2UiI1bOPX_F_mbGLhVB z8jchAX%1VvJJa74klfH=dpnL&t}D1$BqeSX;G|}C-DtSlQZPOF+Ub*rEOO=N5isiR ztAC^7#VzYf&`)0-cWWcFuSKA(+V|Bx@T=;W)$pMCr8*KPTyjC#QPgjvZ;dGJ=Gzc= zZp(!Y5q>qX=z`N7CaZVWgBG=txym}sHiRdzSzxi5ar>b1vGfXR%i2hJGk=#UoBUoc zA%WofAZqq{{b_q*l}2Y9T``JZebgXu^!byTRT+odj9%E~<%mVtC1DpoF@omMkNjPq 
zpWvw2);SV#YEmC>=Y`(pW8cq|yv~|)Wa`MPMC%q3=JOv6@q=8Ac%?4U<+s*)p z$z;~Mb{mpgwb-$5C(yWnEbPV{tklVUR;phuo-De8=jXV`rIoRFb=W(a`2I?cE9Bul zo58KH??IZrkKvm|>Ls$Dw3@Ep<^`jxd(mO9K3VZ9t$71Gy$;>7zff3YGrJ%$3NZwb z!yE$w#=c9d;X?|#D15|oBPZCty$Th?d z;IrI$Hwc#r4xL~uA5)_Yye*wKI5FmegT^|JQ0$4A1Pg~qUpA_uOBT_4NASRgNU0eg zN3wAO1X$(VP+LmkH~~5NAKFldv?z~BzX-hvB>f)*<-;?H+%()Pj(LG$IP$E6{}SSG zMXJf)4*P}H8zIp=7e0hoHcl4qBDmi?yJ@KVydrq%&x?OsL~3VX7K$=6J zlOvnlGh@lcSt`hyJ!l@X+escuGC ztv8ssyP$ooWCjH4OQ%RZY)QQQ^YB#zh0kD)OBfItD)8R9@THd|T9U83(eBDmhDyXrBl@Ms`N*00_9*6|r{;)OMOB2iP?_rnirEyNj4-Kvd^ME7et?++@dIJ6Q^|C zh>zlax53$F(DWo=taQjux>u1i2DDrhar1Wk?!CC~+;c^Aq7r>=Q|@*xrvcCD5o-jM zvLE)!^QL1YqJ&MSdqfs|a*ZbbxlTyB$+5?8_5l&K`BN{n+=&=4DuZf=I&d{Z6m5evnUi=-aP7%I_ z_{By|;QiGSxBj(_xvY3jJ6yuEo>tUV&2&2|Vqlm2%Ow4h)dWWk$(e*(MlYK+%-SW|h z%+}93o_XzruP@X2*6t?;bnt>sIrURS7Hi=(afvh8n*FGxQ+Aj&&T)%+H<2nkW&#wr z=;V)2#B7;iMJWIJ4#5Zw7cA>iL(97&Z$cC06MhUS zvZlThjl1Py|F7=MmQToR%Mil0ot_;=)7eYD?Lh)2bSU5;U^@_+iLlH920M9DS35%lHLGGh0QhZHJAu>e>g(bL#@i4!P>6(w@J) z<_J*yC1~J#a@oTLoNMdj|SIJJx0;>z9T3XvLci32BOff6ZPt3u$Q>6=vHC z@Qxw@Ph3p!ly?3RX5U1P_V1j#cy{EfT<8MI;m=yb8nv`9?h@YE*En~SWKmHoWm2O% z7}D&s7C&$j6Za2)a+xZ`d76jjC@YM`|IobJ`WebSK$6-!V*k?yRwB7TrI$AGZ9={e z7jX>Acv@CYBnUJ;%aVR&mUM>k3;V@aX*OdID1BYfE{6PeY{QqDiK90A3eNoL4_N~n zWWHH+OaU%n^y*|*KYig;CU~HvRCEftRhyzx#3b~aY(mO;)})n*y1o#tcR;EK#4MCy zuSM&JWxJT7uqE9&GbcIQNJNGM+Z9**5PS}n#~V6$ zAYBA=0)C+HqpoF&0vaMkZ%IqOgzOYf5zd+w&{~N)4-9wCYk>_g$E?J`k)&nD=Mah6MX@e~7`b4K~Z^cJ$m<=^BY$zA6vB#DI&22$mn_idE!; zS!Ao~X+dsGJy-j~LbL9`#WiXwSZIP5m~!CfjaZgtU7(`^^m$*;nTHEGn{@OrIR{pT z{|IwPzV`a*@h{{NOdGET?4+}|0o+rZ(nZ7JLMRXQA$MhPFCeN9mV?4x;-dC2uN~cs z5$S^(a+%e;De{rKX|7y;zN6Z--j<>iMPRAW^*-d>Y*k#b-t;>65Yp5hZ$=M5P5G}Z z^nE7<`Ylb{Addn{-k>)oR;%J^2N|YPY z&tSDf=|PZfFW#x7QHfZNLq5zvcJ={PN51wo_4@(R#pc}@xWm{=HJx%Wh;TvtrdB*iIvn!7R|wgfr{_UyKM_Cr1rps9u7`0j{t-g~57FI% zp->&VT8#pIp`MjwI$S)?vgQ`ZyHblivHZln;g1%|>o`nnh|DvoC>d^CSDEF$1pcad zA*10>=x@^ZLI!RP9OLuA9y&rsnwlOpyO`(LN+Y^k%*ciJ@Ti4f<$SLNb>r_7kkJav 
z%C@(TT2nd6O1%x|I;^f>zsPf%9Rqbp?AGGFLUJ}`a0jLiM~ueKMRgqq`4z*mWa^3( zeG~V1F@=7YXQZN;wtD98;zu);J9VxX=f%_AuOE}Js&u^p*`C?<;Z|yc-1G+dwf4c&x%+(hXcxo|?i3b+F zklZUg)y~-q3w@a^VjlAj54oIzE_7{nZ44!t?nrk#K#vW93cC*Oxy25Cn8*Nj5KSqH)+SjJv4u#at)bO4kc>qbpEd)QU0~Zz{*gb?T)C zQ)U?Al%7lv=7=jJo5sFNw{wI{yX$L6Mj_Z}Hl>p8jIL2CL*@h-y3LC9zAuLdWek+Q zTijMjdr6o;{9bH>DSd@`o}sB7rKKD1Pwa4>DsrHH&u7U38NO`@xp z+|J1FVtSWm#W!7fEIf|=sP)c638 zaqsieHUXmvKoXAh)J)aMN9Z|XCBG(Q4fcsE`?dk#nD9Rvvu?28jyc7T8{phgeDD(GL0Lh%|x zQcN~`3d(26D(>TgvG$YwfYUC&+7v8jRY`#AbR!*+YBtOqzr~?MOXd%B7GQQ@H1G>` zK&|9DIE*!|%|917;K=!`Hquuozxn$!{C59OHcpdyqOz9Wdv#s%eIlh@wFN*Mb3INS6!tpF)~OFil`cpP9b zor*f&{wv`fGjpjof;~NuzV0+@8aINvC(>U2_o9K2-^?Pm3b%`zY(kBJW2M)zN?G~( z%dZsJ{VFT@hQ+M$@H^d_6pvwrKscJ(YeJGHEGZ~!l7zCu3GxBxw*GdJ4K}-)$t%{6 z_SUps6hcF3nUUUtlH8~P2Px#~&v<|?ND>5<)O24Q|1Un=0{$sc5AU|Wu-wA#*0<>E zzBWR0(t%J7u~&)u8btb-a^LPh{cRb8M(H`Q_l2gj{>L50j`77RvVVc^AVd}1q3MuM zGWgZYH#UiU^0r*246pZ1Kru;|W+aNB=liabht{ke$3G1mU@sG zT+iVI_2aX9;aDHGdzl@9Y_7%T`NYgUTjrGO9l@(kamMby6#|db&$Q8pIrBn|@5R@n zhZ?2{`_oj`Og(8NZjTPR=Ny4mv~|qDxVPLcoF&|uA7kna>wIor7sK&1O(O!w+qHK? 
z`q`d9?K`2Uqoyv_EhR;4JgB31%ErPj=>$UK9P_g}&T4 zl7HpV`Em(doD%?KGEEua1l#hi_Xy`=rNAg}5nv}kAq3-SdDQ;Cf`$%&H zj6JVCfZ}=YE#p}Ct#7~|@xL1O$4ju@hL#uk#y??Pe819Jw^6q;6Ab!525Pa@oYN|5 zaERzMD0h;%DMYsRH1}5vE3nkjw4h`3BoC|c782l4XQ0& z7^G!u447bRyDN!at~j_Fk2`#d1gH?v5Dr2H9=+42>>7TBm@oqmw#pvE8kt36XBw(x zY%79A8`5u&iu0B68#BXCScjm_7~FxHu*dI&<$s6|Ck(0Y^S5J%QwFWEYZ%ysm)>nl z!V-$ibVNkqk6{XLi|wkhH0k2^8hH{G1fyl-5W59+;zXf5GG3&lm34t}GyOARl;~Gl zU>cnxy%$|)#RQO++zC`{&FW=Yl-rZ@qc+93>leq$ zLAEFS9$qrwb<9xHkNQrZ$k#k|wYv_@{>sd1$@iBn`*=1Yk+D?J74Pv^p89!7u`aUz zF8zq-kU#sDXuLxPW%bu#!%DlLV2g=e8E=q!LaAdtBQo&skSdFk{z|;RgX}UL#!uDn zg?`wGoQyUZeaQ4{gapb*pRNe+=GgBo-$8qLRRXId(-YB1->i>`IG31j54kkn$x|5b zmOhQoQuZ2f6c6UQ3|DQ;1@rCofco66g$VvKt6+%dn9NXkQiaQHD%>o&WfFLg^E17g zb3ZcTWJ(~PZ+m6fL1K3ZM5_az_iTUx`VtX0M2zxd;u79;Ef%Q~^kc4X_`V7L6!JcC z5eyQbkpWbthpYDfl296SFc7D^x2N!2{7JwSS!8A%!5~JoS{|PPXM|&{U)84U=eDw~ z=BXA8yJ;g-wu609zOG<8b*Wl8J}Fp9a5MJ}iL@h+XUaBqCr zVMIV%y<dvv*u=tv6Y#!SnG#2UeGI>(`-2FVAgl{L zO9O?)yy~nvk#B6BIUn?wWsAH_uaEqtrtG8Tkt)3;@FMZ7fxeK<=3a@tS|{canBjL_LaNtu0rE!EB?&2u?Zf%(RCv3If7H zN!G|b(mz}zPPh5NXLy>kY8#xn=E(c5;Kj)79!AyD5MgI83Oy~9pB`JN(eCQtn47S4< zyxq`RA>t$`tx94Rs9D8u;?(mhKET_06Ad(8V8Lc1mB7>$TJ{~beZQ=gRBqd`5KxZ^ zhe2zE{0fr34j1GtDgLfZ_~ZB~yr#8KW71{I8{uFaEt*_^AvO|s)Pfz zA4Yf_QwQ|WiZ>7-P+p7rsuDj#PBe|}w#D==g(e~n- zJpCc0e*d~boPT=9@Z_ZH=L4u@8)2DCM`*G*89sgElYxffPqgC*beI|gVUoYsgm}70 zD>rNLSS*Ucr2LRfbpKWe(dcJfc<4Uq}f6*i_ou+8kYR8 zd)z((+MdsjZ}GQCL=j-Kqca_FO>}t-tB>mOhO~89ndtbomj#pw#ZYNE{HggxCW)5p zoI9fwN?^r3th0`}uU&n3R$s1YjFAhw#gDH=%!G!1(e3(^4lyXkw(-LO!TMMGLR-k? 
zE>wA6`Eb)+y;g+=XP$_?hO>^9fj2W5oyk}irOn*3wCpW^-xB~i;mlLLG52CnT}hSYwq*IyAM`W*)Ns@xCw9#~MsKw#A}BgzM{M4_(LfEPXiR@}XRv*@NT&j<>!EwL zwIh7qSElO-C$HB9`=20cz6-&)&XO<_4l!(glC6Z5K59r0@(mddF?D5rY1*0Cv}^^& zhE6}$Tzfp@*a0w+_QQ>^GTxOpm3H!O_EVI)&+h1NiJ0oYh^|bdv}9|yDMgDm^YDg*Z!)D5gtqa8FM4&w z`Q22hiI5-)idX9gQ&%6}NaWIOBeN4mfk;1$%d@6k#Q-4hkCdG$Q>xHD=S3;Pc-i2* zd#diPnr3TDM%QR+xwHLsnK63t=#pE9M|GBPTC&g({4H8vN89m%^jG&|&7_NmgAINT z%nU>BBr@s;ZDo_&%IZ>GnWh|EiBjXAKFBi}BALO&YcyUL886#=BCYpO0!rCT9c<>9 zbiB?EK_b9o8d|J-x7NyIE<$HiQacJ-V<%S%s&oz`$jx0V`+17sw$}nf<~R;^&kxCb z&0d~8bJAJj4-cF8HQf>inLi9>o@;({h;ZJ=P$DfY$l%iUJz-IIg~68%Q>aFBccY~# zH*BGx8@#h$%>&t}N5Ti|tyfe4_N?i8v5?Qi6%m4bz2m(eY9Ds1sJ)Gh8yqNhPg z-pJ536I3th_O!E~d_iQ;uxajy{@9N%@Iaa!ysb|}&@fz(6Tkh=!fK_j4s`ND%U)OV zL|vrL`g^O-)^p`+X|G4W82)fn@FFH+b`y|(IqTytpQ=nygHKe!NMH|+su}zM!AtV| z`Uq2jSODo67gBg^McC>jU&}rjo>4%p$0DpvlNqks5amlSy?Tj>3>~$gojKdk~Rp_mttY zeo5tOf9PLB6_wp!F?U%@`U;ZF*l{>f86E_l0AWl5QVqGbP`0$@z_7TQR}G}Q&Lio& z5s%5S_>4;H7&qDnSC>EIC^%z)EZp#BudgODGRCHtWRs zrd0gGS{T32Chi>ySOglyqesDLrZ|LM`DZzso(?@u3Hke|)&?vO)LfzHMI-hB4TxeJ1grCd`qOje(e#xUL+(6;L=0U;L!9hfDG*JC`kHJL4p<8D&r93Hxg@sI@-%mgBEe6e-V(m7$a{1Cqi*)EmmG0(b%k?Y}c7auvSm9F z=}xU*_U$JrkuO}Uq3xn8e8{E^#(TVzZ=~ae!VhTwRh-E!^E<5A> zVV79NyLQxXw=&h*YtzZr0b1cG4z&n;>4gT>6YJoo6C??28m2_kU0sgJfBTXWB@qSe zA)g4U?u;N& zG0W}2M}c^x%t6jJ+nMQFbTm<_qLGgkP9-XwHDngYiwFjLK$KJLbQM zs+e1UO^{1eS!+~rDf6!e4KDU~9(C0d35rd5DjHB{r`hpXe;!e(eWTIm=(QGkYp#Ae z2HqeTI3kA?k~6$_*9*mzm{3A;)wHdz;<(xBrm#?hx;JgNNLXQWW=g;?AW&n%hz8~! 
z?ggw$3zHs|YCBOcg+9If0O#tYcDP~?GBkxH8Td2`OwN& z3rp|j$Jl|dH_ssJuJk0N;cfC0JdT+<}*4RUjx}3yQ_(j zP3{$IbqdMaHS)ajeuej~o*!cvrEU+(0<4(bIRX+|NShQRjkwII(I*DZ&=76`zD4My z`<_Gne=ukCR~4V1$z8H4<5oS=q+)W=wLWcBN?XFWPMoJgg`MeeJaK8IDWJ4u)fsVA z2ujLE99p)s23U)~&^6sGD26P2tA5(DYlsijg?_5w-6}lS{?b<~TnNNXdbj zN;j6l#jJ*@KQ5OtvWQrN?0LEHcbA|7(bM5Ne?r-O1g7g43MsSXUpZvEv2^xdX3BVU zd=ML|cD;l*Ixksh>n-D6UGB#b)K8u+lJHYq$PL(izZ!@>>f>0M8F#@ZS=Q?vJ07Y7 zez(i=e>>JJ`DMf`XiCz4Yn_QEyG*Ua+TIH ze_Mg-Jz4Myv74)!-;I!jJ$`I1nZZNyVoL#HBB4@I0k>XR!-3xj8hu#yW{VkzG1*Zv z&O{L~JG-E9cEwpYl-LA%WRYp{aVP6dygi%dz#|O&tU|CjZPFGoJzyD`i&V1S()WJA z{Q$gjkRtn+K?jfDW8X3F&Eof9@SGS(f4?NiqMH<4X2b9G4wS3}6$SmI?N_nj0ZHC@ z)AH}k>h-aWaCJc#O-N(@-X(qwwvhGC$e8Q>GE$ksm=1Q|_Lioc807zB99}i!g=M8~ z^ZW_UqACN7ixPDD6&l|;Q!j+DCf3%A$(8~C*hG~VV60X@W7SbmJ@cJ|egjv-fBUo4 znJ#?`so?l3OfF7(+F1WOnu^?=tm4=~{+@>6edXSW4RPHvWsy1zc@HoWVATIln6ZK4S;8bO?%q-!Sb?fa9x6`os zigZdN{V9kYoF8AmJL2y4HS6wte*?_DmG2;nO2pr9_{$tU=yJ!q-HKN@T_Qc){(?E& zr=z7eVr>-`iUcHl`EL0sCIyKx@Z4*dz8M@Lu8u8#7kus5&-~6LQt{ip$Ef<^FxgvM zIz9!0biE~|YdOZ}NuuBI(Z8vkjpr|zp?}@6qFGeuV)A;>X()qaNe&gue@oG+-1K>5 zypp2Bh*@4hJ-=>|F}1E89jMJ}qyxN1pFs_1_(B+A%w$VQhdS!(Pk*SmqB4Ksq>%=X z!?vG)r7+_)iQ(JU;_wDF67+v$9u^@+0=xi}rOF5&lmv)gQQk?vT1b4}ktG%%EoOr5 zJlj+6QR)>n9NVPTR1Bxof8ZjdHv9e4NkRR!(BddnE#9MIB3=ovXJyJ6> zjwrUkY2q8fB?zcVNlJsP);mi~mSGSJ)&r@V8=_$$ndOWm7%$%|N0+*sV><@5oGX{Z zeNTp+v~Bv4O%0cDm|XK@Au^QMZ(LCjFp`Zf!sUTu*&+km23DKST(y?j&pG+6{8T9? z=v*TJx{YV(c>jZw2)Ne&2QTI40JANUe+DixFfuSQF*G$bGB7JJF()uECn*XqO>bmG zVRU66C`39kFfuYQGB7eRG&MCcFq7Jt$OAYaAd`;@Cx7i$O>g5i5WVYH@YqEWjYvwA z1OyAj_B!2<>}G8zUDWVFkr`X4EvXWnb^7Z&Ln@B!Y*1{^J%}cU!#6YU$wNgfU?EEr z#+l$D6F8lTAs!-P1Q{6EZ3MbZcsR5bW>s>cKL2^E~h3l8tWs22V=~Hou4xp3o62T zjYY~5J4Il)ZmN8$4Vxp{Xv|=y_l7N8%;$kIsDD@Yb+fu-^Fve^>4zfMlgq(^pyrW? 
z*#FOedKSoD6#0rJQu?wRMI8GHlL6VIAb9b_5x&h~=!b44gfCcZ&(Jyz1kQNCrJr_Y z(HqIdg?XO!y=0Q$IX7DP{$4M=A0_tMBVJ_TP{Wu~ccZgVX>pKHPOdCPRj4FDDpW*; z#eYfY$JS3O?sFFP>QqQ-%2l`D+3eX4T`D?Z3GeVDM<)-&Av`{c6ZCVTr?Vaf7OYdF zH}9!utw)-QXN*pY=-AtN$dqF#GPQV4klwBoPQLZ+cA^g!zrdP;YiEG|)m@?4jLc?HUZ2b)z+y?-lIeJM7^R z{?<+P#xXZ8uwE{EUHe~k_%IyZe))zE}vAAjq$E)Sd@=zOB=iKrbs5J^Y;L_g*S^V{7SyCfqf zHpp6SZ?Mz1^@vTntnIBjE}GU*F(`otpR%WgK&an~rCGOhy<68=byF63wZv@VpU>u0 zvy*hxX*%Jn>F^inBrnsUkBXMQDt7cu;K9dY*|NE1adGT9%uw)Z<+&rh?+1r}0CU4N zDU(H=Y(q9fL^3lnMM5=1H83zRF+(v#Gebl)G&n>#K{zurI6*!jJT^l_GBYwoLN!D+ zFfcGNLoq}%Lqs$*I7B%?I5RUiL6h{IR8`U!5Str_S%H|1ky{VMXJ-u62hkji-A5Q0 z`@VzNJd6{cGcZnj%m4r&0tx&IWo~41baG{3Z3<;>WN%_>3UhQ}a&&ldWo8O9I5sjj K3MC~)Peuyjxf=!m delta 38599 zcmV(~K+nIaumYXE0+1sFH8eCJlW(;IAp~Lw=hl3sZz$e3B zI5eUx>CN~^>1 zN3ot~hJxkgK6;Y5px@5)@Gl~8b^%*1zPuF+ml&J^KHa+MY4 zj1s50VvWKLtg=?vB4q=v%@w*x*`RB4g)UMy=-OPPH&V9f%3Ps~lnuHze^=-tWrMEG z75d9EzMJr)bVN{U&Gs2zU=L6Ef_=oCFW|>a`U+jo`lf1y+8~K>TB}T|FIWw#=F}Ig z23k{^saFlQVKh^(TJY8~R`e*->#8=l8In_iR45&s^0bNsI;f9K zn`huD40>aZ4#$5oGtV=F5~j)9p6CQUC7LJ{26IFOmP{(ql3>Z#&oV(Jl=Yi{QCp&y zCTIpFOd`scm`>mmf2Ns3p)ki(N={U)wT3-M<|^*zkqyOG>xInCpoFP2_n@OW0Z(Wq z0%iQO^_KajMu&pS#d9RC#->jD67I{RTu9suN|;DfF7ce8rv*1rC=BL^jvDjW*A0Q> zf5dKYuJkCve4?V`#yTw*N>2o3OkhU4lGB1(LTR9RobzN1f1zlX{8}0Xc>)`BsN6Rl zTEiDxU=u+Z6XVp=X;Bk8aj0V%>o!IXIZMUYG2%=RYt2?h850=sB}SE}8AfHS+ZZv4 z&AH9WTk4(GL#LaQK3D=cWyAV)&XM%>%UP6;{@CoHE}~3L7!k%zzDQ{&T1zc zjPj2iEClipDbdzjEhZ4uFsXzH5XHS2`@06>n`k3(f1QWyH~Xzk&Oxd8k4zwQp8FON3LJEyyQ{braS|5^DY9z@Q9=22Qq&829ZFHztXP9jo6fw)!%ng>cb} zFcMw_^12bm5TpIZAZVv3yzn+kXilu6I!kBZgt_)cN+JDHs5|A59G;vfjAh?vU}FfG z=pRhrf7bqwF)9iw&?*@P2NZZ_Neu52RhMluI&Ue{)SW?{iOKV;Uh=`1V(Ksb^6VMN-MZ zi=wtu37sTMGHG~`vd=y1o6e==^YaydW8$TQw^2A69)kI7L;u5k&o-x#MZm#iA=xlQ z#>7-X3(bW{Nx}tM4JO!3vN5MqWrJRf@kzdW6{Xl<;8io``ec(jN+h*}7bdz~bJFry ze<>8IwPw_G$%dwuE~yx*URXR>EPh@&8*Mw-c4JdIpI;Uon17owE1h2&vvL+*jIzCQETy0H@j^}N)6ej^ zS=#q2*cS<{{bHoz)2Q60jCHT_!1E(ae|3j#AZi(LlrKC#ilB@M{mTCF`tHUYe 
ztGQVFPl?^l%e(G@QhRyZ$RrYDVKM)pMG}f8i2Jox+NbqqsNz^69KK+U6k>sFMrBZ7 z`_t;``u2}V_2l|ykDJ|e^eA0oQTt|4`%>!PPGaUN`vors!$O72eebq; zFL?gQb`V=wY}kEf`qu`0lkkQ(LwPFN=(Ge^JxBqRI+JcrY&1qSH+-OJWRt1FeIfI& z!oriyCuNTbEiEVr^iNzawDuX5e?cL8jy~M3j!*Aib%`-Ww~3Jky&3c2FyTBV(!Sdk z&Q5~zV2}At1qJiLO7TEY5~v9ZUkpld`FufnYWw;v#fm%ZsifK-s|;#lC6{7VxqN!8 zOjp&KruhFV2I6aDfWC+>J-*zinm|nq>|zXx%lR0Hr_Gl~b=?2^;AphGf9@+M2+niw z%V4ctoF5UCFzx!H``9$-NoeAbpIpkew(;LqK$L%n#vh1C0yQBKd@&@&=adg;{usq!IV{L$^nZJpIj_=f${x zQ-j1=dL0-L0T4t!c=7Aie|2ob(Lnh(ncu}-$eTU3dvU(n8`!g8cw|v!Ok`Bf4Y2@4Swf`iKJ3l zczu0!b8)%4&9s?1J`bC-mp^b@zpcV#lRl%uRpZ}8ykPwP=kMR_zdMMx7|*V6R_m*q zbnfc-_TJ8S~~cz=6w>a|YxPrv7{cXunEF81F=Wyhk>996gqxV|3Yj<<(cYTjSm zA77pa%3~UQL%fE5f9|e6In6iQm`*=-s4QD$I_oSKXIICavzs3G>HXEktdTcy)7|ht zUG|r)*B9MAKOy~Yu5V7(_mN6E8SYRVOb*+@<49H~eOQjG$NTclW2S zFfmvT=h-hmz77>H&+q01eVM=0chyo0=7<*91AEbGB7p@FHB`_ zXLM*XATc;LGd2n@Ol59obZ9dmFbXeBWo~D5Xdp2;IW;+xa8)ONjI{%F9qQUG+*pn6 z72CFL+gVX#+iKF-b{boajmEa!#&&M@+2=dm=llOL?j0FpWzEO)!Mn(a6jkViOzn(; z;&!&qbWHS&+yD_dWhO=dBO?nvBO@~`8JVi3vo-MFVpuYDprezeoh|o23`87(M$R8H zQ6uLMJ2^XBfV7K$HGqi)z{JkY#KFzT2w-Mp{>$0O#Kg|V-pJO&($*YcW@!xsD2U6_JG(p60*q`;|1vbP zcC!2MH*z(9va~ib{xJAg9 zXJZ4jb#{XNt3FXnN1(|^+dUZmp6nM}J2zXee*-g1TT`>YNSM0VGpN~GI=BF(ME`B_ zA%guQGY2{Y*ccfZIk;E=KnDQO-Nb_7ukflK_P~FCDVhEfe=zXzvbVDbn0=4{`dFF) zKmNgbIT^VE0nUytKp(GvD*hY6GBE*6Elr#O#z1pRTi8F*Kg2+@zwyWL9WC7fI*cE? z#{^*f>+e6`^ghmM*yNrMMO4*v(0l5CI_M=VzQ?cv67l88L$3X@7?^yD7A6p9qQ2tqR zT}C!WlaCLk|L0Zzm&^a(hW{1i|F-1+yC88FYwLfhDgUMa|EP^@EUi8MZSb+QF3ukZ zAZPb+2Dbm(R0H_;(aHf$EnRH>w^qv8=;IK7glx^N|I^5x^{aQaxuf2n{Um-F8##cWONO#eDDW;S+! 
zk)xxL2kgg?KM)(hi|OMQO@Z$JT4DeLy{(<|hYR3?o)5sx&Jp&niL$c;7=-^4{f#(( z01P635Ep_3Qu3BVxn2QdQ}B>x~50E5&Y#0FrH`GY=U%Kbr{00#O0AWlXAgW@0b z5mWgO`iQCW2eASeRR5rl3~GPS2M6^(=p&!TAM}w=^FN5|Lud4d-$zU%=Re91Q{zAA zgS^Rq5bKBO$8%=$$N8^eF_``XezXUF_#gN&3x>Zp((mS|(EckzHJ_wpwy8dxu z`b+Q+{YN{@>|7lGk>kU_{2%b6D2qQOe)Pn`!`|ZKx&CAEA+!7k{9tJP5BNdR<`4XM zG#LKj`|(OK*!_wAVf*nT_7C+(b@qSMAHMbOm^*#EnEwg*F=;1jBPWZ0g!oAN zr-+ZIh{4&y5%`apeQ|_Jp?O*i2C99f=m);tz5q%$2pG&RbFJL25!-7HGZ^AnIO1ZD})_fg`a3dUcyYB&u^Q%NHR2QKy;kdi$MvkC{~p z-U`=D{X1X|Ud!UQe@lRCK z81$m09)z;-J8VmrL!?ElQWAapi&Y~Lx`7^6k6Z=xXX=$|ZLw?xN<@1nk#yrnjRC{L zK_UHIccw=>cje`O!usgPcIuB8m2Tm}BhlNc9Km^CJC*%&x}6l#B!nVCi|=>m`(TVo zPF00niENr<5H(c!o_5E(i(lMm%cMlfD2kTl24s$T!@QUEmPCvCuxtDUF}~h5`!m;d zC_GSp`kaZh9!-{cunId^{H=vab|p%Xmom))XTb3_d9EU z3xJ31TN;>k968bAu7CRVCLiUuh&Qu{mJKyGp@tQ_+KjnD;O5Ej7`QMAR0j(q8vn$J zB%rXAB6CBwhz9L_AtBpp$1_Xti7=*xugXccZUgO>H>8|Tw_6K@bW>CyT~!Rt-6B6KqHks)nt{+JN63-T zpLTiYDbR-NVv;YFQ59Ll@`!@zxNDIqkhk8Zgb<_=tLY(@`|CucCq79#mwh7d^WuPv zWv_)L2~eNvo9g~OzaTk}n{XY;Q2G-=e9_#ESez|?F~;puC7jf|`aQr-iFe{~A_Wd^ z+R(R&H;aIYvD6zn?VY#n*yg;U`Ee<8W5nWOBR6=n^4K4`v9YEmLtq5kP~e&)`8Nif z!B{SYPEzMJr@JCAE7LswHJ|h3gXE_bM0g(dv>z=#jxsJx$1Kn3Y9gN?EYmR>B%Lek zre}SB?mVyv7T$bK2*`9d9nSrATU!rU6KUK8QC17spbYuOV1CEbG zJ`CYNW6O*!k~5Q;CmL6KhTTh#VW-i$*!jv;F!y75Zn(s!<%Z3%Q>V947sC(q!k16K zb9GIWYzp)j$RuKzP+^Tu-f!z(nCCcGk_(jXMo;$NJljI5{&wp~IA|x$HI>&DX~`&m zoLEt~TbL9q(qMKcJmPd9k~N*F+Qqf|?YqaM&hTwmm`||w=INA(F}_DD&(8Ca^z?D+ ze#oD!zp`t31l?_WW1CseuB>kpP-XBcAZ0O%=yH@NY;z_KiOJo_gaEyiYzkdE?7@B? 
z*uAm=z!!%v3C5DOWmCCwBQK@Ki*J}BevfkOwO#7PLjI{?!D~+e z1%Qys1SachGH-(pVcA&RO+HL4ck;`oU$9wYKC}|6u+nDDn2d2b6@vQ7F#R`Fe_q07 z0)yFYUzT+TnW7kT9yfuy5S5CsEW-hB8O9kKYMDs3P{HPXmNie6*Jw8!B}walk|u<$ zIAlCAbmXVhPr;mC%1d_f$UXh4n3Q`48W*6(LT`RD-ga_u2B-_ODH_O|Z+LMX@E&_| zNAK+Z<+-)`9rs4g*@V!73QWKA7_vTzUlJxZv3L-Wr0tp@%E>KGNepPeuIqr=RJ^x( zPJR>jgM>k`E_*B|?v>`R0)j7psSC$w2;=!2s#C>+uc-{LT4K}Yod;}uKUYS<($EaYCm zlg;+T){92ID0761D9SGja+MeDjFgk$;3AchwR|H_5Lzdjntuwlkjr$FR|VQdr)CUN)4v$>;`O@R5EOH8hQK+QNx{j$={A%dpn%0P3lh1JI|vb_wru zQRMxSR+!?`4eW$Ch?-db$@SZ}i&>eV?s-JXZxnTn{2ke~A zr_y16D;wqUy=252H&-w8zhoG1^xFRN#W*#TtuD}mUNtII%LDl_;Pn!Y;<|G#ySdhS zkk6WM!<0(uhw5k}^nT`)gw{KFePye?2}YgP##eg()9LoYE(6V2&@0Yk0S=l-M1(NX z3)2LeUc*nj>ExW?fDzLcZp+v{#N0_S)s&QfZd&)QdSWhvT$b0q3ml~d(Hix$_YdEU z2F{}=`jP(RCK|JeAy1DI*C4`egF0c)s-Iywp*PF|u5<0skf6V=+iw@c`ZPg|3Wzwd z=L>CMlUKvA%-xoWpkSdNIQGS1v;;j=^n5QvazN86AFb_#7mh0uU4XQGJt(@f`b&o7qI>^>(s*g$S(PAIKts|(C1w%6VOzd_Ie6HX z)FqxR*MWRsN}fnNHDQx~;3~7{XXcP@6MnsJbYcKIi)b>ZyeV}@SE5C5=GUp(;T%tp za2jX{;3iZi<b=t_iInYE7S_jC7B;f(`t4qCkh`dr z#a+iH_q`f?^Wbb1?K}bsHM*@5u@Tp=$*{Hz&4h0lm=hoeD3z&O-8k5OlKe zw#ktx0uTlp82SbwtLA9&jlS6!gE2f&h6FWJrzJANVH+3h!2YJ5HNxt+7FhnoBkL|D z`?ye?@_fHps_xLHtjETpD~2c@C>3mq&-y=Zx%ZsGrE9GOf1$aT*9VDFGD`g9nP^2{ zRdN80E*(nzqDt}OA{&V(n>W^f0oQhE&n3XwT2_Q9L9-vgHja}5>kzlt?=VvoAg~uX z<;f(XLeq^`pvctcO8^`JRbd{7V=0$TmRtE5oMcUTIon7IP0$J)QQBv)v6p9);Q-Ni z1x9#N^{p;#7)3}XS8i$ln5|}YiBj|F8t6alyD5Q0Qv*yBjV5a)o&5TL^hh)ZA_YCC zwO=xugDpdLWKtwn-Eu#qXIbPpi8o^3_8Kp4|e3YE1e)1uoC^i^?u z^Dtrq$SG<`l+I7_5O9G<;pu^wPGf3)!iWqJWb=<--<+nzfqlq~Uzu8{uFjkMl2Wx|-OodwTus7zxRPFT1&pOKo8A-m%rI;o zY8J@fl`{Dxs3J+8%=-?}9)AW7bhZ7&i5ie49*C#At6p2iGH`8FDMaMq(Z0)lIZK0< zZ9-JwegTmjXgn$Rw(xbE!OVwcGdbh;Xz|c4u|#Pya^)d~nVi#qG}IiSriR+9&&yq7 zbBk7gW0XFe|J)*()%XyRWaoLLg5*2&$aplN!DCY-&rw+2#Fby%j&zG_9ztD!-Wmxk z=`vIq;wpuRg+KsX8gH-))+>J7>1a*QrC>rORu)XUQALA49a@@@pGidMv9+BBNP`^Z zRScx#r{?b@tA>kzQ?^=OR;z|A-qxV5_*I6j6jIi?uYOy#MEeaddAGBQy+2R967wO& zs#DMTChU!l^x*=QEPg9|0W-{MKFip(e|Y*_$?4*;M(TK#yndbJ0&C`D;?PIK*5lzz 
z^t!G$pY??M(0ziSMq3He>6}Y-V970cmD{Nh(!WYsWdKxv2sp+USMPjtn1aOMuW&iq z>Zg<74&T1)_YBGo$&l36C5urzbzSQIL;yeKxvP)0H7XqCLFO;7y3^9c!iQC*`Fl&C z-3CoM{sDt-mL>Qe-*R_RQgAIcz-%(;^-GH~{qKz1S62%EAsc!WigA_QUeUI(WowTY zLM)C1jQr1kTDJ=#^_2rQ$Bp_;)e+uO zRKj8=6u7B{Z0SGUPnvT~BZSINl!~V&_ zflQ`13zLhYOy;&Sebw0fr4F zWW9#T`Dg20BTi)kt*)bOgZjEJU#a8K7haWqZkNa#n}yJXWJ3OL@j?;gK77i zzBd15K5n!Z9qZB{o;#h!+lQ+8 zFm&_iv|sH1{Dn(O^CsS`pfqzDZ0CLQ^_d2BxmZ)|qVB0aVruDC6%K^% z6)l2lXuKncbE{Q&eDaxPp(E+fH01|>;axx4pix#(_->Qs^Y%&EUF%eERJD-5lB{A? z$)>!sPJBN*6v+5ANpW{_-}mSzj%lhcA-h0JbBN)DULo;TxTVj~QtHsq2R?EKV)1PN zvEuPjo1nroShUnx@H5Lj5G6e$=SXfzkl%`mSHr?^Pp&iRd%^UBoeB7;{i{SZMeN&J61nG_@c;QPGD7B7%6fUFL3-` z=k;EHE&&W52MmrU4W`@smoh+9w?7bG(q)eQU1{KTDwQHjiDlTB$DO1lnvmt1P-Tp} z%*q(~bd{{dSvn4if&XWo!uwf&CcdNkMnEsNyI#!?$aa3Jo%5WYMm>M)tVb|5yG|sX z!@MbR9wb_M@)8PE%hCwlT6v`&sVm93@tY~aZUI&9Q^6(ibIe2ecQ_~&&7O6}THXq( zHj6Ed1;~I7RCFj$(H?3_p!x&6)UxN(yo~R%&wQTQ53eEV&p|D^HK}59$ zBz7q{UBa~B!?ip97w!>tgJ;kY{`q1qYAHq^jM+wAVy`81=yMje70=IXcD@}!g9+W-p1GKL^oxDQ=eG2+z=NnW|BZ|2F&C2F&+Q0V- zv7E;&T&Tb?_w!-LE$GynHZLW-NN4J<)ngD;;$2~woUCB^zdRy;dP9yncUmz`4`v7* zI=)4qS9MQUy6?98OGdHM#eNs9Hzy?1$Th7rH(EYg(y>ARHA&X?ol_?mHNLLe~M$jtuN7Dq-{ z?=FXb=29Ab^bu|qSfzcnvnJ+G%5A5a@1^YY(d)=UV&D`?2SqEdDahf{S)Q;|9&HQa zh+Hf{m~Y=PAJI>U8wL2@#C`3i8>t9J?`kVN$+|%|evgbHN=Fe9el=T$Cz?JSYK&ii zSd{vS)E9io=$hrSfDQUS%DUpZF|VGtpBA=%RcNdw{ftWes7EW7-gWY7AB*h)pyU2Q zRZo2aBQbr_LKFK8e(7vm8q2rb8v%{PrP20Eo)|9(A8}?7%uDS z&4f4oTtkR7t(ZvvEc+w+cN?6pDZ*OGUhZ(}=vQUByMY{AcNq;~V6g{_96$trK><}i zjEvI?*87EAdl#n9pxc999G*B*zCs#f*7KN`-OZZhMlg7PkRe&;RO;^pYM(Sc-qJya@|{mINVyvv=cq_?%EgFh z1!H{3MK*XpG_E}N!X73Z;{@)1BI8e7(Y%&zeSZQ2%O%f%yMTb}>SVAH#=oIOQT4X( zQs=mv-!sFWzH@~VQM-S)R`X&HT>{ycA2>PNn4nlQarT%EPQQ!}gj0!sxP#2wuT{l@ z-Um_YG5boLFQfuCQ@%h@X(giF%GC)4{OSZB|x&_r>lbm%|P3X!# zl#A&;hznI|V1?5|u;kLRV{ z1|^u@=ZIZdkZ}+yh>jCw@+6Z#uNAS~z9tAoctaPQi>IGYoRndI)v9bcRCqX%>}SK= zFl27(O0W}YZ?@T#(s^Sq1Yx{ZB^-bz&2FyyfV_XH%y1@`z<-sOOI9<%K3N?;JDI+V 
zt-h_R&U29i@5JEAPF65C?zrlJok04LjdkM_a=aWYOeY-pDUi=x1}ou$B{OvLZW;B$8VU<(PzvIMJv`MTdoOjPDitTv!D{3oz|dk zUr~$crN=Wa6}Ym%ORbH;GO*yDf?Su6bC5MRZ`qc*{*JklB0xns4=#GBn{bTgt2Yioia^E%5K?_ zr2~1+Z|`*VZQSxD8`1TH;#sLBa|u2VO^2H|IwX)J>uEyBc}ERhkG~oZj+E1$fvF@v zz-s48W9@1LcW!j%=-pxOHp#wv;?9J2YYTW$>V7Rgh4pE1js32*Q;Fdc9MLvZtE@j* zCOcg8>~IHvd>}~pRu49zQHw^?v$(Q?F9FC9CcOP^wyBiyucJk8iD>VmAEDB{iSCE?k76Ltt%FR7m?b0~!p zk(jp;_@>du#`$Z07)!4?A^V|JYh}TIK}yW42lQ?D#=3^ZIk!k&`oOnf_U(q9lR9Wl;rcy1IU-c+bH|{PuYEgs?mNDhuZM+q3 z+AbA;V_+htB`+mC$o*5DE#PP5spUE4;j#o-L)i{>3IX~IUxh^)YXX6<+C?l5{Kc4# zC8Ex%A2r)AA9phOLWOmCI=E=;+E_JPw{gBQDH_pyk*mvKyFHV9RE|W__RFQ$ubGy~ z@J~1Z<&3DUz`R{~( zf!|VyN{`n~QLaqEV+XGZ6}D?w@6>5Ins6SNG0gPF(Kx?3m>yeQAL4F_SD@(};1fE0 zHBfMT!Bb@aEhZadUVDC4xAvPED>NJmh{cRi?39;?*3r*hjIMb2K%}jeV%q`>L|`~( zwwz=eR8vv03w(PKpwat9%tVVx6q}lV9)w@?ZS*0$VQRbIdbX5Q<$R{d8XM~LQgLQJ*_(ukd!9>wS1GoT z6@kO~+hO|p6D(hn)c z`m6%SRvD*u4mRanuJ7j4CX1*6U(;-Ub6mesLzCi*QX3b8?fqqH4O|&Q?TA>zy>EAX z^HO%yXe^Q?}zRMP+v9W4L%8!O8k(uOn{M@IX% z4aa6nK8%>mdH=;qv-}O8%<5JM<8m9o8VIfaeCcJC?^2LpSO$F;_{La&h--n?7K0%} z+RagcG7J)U)lXJ3ytH-scS|dPQd#ydvL`)W~h`~~Wlq@YS^ir+-H=vLd*`((|p4!WXIhjnzrLjuCS z-6>DUy%jX6$TAd(&WZ?snW>}5i@phesThv561z_A$;F&QBAO#q2&#HR5CcX-T<1h^ z=-N1awsYjJ*={jQwmb)~d3&2=!}0-psKEPrVQx&4S#Ph>(nj59$wXC;?j-+f_uVdf zCB&YyddjVc&96?{T7`uT5nVDtdIF59atl8=xf@v6QjPr<9eh!LX4zYv4H95xGY)if zCbgA%Q(W2xYYp`38MZ`7j;kQg4`{UB)fymvBaBG}QCM$5>KB!6|13I|fmWX|-G|%M z3NR2#L&FK9^l|Y9w~H9>$gbd-l(-NyxmeMmt5+@?UM7pZ(%wVAF?0t(d)(UkTHh+r zc)vrr1B^L~JhtM0+^k)dCw|Rgzu|3)Jz5r%L)0pAFPeRtj5YJ<@NS*XYV?05h?8V> zLC!nYpM|{P38FwltLL$7Th8A~$Tnj@sM;3Fom)7!vO~gJHJo=xAGjtaaYBsRZcd|Y zj|6AmRtJZ9{mR>ITrw|mU=xdlmpNH-i8x^h0*!?e=?!^*B&By!qS69l^~>8~ubZ2= zu;B7hb8HM_az1=v8RS)?3uBVDC|ZMTB&Mp9kJ3nGVzed8!$2)CZ*(sb*&3z;ks`-W zlcj+xrT?JGB#JNh{Sr%6cDWF(%)c7ML{paVV&)_uPGC#-igZGR(B z-icnp9tf|{2K@M;Ls-F4o*e~bpZUZ08ANMOur@0RXEpeuW2EQ&bMaN?#?Srj5n6@ z9RIcVR+#Pz>%?OBNt3%a+5;4UfOtc2bhipOb{$9RsDYd@IQ38I<6UL`;rPUkJwI-L zcE!FbK5P{qbt0h6JB=O>B=TqEol#P7;8z4WV#>8pVGP)y0#^c1w4Z#)vL3`WnuRGE 
zFBud+nR>pemhJfR4SNmYk*62-@8{d{V-4w5D^N9Q>-7)~=^`8ivRprZtRI~m zVZlpf=#^CI?K9reOkqa>sxbkSZ53F2g1ZSKca-OH17;cS&<)oQJu;WrAE=I{Dracj-*n`1P2G&5 zKsfI{(;QP!k=ukJQ+W7R9UX^%gYLtT#>~{xoLYnlsoeyhyeaW(MdP+yUAt8~6s}V| zZN*qA*xK(HeO7jt8bUdkZli%Da?r)+tJJ&NTv*~gTJ$Jnl@qd-V7+U+d3lzfLUMkZ zj=dpnSjS2IzOl10W2(A^rllv&9hk;{!LCrU=g~(9o7e{Oq@pn?^=Z<7=PqNdP!={I zNVI7QHiX|Ja;t^|d*Mx)TtRjZ)3UG6))TSSx0QF^nn1wm5BxqNV9nhT#^@65`DzQl?dz{5wP7TpJ$YmE70TSt>(xS5ie)(l}! zJu}SS;)5J&V^`&WMaP9rKgAQ>Iwzg!JkM3N6;@f`nF{K?wIg}6_D+n%E#tByxywkR zw|FRJKHhowih>bxo)$cb$U?tlw&&0EY2Nag5|#&lIhpqfKY_wl6WC{ZWIf!PHgpRt z?3=C=+%0eug6!;8=>~0ref@=TQv!2E4Ng6Cyo8fOI2Z7LE2g4WBp;hIkT86WQHyx= zttJ7J1y}0wp=o83*7ID4#UWC?j`K|BdDmDi9KYuGz_6( zwr;S3>etXcZ^VPOR_(yuIuLu$mD173Swb@k$&9=uY~8*`ZA*060D=%r&qyp zu2+ptH*9~WLPLBB%}s_`;pJFkjrnO3F65$Ps}I7@dZEDQF5aFTu{M;--KG3>K**dC zm$cqcex=`7)|L7o5e?Vdw+07S?w;>c9X-6iowj@Qv~!}>>D-RxnH4ik*m&zsI0ueL z3Y1oVG&Vj%3B?5P77|>>{v2NdL0+aOM1z?^`H|bM zjV~50pFyYtHfXV9lIyGZQc76*(xy_*@cFgiRW9WmC`#`BSog zF`7o0YLlcuNAT0krgp}eCm~dJef>pF6b1jRjaM1FW=o}}h|UXwgMF_^m&Wo_#xLh0 z2Y_~}YhN}>#jnK*-+8ywpzpd!E))tTF}Fio zVT(5n;8apdyTP@$Tg3!*9c_^&-o|fgu4KC=a$=2>H2K+b1SIZ^*X%g8sslu69Cy&) zDUF1^;Ike!2MIw@fhw^G$oyzc9Z_&&s(a<2?Dut9d1^RMd0l#F$_4`pxMU4~7`ezl zt=EzccgWz`0xQtI$o;5AB#Y^+vw#y1zahJsFwRVcwPVh_R;CRgXsEMwbd!%nd9@&Y zc#)%WmMZ4f&kYHkD|S(wshS(J9wZl@@-IYBlT?$2h_c2n$#>#KSk15cnm16dFOc{6 zll)fS@_c+0iQ~P`;rJ{3>T%wGWyG?Y);>GFxRB1)8W6`e#&@1$WF&IadKF9HP33-W2j$v^pDm7Hr{vlUVi}hg0(V-s^@n#J{qqez!;4LSG4r`i| zqi#2*yt(Q&`H1iv4&wppC6g zE$^pQve0@k&;qUcS&i@AMRf@S#%Y69=Pm0O1t+~It9C%`bjUlXA7jirXL4D!ZI`D)K?@L?opr*UN;B2fP&lz5twxboeJ-ao9iwHg5yLR zYZxr-M@)tFG_L=~W4|(8^_XKEpjv@Hy7K6WMP;pEvv+}JyQEEjF{c6l`4qWyhd8n@ zQk5E=>A6zwW|$FaomZhSy<<$;s(W#%Kzv#jzv{&^VJl>z@CF?3o$^6?UZnViqp?XS zMZw?o^3#OA`eK-(y^nsrG6FIGT=p|s1YObo8?l;1L@p^hOcwZ|AW>-<`T+BR1$F*)OZcf&srzjG|OIk!PsUCMu# zy9)DadT1`WrnI{`zz5yAGn8ibWtN{GZT+;Y?db7Tmm!jOGCBMc*Z*xh74c()Hbj~Qd9lbBUPteuXw30ngcf#%swqP|e ztdtmZ>;)45RY0o0-L7IdvnoZYgCzS*zbJstKXE4NiTK%8f0c}9iQGf5tr3P|>VCA8 
zQ*#{+vQkXOSblF_jhHL|TIYwRe%m(=JSV6LD@UtW9Iu#88gE9y%xL(Pk^3!guW0_U z>{A)`7sFu4h{w*V>U2pxKY?J;yH04y_9eq${31?EkVKYve8(oW9 za(@Xa%WdJ*L!kUAOV9pDa^)EcpH;3-c};rZ`%mn#f3>~uHtUuVJtD^jCl{oz4G>IFDE1q~Lt+_(y{sSF7Nxd($pYIFkINZuie+2UM1@V&3B@6^&^35)N7MYEnbn;BQ z1hrDDhAYByjE5UD+5IRprR(g{dMBvzb(~O$-@&>#g0Cdv;!U5DgZ3M_t2=~t@MhpP zmW5&&i+i}n6Qbl_lpyK)*vb<=*Ewxg_rk26_|e?ogQ z3t5VAE9jeMQKbl*2Jyr!hLgyfkprjT(d0Xl{rA&cTOVKPMgnd?U^rbLWI*q^fW@Mf zH*(u*-!%BlN0?awvn%PN8X47odW9>5ASOwPuXqe_9x2S3*wU2KSkUyB!)R3PZ9yT& zO<1#+6j;Kg+Cy}$mcHFmcnIAof9*I$kM(kstez{gC0^Xk=$VzH`(5n59Sxb)r@8vaKAfpx?jAbQA#cpg9XH28aam*nVUon`l&nW@34(fCR#6e|so?4Jx?W zjEd=pa8ci-S3%kHtI{QGD`K1l15X9d@bw8_pc(xX(4v`{*7uw!{~WdTB!zA6OZ|I> z=Ebn+!H$7E+S-gnc-`ysNjIgGp;z@Al~xa zXsPGT{4_LuZGNCy8(bvpfABjKoW-EZcjE4cNK#J5rBOyIV|`KIQqci_6c@)tl78LY zx^rv`fv zFGUe(%Rp$nct``9*)x0t8}s*6tSbja5F^HCy70JkMk!WL)~5uJv;k#Oe{&0S-_l3>6>bi)?-&NzJmLgK%qF68o>cT|+b<$>#7&EUp>#75 z42MhU{)?Owua?1a5nLn@^H-nyg>4HH?OQ12U=u&*8{Q0~u<1f=7ewDOs}x>&xl#yh zcC52qq(3cd)zsL>3BFHfFxLS*;^i?|tISpk=7FpAeEqC62z++lg5JR`G7~#h>Twr-1qpLM6g`T`^D)9!GQoQ2zhPO?xx!UW%@T;V0VL#SQF zfKRF#e(=~5G;6rX*R90Xt{hbs;GK#P%I!bfAv>SS9`8{rJ4 z6RFk8T81vE$&aZtG{jDiLMEqjJ9w^RbxNUjqJiJi7cGc8B3^M^7|3YZu3G-9*fOhz z>|6OU4%HyNWctJSe}nVb&M&=>ZF-GY9cGSJsw-dAZmbzF#< zrAVRH2cD3Vi}+oz)zp&gywesph{v8}OG8qZqA+vDV~3>ZB&Fir1(M*$lb*?d`1sA+ z6C~#TFIRWnLp*%Dm%Jp_(kOAUz`MLszIc=!)KQ2nf`zo@3s#BP5=i&tFJAg_@Y4QI{lA zXWRqcGP0t%FNbvDb(aETc_W{7{2yl2dPCxO5vZNq=W?Nz&mFnV5Im>byxU5*i}xil z1D-1wsu3CY`ImKNsokft(m7%&f00s8H185dt}%PriMn(cCL0$z*|Z2(qB~EET3=WC z-uAYN`bnFye;v@F>2C6^qQaAtl#i{B)Bhk9J|S(oEA>Dx|GAcVTHG=n;juvT>MbtS z(MXY?EAV11FlBc~NecHAFF$R~l@_sjugjykKxNY(i_^C1Ju+L6R0pEje=6MT7~SVq zTUQ6djU&?m`2~5AJ8MCl6+6GgsDrfz_lAX#DR1=M>n@&qV?W^Om5`-mzuG$LLE5pj zhhn|zkS3)DjKke8+)q#!W4>SS}v^^e!KD7W?he@_5!43P6}fKv&E z|E7KY?hU?r)8te$yu90hCll5Y10k1GeD3n>|702pOGP&&<=pWJR97cMd@e)dVQ+M)xY8 zfVX7Hq1Q36=$k8oL6Cv0hyqDD7<+U%s3%|cp-7sR@!@Y*OpY!k9jhqlw7Qa ztgk#0Pbd)`6h=!VUe{#f;Qa!$6{J`pf13hqLvxz|JIete*1LQ9 
zU1wF?yERva<(rYjNh54$g@z09UbyopZjN~v+ymd4jOdCs7PB}6ZfNKKKeG;{ ztI4)x>$AA{H+OI$>Az%y{G}gXD>7r*TsR$0?dCs=YRU@j3~^x?Jti=}RR+&#qn#!) zKQJYiu05i_f0H=ur=7=y7ZTk^CS@|#;p}c^(gT;oZ|XOCdYNMn!B?L{c^jHIjDFJ= z>YlUB31U2adKLE`Sw|XhJ8s-G{~Tv(Sse9Q*nTNI2fJcfg4eHIDOm@S1_Lbut%PeLldDga) z49C8`-AD7IH!8yCczkmYXLc>9qLasYygum8G{;ufc{&V%U@{Wq_p#DuingC8 z2LS`JeKBNGqK;Y*Do`Mdl;7m202 z*1P&*$R@D90c9A!J;0a0Oj;V&4T-iM$Fe=_2t(kA)afHPZAij2DG1s9QR4EcZ| zw(ehr^4yZnVmZ18`xrR>Ju3C~pzjBzranGY$E5iWP9wWH+8Qv4 zHE_^4z+)fg^H^<$&xrcDjMQvpL|m*oC}FS`uZEEMIu~B$z7@1}qZ;`fJkYOppA?vu ze}S*Am8E-xSK*T59DvJwE|XI%Fu@}YU{QoyZb5^hbyT|RmzQjx1ai~Gkyn1{V=}F4`0#@wU)KUIM8`6&f1D^h zX^4Zs)f*tj@dxfk)!n`Ib9~(n<>g~x%_pOg3Ngec#lVH!zk&!c-K_==QW|HV$ZOGr z$loXZ7f8XI@rwZ;_uhh7Vp}9WP&8P`-loINxd?@tHdh*@T~v_>@%D|2Lj{B8z|?kK z9}=vPj%Ch!0_?uUNZF~UFc&!Vf3E@N{abs}q0(jO^0_J0tO5P4k^o~?r#~)eF zE$i_71`m^T#mCzG;rP!$?F&W2#6k8FpAks2c5LSAvbaiF@V&v`ky^b!U%E(*_j(`a z*?RJ7^YTC{OAJLoRwQcYBxYGkm3zje;pYuyWY`! z#p_lP-4HdEcLIp%D&dn@0^N{Y$`yU0>kqQ_Emc)#51eNrsilH=F_IklrWweP>@{w2 zJY#?EWq?ZA^xwoSJWk&AsHyYM&*xA1`uu#jmgtB~ki#bNCwE|D_Ola#0J6`i!mlkS z{VTW>zw@LkYwiAnL4{uae|O~;Yb~z+h>9`#7uv40=hG`$feHUETe=YC{v5UUI=L%- zeX$iWt}hyxglSN@;&4S6`e&F#J#p39;l&|=F!g)>)73wD&iubqt~ssEAW+CjNSigHxXuo7^|n=?4)fo(aF7b+T5JbSt^fYUH%e9R=0_LgyWvWc(qc zR!51b1Dvz~W=wnL%f4CSc;Pv}U9}S)i+|Ol@JAuR``&R_Pn$|xpsZc$ovrow9xhsJV$p*zZ z!EZf|kR!y*8ljv;o@l-m% z{Mg672R~ekOxmfOu1GOwmtSiddpTFAcol6Pxvk?zEZ=9Yf959}jMas2aNE#Yqd{g6 zC3mOt=4+WKCP>t!iR-qys_N!$y9L4O4pl4Qu#wIxSJdK{`OWd5J7@7I`AO9~Nf~os zn=Hmgxp7wzP^4|Gc?3Ra|GAC@WY5ER3Jl&#n~S-AgpctTGhB)jH^v9=xemev z`WK}$mgONyR(QUGHJXfN{iT%(?Lq8jqSel}ab(S0uSbN-v0J`wd36*U)Z|>dgBXOI zeC=O4=L+7ZC|3F*3MOj?1m=C!HE{xC0PHU#C>*wze~F|ij^1~mi^Ahk{)o>JOVSPs z*)iqOh`;R8@~-|qb4i&6D?c=eW=h>tYL|lNY%tQeqEXm*k z#13ho1ryS|7IVT!@O+EhO!HDZkapsym>*v-imwLAv`R-Y(TxpyiLyCGWoR6aCnkE= zJbdF?f93pb3%%p9gKP3%X{mQRSO(W^%4m{!@SAy`8JVL??+-6il*?nflUigpqwf`?!cNHTx__~3@;dPDi3YecH`}O$6x)1x?-j zsnp7_TZ$Q%KGrFeA>nsDO0F4eCYa_~SJ&HO9~#O$empbM#zkV9nffKprwk>iqWCo3 
z-+zV2haa?m|DD6e%_DK)-*ai#TFHj`f9y~K>MEcPWEGOUi+ZCCvwO>j9J2s6vt~;K zg44CY9zW^(N!)!>f11viWlu55*fKf%O{L4apKO-B*@(NIza6Dv_@15&`;zAa1G&vN zsmYDFv3>du6Ecj@$Ixu=;6o8FigD$2{23F8+df9>)+%-w&GAtS>Zzc}vR8V{f1b77 zD4qhMFy&c4(vfw`1LK-!zR%b0K?@I^k*%n?E(Ae-oBM5(#P83O|k1O95*9TjOGBLEi!h1TLjVj2M6B zPSKu}?AC zml}*Vg_OrGHsu%L`FDn>EtX+UP*J%)9>EJMW({|#2mw>@N8psxj*|1je^?M`Kf3m< z_RShg;CpLq)=|ccYA!~6!u?)<6)UaMo(924vDTZfD4nw^;O}Xx?TwX%Oxx%=PG<)j z=~G;kdcR-zR@-X|V`~Mh(qIny9&#EU?Hsp?*lc131NF67!KFs{;_GM{bcoZC#RUR( zSC$Hq$_Pi*zVJ!eWrNYd5-AO6=016xtufq#W^+9aShRIX5^8FHB`_XLM*X zATcvDI5!F}Ol59obZ9dmFbXeBWo~D5Xdp2(HZU@ia8)RO_ytg$Thj#!!Vy>gp`a0A^-3MrLMK1PTgukc$oQKXwEPO`wxA$li|k9}F=k zpoz=7P29wP<(;TtZwHWdwE?iO0a!SBS-5zanE|ZK%sl@g+B@+A#7*2l<^TmofULb8 z&>4Y3%-+G%31s=n<$az1d<9UO(EwO@c(~~Q4hM+X0-Zo+CUyV?6PHgw+xHdCOl$yZ z_GTcUi|7BOpyvPN;^M%|#N_Vo&S+xm%xLdqDM&+q4{!&$d;+KfoqtzI+)mbg6u2-79blSKuJ=b(Z$1s9$;c; z{uj}I#KzhFJ>JC41Y~1k`cC*;x(Ps1LzdWth7hF=dj0pC!Fz|9+uPZA{-OVU#7tV6x-!ygbpI~-zfMt6dk=s&13Q3$m5mL5 zz`@1^;N;;1`2KGeWfRc9Rs17T#?HbX!1FJ$@4NIL!EXQV0P25_2MyqVb1B-t_bm`W z{in_KnK_uv-rrdMKR5f|A^-n+{I4wkuWkN+6-m0<*!=CM{ukl@$8TZ_vhn;k@02myOAJgNstH7Tp8qI_UZ55 z_?KDZuQ{^;*#VX9ok4%yEC2=;X6FCndmk<{>-XEk`Mo9ob^+gK=YLm}ursqa|7*xt zIXD3(PEIDC2+Z%D#LB?|@Md|RMsuLY-%Si)Vzje&c@F`+SLX|`uy;cE>qt3&IRQ+f zf0_P;xByIIe-Jl-N&FAu0WeAY7jdxwn56z7HUN{%AM_rl_y@hmDg765GXt2E{~!(k zlg57$&->~of6#jt(?94vi`jqCUpvKQ{tx)B2KYzmU&zt*y_xka@)H){UhL=V)+lq4q*C!^k2ySF8h9MV@ekwO2KonlSO3rT-{sr>f$!tV z^pDv03xUc0Pxg22`y=BY?ss(#f86gU=I}l__J8VUe;4augsilmJ7qpj~rP~QlzlDv4Iwl%_rwI```Jo!8uC8*BscOWd!;0#X} zpNcdLeSjxoP!~S%d2#fA(G0YPYyfGtkRKt;nW&TQtQ7)!XR!(`WoujT151%ub` zqtP3~UzOBYV|2VccxC@tFrk~yE%)T*?oQO?X_Gu%BEw;d(L!Qnmhe*-y9xp&Ba%Le zE3bHQtU|7i_x_`>?3#Iv@N9?V8Th0^*x_BDQANR%Ir>L`OS<~UQ-PKv!P7{ndEJ~a z6aUthG%cb?#T%?hX$~Ex)jfXQ`r+cE;)1ZDByb!;iODZ|-=-+@V`LEf*^d_PZjqyZ zz@(z$eR8n2XbD%;tInpt5q%yTzp9n&7qV2%kSlL6h?tD$SDRA~YVCbPjjQ@1a8t3X zH$gOzv|{jo^`l?&T2w`mgiS$QWabmz499!3q7sC5)raC92O$!7nSKO*(r!r1K>QD z=W{uKsC31!Gn;`+XUs3KU%2X>A-n5z4Wwq~?&q*}>bcB(9~)0yEsAc;7W#CZ5!q^* 
zz4J=D@&ST80Kt6ti*Y15K>0J$=d1|cKs2(T{L6Bh*m56@+4JG@(`XA^n1@PC@up&z zd$f7;wEoh3fvT)*j>+Gh{&a-R#lNh$`Y29+LBeloIJ=mm=rs7?v*F*=nZRZo!wEFj zXC#m7?&8~6l3gUGgTA+H6bd8H76#z5;13klOB$*Qr84BCoH@j97hAv&gsXkpyWxcD6yCE*dIL)AW7ad*Oh z5iV)wFJNcY&AQvQ0JCU@brkQ;f)w+7KXJqsV45dm5SAU;=h<4%8$0m>*&rXf^tU)nE!-I6>Q#M6~Gs+9QZP!w<_?^((8hyjzGEUnSN zr~7F>j*aYMWm}Q?QM!E1rE#x~@uI{mGaQmIJYzOPbrgg=Wj6#$g0Elq`cA464DM7_ zvns^Ch(Q_Fa=h?-ycYSnLnkyMdP;t{K0yVSC76evEwQbvNYATq>d7X^q@v7!ePTtiM#EvU)VR9(dN)&nrw-FY5}N99hSC#gWJl8e2t#j zsxqcCrAKIW1PWuyGa~O0YCO3iUPiGq(4-tsVULIO5Cf*$uEg^^yN3}6ark~*{!06E zWac3Kb#AHot_=s3={M62NBUIhTSkCV($^#Ip&fWdMY}uQL8&WfSVz}M;+B!RMTKOD z?(06wr{j5ZCP)ZISo@!Uezb#KsTe!$-KgU|KrDy*Z%qg+zv_MMQ3K?_&Ztw*y+sIS zoE7#y7z+W=k#d{9`QoGbU|eNmfMJvweqSx6ep+Seam!Nx*M43Wv5F_vg!&z9%^`3y zOqr!qa0Vh?asM6q%?Gy(FDKMZtTsv+=(Q&aRks;)Y%CePD+{ZCMA}>V>NQ+2GEgMJ zNW)zulI>zlmo{a8vT~9PS5u7~5bT9zd_d~+>R)X_TQg~XsDT&{1xv%|B181aF=6R7 zoS2pwGT-r6Kzm(4t>9sZBKC*|{1DqP$!Od*7wTXDkCIEX9-B|h1&0$ee3M{)Uop{~wY@AaR?^;k)x=I+UaK1(6UQnGJeOfA_*(p25LVB_ zQ^njd#oMuQ!@6h_;j!SwaQs|b-nH@$o9F{~nlu!c7&}MS^tT4M)Mbw#$DVFHW z!xF8{;mTKk*s_&3&2^;mbVXMfI-a*xRCX@(C{HO=>e>F+tu5rTK;-`3NV00L@m&;q zrGBGipDHumS0XRb;)7t}7To|5l8x?%ZDErlphxOHp{Y>(p&;25b_)307_3h3=ks6L zi?|WT>Lr$6sX0&#bUJ&+U4Rrd?D{mdo$cyG<3S zyQ&=7xAAUV30(SqXp(6BRz|(2HvVmrnK7w8a7@C2pVfHL2;`&5d(?zIJIO z1iRrK7&F%Jty&Kf?I{?nd=^q6-fiq1^tRM}G#rp zr`|&%WE7dR6Bvv*{J7ONBSCbyKx7uR6E$<_YcRZh9r9nFjcJPet8Zop@s@a@BXNH9 zq@Wuh-m795%t*@As9SqnWLa67=1M1jnZq$s^CI-^o>yiQF-Ffoq)`+*IQCRr3*z`! 
z2^t=IdC;$V8zxSg88~I_a@yClgf_L^6pU_Gxkx9euZz7*F9A}N_+}h;0}Z$z$gwW0wk1poUg*aJa;EG;m%^jJ4rB6W| zQht;E%l)mnK)?xW)Q5TVN+JyK$UOs7XD_OrQQrM&6D^U(iVQ(^JUWCIh!PIhEdg0# z^{dC~B#(K-A9rN4Q*`?g^#WG{6zFTZ7R zH=gyf^u55Cf0UJH!4bk#XdD&7Bpb>yEuB#5Lm@ivZnH3JnVHLTYV4|iMdY`%@ryZ6 zOW@F{m75j_RNKEaI5kHQ4lfQcg5_9{cHRzpx>?5_K4H4rZI7C;)+E|+J+S=YpZrn~ zwfGComx|pcWhQxF_@TDPNJ(a^jSD|(n#nuf5X#Gw!Q5#Q&jZp81BA?jFD#rcz2Dv>@)VX<<&v@>}^m< zn2zyQ8!d3zK!+~I@me9Fn6@Ryubr^hYrUO+(=$-xKx{@hG)<>}D3mZy+@Ss2maS_} z=oIIZuhKET1f#lXlP>x~n_nin5nH5YVdF3Se3(W;cGY9DxDPnR^)pQ;7dt2@$mRB- zvT3neeUv@eZ9ZDM)DIyiGt%<#W5NevQ(Wds14RepA&y#6p(Y7e__gcRxVgN}xD>E% z#I$4z{1Y(5>7K`bOS@J)iZc4p>BAf`pB6Sy8T6APAAZOao6ckR(brlk*->$0k3El8 zRM(K=5=hL4Qt^Fw{mg;`2-^$I9`~oh|81f>46r#QDx0gi<$541m@*Ae|1O+Lrb3wP z?l5JQg`KqUNt@-afsg8@YzDf~u*4CSf13LIxBp`whg2_rVhE5M%_!bv$`Fx|#0cIz zjn1Z0oy?mU#QD|8%}G;f4W}>^4m{ohCiO5csNz%bmL*e$M@PX|#o3?3a~iBdSC(SG zuv7vou|7H=`0B{O%IO6?429Eo&)tcnpQ|cQkiaTigZS^X)WP?YisO;DPmRM}rj)`H zm4`EJCgN^?)xBx;nbi>|i77@!SNHQ3^O5R>pqZ1?9Pr(fEmY}R%mkD{5O2!1k10+<*JfSh|1swYOs! 
zwZi5Q<))tSA!83PdRKN8qQ;4W^ALG(0WAL2*F(w%9snn*qi%_o4h6FcQ|F7LUZ8H0 zJ{2Amo>w^yVS$}BZ<$xpG5>m@U1CJ%lFU-TN+Ys};=NUNf4*lHy!16cp|Z?NnO5an zd79dPlIp%WEymUV)p{r&!x#q0?QYKFNP?H-jt={+7VNSbk@R@F>`rO!F^%6F7DaZc zsa_AdlF-2A5Fd7)s(aX0+lWLpN0AU&uGYR!0N+B54Ck*Hh-7UDDt+_FBe{obXJ6K6 zd~X#I8g~NO6Bk(R?O1FOn`^Wx1{;KbdX^(cf~gO&-|1r_|1s-RFBiV4i0|jXybOXb z^4jxCoQF?LXjBl!yd=}}mt7EzJ&g%v#O>hNm)w@oGT`JlQuHZr6AH$jzEUS8*|_6E zG-iqG(3`(gGvBy&pm;3t;%lC3~EIaoNZOu-x=6yy0KCI5 zN?~72?7A?bCi#Z>JDF1^p>x)hnpn*Ngh2my&vB$Dz4@nkLw-_*5*G5W9P-JV3h3xE z41%vX3&}FSz*xe}KW1ad^8BLAkcor*{IL!&J!BcX)oe!FA&L%8S5}RGy)LO`XCA8H zJFTkYnqjI8*%DOkmU(CQe4R#p&X*`v!hcxooSLE$|g zKCzN5Far}O&bA@;3=?hBS5uGkX$ z7Giu7RJ$EN2%%q8QazTV9)PPoF51g1YU&Tdt{H$9)h$P(V)TrEwsf0}&a6U-YKmTy zIf`q`s1&)~Q5C`|VCf`y-J%g9$H_trfOFgE*+Eso6fnaw`e zdjF{8=g)FGyIXJF!kO?|ybj}kM4SY>OdXvm+5qF3aDv%?#A^y9%Qg*iY0el=C-*#_ zsaPz~JOdbLrQ-E3exDaC+Sp@;%tgQPCs8gIO>EJk{h%v0Hk=iylzXBU7yy6D+;AMQ zU=oMh9yGPs+YYm@7E;GCm2$`6%M}p>H$RNC!qAsRi`yzARukvVGTIg@5}CfGJJj$4 zwg%rxDS=vl3J#ILE**n&j|8hTu8;?`$FXIKF8-o;S5HDr**+uSLo7i97I=O8v^Qn_-kM{6Uc3?bg@FFdKW$J$70+-DfxISD8WIXYFc?AF-MZ5LGJPIOr=nJ-}nTr?pxTeg%@@>&xr z^T>PhTuC(54VJe7ErFp+zY``Cn#mZ$dpqAO0O4p7SueD^2&i9i6~=Fac)5sbZ>$Jx zv(h1dkt#@=s7j!Eu-Mu>um%bF^_&2|epEvJipa-5wTs^{%y4cW5Z!a(U)B0}+MwmU2G5r0+m9A_;Ab$cDDpDvL0W?J(<1qT;`MvR;K=<~ z_x=?ke$pc zWUmPh%}zbY6(RV}!w{ZWWqtI7rv5o;#6b7wgajhLA`C}Im&wzY!y}|{VseDvqm#g6TwykP5$Za(z4_@}H67EOA7+q|}4i!Ic!QNAH(;_@o zX5b(H-D3&+gU&oMHJ%a}r0it}_dIuhv!CrWKej;@gM~suGA@w7BU&#S9&T}D zD&TOe7H7h`1DdivYBoYBYlPpoRBFg;u=lHmEF%A zRL|XCAem-qu7w96euD`jHjR3kdlh zy+~hWYT(br{g>LimnwOZJ@3h#qwFc(x?<&R!X`ihXUb*ptjkHaBW(z}#@UyTt2X&X zznZnIS`K$;71l}xWS0$WApM3gGxr~S`u(YeA35!b$%03x7#~hE7ebPMbhRx(ZX4g5 zcko_5i1FdiE;IAw=^Q4U3q*-n3N~&}Cd^0Gb&^|oO_X4jTYYqy6|R|;B%CqjFs6(wvm4$UNLni z*62Nu+%|5z(QBU#?ut#O>nsI+nclmxU8gE@N9|FW?{5Bo0BMdf;`MQKfK#D5R@fQe zojwadB-22m`MDi4&zA@NI+hT~yOrB#x#+prsD+E2X(unK?3=?~c3c*AJS^DO!0=G{ zW+H^`ZQEq2k=by1BNuqh02ygB$}dUO1h;$!pFq@=+ZyB}O9U%k{lZC~FL#@a;YR$p 
zyAV|dX>Wpm*tF3@H@vu|Ww(ViQ=neGAHeEVJR^9GD&>J2{~2foC)DiM*)%c`f6e(# z9^Ur*mR#!lFY!5Z&lP-rpVVmmW}C0wt@}Q~Xm#r;r`l>jlnP2-Vv^sH2lg6B`p(Pc zHF(oY4tw&YQf4mYd^s0Fo3*Sp+F*wz4Nx0y+mSMVG;f{Z8$oaLCIEB=q2rHA@yCVI zLV8&0l=i=hqC7vQcwx^H_JLoxaCwgns$gNl7ileaI1{CH4>3Q(Z2QfOdoQCHTROM3 z%ch-boQdaJy+6H>M0weVrhm6>tQlyoR#7#yG>3Dj7F>uXd(2Nj25c5U=K)u=^FqxU zN&32fA^c-Rq6phYAa(dPd0r-u9F2PDm1?I1zSp&u3qaDtsS-`U?(|3F8=g)F%D(|X zeuS1iJ)G2;GHOjsYNUSFtS#U`F40@ijii<JvTVFeWjt>s!G^o{P4W$yl~kN zw$2Ci37Rpcf~#`9dQWQ9*3`B87FdiVX$-TYEiUmJA@2 zU}bW3?%#{XC`WSy0{9(=N1nWe$7O`W<_TU+wTMO1EV~XQ;Xdi~9<>5ohY@O0Fb~Cl z^X9;A(Fv5%1BPrbRrL1ek34Iz=2^butPFX3M`o;B8N~D1>U*J$c~HWX>on7xyNDB% zDvHWjSMUuPVbOdBr(Fvia!B-PFVTx5&*)(#z$aVcA+oOi}xN(2B%fpMWIElmF^3)bX~|Bck&{{XASEke8js3w!DYv z>8O+15s#>B86Sc?LESO^+zVZ2E4g1Jqs)pLeg{Z!lH9EE$fit+(vl&6V?1)@EK!m* z^0=|w2b=~f-pBCvNtR{@t&_rIHa-vLCMS$E`)=hJ&Qv;_=R;dy=c9Qf1z0FV%SIM) zkk0H3pM?VAhkqRcmt>PN9`hUHvt9dDHqaxb@~ft?r3WinGy17sH6M$Vv@q;`vbD}Q14u0Rq@NBB4qIDII zF50S$WBt|(fT6ZzM--7BX8YD;#=0J@ywHu6CMXNn7S|PlPjAx8+HPuBm11j_$K^zx z>;f0{Vp`Qwi#gUo7*u$GO&iw7n1fgO7-adarY_CiGem6!{xR;&ilon$HJ-JyfOk$-t#RN&Y+_gqQT%tiBpm}L=7q3FnHUPg{m1ZJx=gt#6vl7W;dq0d-$yfxZN3}}bQUcp z%m^5Mn=0ulfC(Oddr9n~7>MV-^c~R%5urv#cQW8$rA^c5*pvdR(`_EB(k-s`fru$W z>#S}K|7A1M@q%D_LOW}++M53cW!{AZP#9Lf6M%eJNK{2`(E4-;8Dybs!O~i8h8uq4UH78hifk0>A|D<)A}Z;~Rr=>23(5z&C)?~|sUcgfB66j(vEVJD4UBaUnn zv;9^Xeu~)6If8djh+XXG)OwOOZ)-l>P+#nY8>|>sA#ShXFzs zz3CPK5+Q8L=+e~bf1DHoFHSg@2We!dwrN>P_nn!Je^Ip>owlKmq9b|SiN7bT;a{Pr z<W<3VD*4h z`CY`E8v%tPX1&%*!`+NNi&HiAA|NT=pQ@(yaZCwXnZ@H-ViiFY=5}D? 
z>Y6S-l!j>~SyB3@(XhQP;xx`|A`|qtFt15%obs^Dn68qrL1=;}z&Qiemm)0EF0WFs z31_M`(mRucfYuT1qc< zF`td={4Pd|Dq1PKx%oVgI=@gSithC)CW0hrykArGe^BVZ4+Xh**2dYa%jC#Qha}0p zj&eAVp9N7tJ%E%S2ZC@v>6e z4NceY!H}4Cmix59Gn5}Mn{&0eEsNHnbg--2=`xV9S09I`XATr~aVJz}me{K2JU^85c#OHVhhzZdJ_Wa4vBj2i{ zq#K=gVcfXWHijq5r9W2^Tc^T`IW^}Aw{fLu!3~ih*y+#xCT}rM&DDNh;9nx_;Nr1* zME`|)Nr)`xl%8osRoGhSL=(p5A}LAcJ5W)MiT61;A)#bBk;Ec0a^}lnO`7Ujzy8*k zf0K9!7G4ys5*deyrXO`cO67^noq@sEz)d#`C)q)r$r+>b73BO7ZVe8NlXC?Uq zL69VL)t4SD+=XcU>mE$krE(eL^sFC?SO;T`p!wJM0>E^yoK)!6MjIJ}TF!3byUD6f z1on)YNw&TekC5g3MpCP*>EhCCf1@h&e?%8w2RveE`U&&Ztj;YbVFoaGYxwM>bPs)= zwc0CuZvP6IKyf5oC@7{h`9*AiM_*f`%Um~1Zc1@%UvXT$fJToGkeZsJgJxRqF1i&+ zkVTScut6<5PWAIs>=vC(>x4&7vrZ&NqMGKp>H%w|dZT0g#C^Mx#S$ruwKmw%e@Na@ z-*HrxD@T%o2_5E_PE4d+X!GOjhD{PLamq)v|#jsAn#|(0a;VC)G zK9q=~k3G8eXZzvEfg!9)x5cXef6~ba&!{)}Ou~JJ;+EQ2-Afow>Z{D^^j&FHHA}9Z z^Yn4=R8_XvT7=j^PpKcwnKaFRYn{}1Hh`}o3a*e8;M^I~Q;O(f5@f49_Ppsv z12^(zV3W%3<4sgDIVqlyn*tvAkZM;{lX#QYbR~_Ac9Cboo-vZ>cMoIbe@~e%nopzn zx>Yn=)pPP+`gmKwgMXwkw&d6-K=|D~Po&}n%Z^kQTn9&w46Ai~I^wkiW@05cJE`o7 z4%}HjW#7cRA$UG(S>x=Q_K=DjD+P;Ul8?<^F45o~E$V`5y<5hX<2zw9iM9vwQ@tv2 zn*yWsW(usi!4!7J+_$sZyG&3tERM}`0({Q6YeSm7_;32hO8@Ee|gjRhwC5m6}> zt&=0%(_{j7Ram7_ zvRG~BzP#zm%pQN3RtkdeOM>G0_a=ufIGCRF=Z<6**^0U<^+zbf#J68Dta7>YIa3x7 zQuIedrPbVl9ubWp_lUcJ#*4yyVdW=)~ z@wZS0jbI6t?yWAte}2DY2lMaN4{k1vO4o(fLMMTL5LdxW6Na#oi7s{W3Bh=?)suNz z&?g!~Qu)$aj>P(Tt#duLCTFUPN~qAF@gUe|-_AJRwSIOGWjg~7RBH(@$5@D+Ld;LC zpGqF7kW?ibysAV{@5#mBB-H<}071=9Q!l`xr#mMRkyA||w~V4NnqT5$MWU-b(j zf;mkBQWns~?Y?U~O07)5l3*)K_w@Psy&V?P+D)}PFIFVJo-$KN0a_8UOFe$coiH7Z zpm)s4=2_BQ!aI$KqRo0#yYSVX`x%^9e^}WQKk6LGfAC7Ov!a&(B&|Kua*Fd`4{Cjp z9N)Zb{w6Y}pu2bN`;Z`KJSG0zS={i08>3Wv1>yI$#{#rib5g~MGoc27(wm+ora!fZ z?|e@+w`-}H?@}GVp|Cz!ob~TR6T8aV#0cJS#FnIPgS2d@>jwjCKH^b&4hB1c5|Jw0 z*8Y-Of3_89Z@L>sp#Z+HNGWmYdgFv1J2-==vKeR-(Pvln9n66wAG;w^X1|bUe+%uq zfGIZ$r}gxChE3oWrTSw3s^pdUA^h!y;YT8(cq*son=7s}XuZqN&&w@-EYi_8netu= 
z(y`UY?rQD}?p|ay_?SSELcPKibpF{YqJ?Zwe~#S{G^X!z#%o$^<_!Or4TkIi;0-()utZc~L=u>JCmPdA?b@gWjDj7fTle(;y@{@Fwz(RB0y z`~bh7x@}+C9MoZtJTHMmk0XxXnW00tPg0RAy9cfvYMgOsH08tA*Jx$};u}+ue+}X0 z`5mNsXaDW%!ae4lIg>_`(c&}xahvL;Ce_7()S4j%mkvv*Cp4mR=)NjC{vRU==mTDG z@kg^NX7nmtl(*m$jN%MPX+8&sf;<2f=0+dYmkw1SlHCR%4w&Ow(fLs!UcNV3CKF}G zn*KE0&Nn|tOc&~ioxK|lzV1Wie`m8HQDbR`=fg@^0NQ;u?#+YMmFtc%gHzPX_uEX9 zT;CQS9B+N7x$Q-Ttsp4bs72y8kB82M!;YLuVNU+(g>Lh;gh*;+41%BFo{Vi*=v0Sb zZuC?-!?_?G?ZeNysu<)4pdh66+PMN~i6VcatHXf+UhnH6($`aD(xh!8f4YR3QtiO) zotjmxR53`z?YdXXUblRH16B$vY_t}Y`i#fE#G0m8WnHA~G#Yl^ocr9o#oGoCBDX;u zVmCaWa&`ACJR{v<1&&aA)MKw>_T+BsjDXe^I0Ptn5hH9;sWf5Qa7vJ_omKXctsr*p zmGU#Kb)YBrA#UR-CjMP0f8l4^+=6XEiz{ z@-j=-GmVO`4Rk@O7B_u!W~mL!=mmQrM`7BldWNPbSqNgWYV~>Z+>2A>=zA=w(AL9Y z_XGkxv^}JT=#ZEP^_jj{@V2RoT*ee&o!Ck7@F9H za%iwOUmNxKmTJ+BXm><^q}RzgE#08jwG}IdTki`WiLLnwPEB` zwKf(DuRe;FmYlhXyKTcLWU|G>zQrImyJS4IbmSR;Lp5By!R>r6!>?9LEU zN{vP$tBc4qXXhbI$h1d8?c-N}v0)H_&Dm!f>4FBAbxd0eK5yT;OzD2g;I73$xbTUk zdb{5(=|&8WnH@chRUh?!{^E>=gUdqqkFt|&n)Y_Is25bnf5n&D{7J%VRB!|>wZO$7 z5jyypd5HA3?QLPI5|GM zF9L$u2R8NZf1~^q0TnDP59&BMkWvb)#>zVy=KQ8VcL2fy@p(91_e=wrY1c?gS;A|? 
zxp-q-oG&n0JhMSLH1)zy7+cAVl5nbmeT6Lu4+|AFkEtG6ynN>(v$RE!fxSeio4XwqckbxEl5NjPwX}|o=IHu_%%i`Ae>vkzYow~7y=v)J&9O*51WigW zA@aT)tIYY4!s(&;2iyKk_V0?%pV)i#%X|iQ&k$CmNG%*+-zrYHY24Jem2Qx2x;Ds- zVd)Ul7&UCcsX87D08c+HLqSI9W?`58Vku27JBNCjY+_by4JzNe7*cq@+0IWAugvG2 zONyuse`&sml|fbL-6Y8jA}j+!%btoFMhP7POE|^0qS~5*+c}ZKeb$0??>T2i6N2_v zN#Gn`h`&B9ewj#q)ewb#fcG9i+Z+34SaGYuP9t1Uqn&XaLx*#{x)?!?lDIJ8H7Y{q zH)<8RR(VxN0PPf$DTL>r8D|cs(xhOuXX_3zBj;#jvKk z(RCJBGtSYq*n%^7NwmZ1qK9w+Amld1n6#w~r3|@gO+Ow~4~L|c>T?Fx=p}_Ik8r2N zf4yRnw{kZKEsjYWfFV^24}INjE`HQZNCLvLY?5*{!$E8>7aK6Z4~EFWY-!ZQ`rlDC zGE9R>R6d{MDHbdfAXs0&=vzzBzM-?mQNM(2nj)GLo|5kotXVboweu!P#7CNKN1$#_ zk)#VR{chevwjnTBxCFksjXP!UH9=yxe=afeA=(e3IE)9ogPHMTR|&Mi{U(Gp{-!5( z_Is1o8>3z}h3BZxFi0=Ypf>Epi`{`xO;qIDR=J28ma(@<$b;SR%M9(zf_p4c_q+nZg;xg`LPOhd#4ZD zE80rhm~?l6psw|H;2izv5Q7Sle}?_Y`i4haJ%f!MI4l9lAahz<`T9j)qDw-_G1lc^ zPFZ!>80_m3ToUJY$q4qXqKTHtIEtXq{65NeN6`k(M{z-XOOO;y(5~U8yY&3nk>x?N z1E!2!_*0HdC$`tvXmuXXb+ae#QfZjyx8w1y)%97m{|DrxeAWtOZe(+Gv$%_S1`9DU zGB7zYH!wCaGm|`$7z;5mGB7zYH!wCaGm~GE$OADTAd_%aCx7i$U2oeu41M>n;Nu;J zhL-#d6a$)cX^Y-&U6MA#f;_m6niz0m&sMs0zka0THBP!=xO=`2k|l~fJUlXJ2jGGa zNelv{@Ilg?fCp0gAjlL-5E6pe1;K`hrVvCB_z;1J!@!9_x-P9<3Jl0AVF2lzo;o;r zt`y*Hx>ODh%72Yk8{I%i`9=O$Pq^H7C*3Z3Rk|Qrx6N<-WPhj>PTR~+S}-? 
zJGW=72X`v>80}RAhu+r1O9YkTB?ix4NaxfRVApmwndhCZ#9iQAHCdfkx~y$i_>ivY z_xhy9%UW+raztdE}TfCjDMT3Ce)qInvAnAdQ(T}zgM{kqHZAZme^U5JPinWg#1*cdtCiYdjs>o9 zgMTHK8-*2C++PdzsO_pY5S6O)a)DK`ZLr1$8}&q?O}@jXNShUE)T>%4)Za?{hR^sN zzpJ`@W7QhwRHN`Z{rZpEPw&56U5-in$>g`gBe7X&qp?-D)%Q>0c69t|lz;3h_e9$5@cBTt;McMd?d^j$TQ(Bke%dJW zNjrXV4@Bm#xB1Kt^SaSoBi)RH|E51TEYLJx=vBktd*eDO?>2c>E{IL^-`N~rHK; z9Sj8w^{ovIaj7K?t&AN^a2eSew-@lF@dDrFUxMv?NmnCx0PhNB4>&OvwB;NvSP* zD~0|ZAv;sydYbgoKEsNo{q5A{UUwuK6J2V4_5h1|C&2iKU_mOX6T$**)k0A2p^V;S z6J>rhEa-hic!@<9qulNcy&}`pNvzY%{H#3Jv=-}umizrx0$f{^(x&>P)N|LVNuqSd zo16@Sb`|gO5dm9etO+fa@!2v}`XF7xN`GR$=D_^2PHd%q<=g-pR^?!vXWO7^NaQqD z<22Q*rwAdFkU=jlDsgSKdm{oSeqQuwgO%=NN#6jA2#b|WtBXeCoV|ixFO3!9$l!O@ zK(N9-uAbds+NSyQ4b@O=y_$UOV$)?jvRdJUC$wZmK3jK3LZ=X{lf83J0m6rnJ<He>mf$%u|Lzj40BZR##X)!Ir7$F^xT@u!M?axdNRKDXnQ7iM1#HC@8I~D#bui!uL zUTT)5+R3^saS3n5^oQo!L#xuxk6250XPnya?gFO+Xov5rt9)yCYGcN&hlEO@?0pKj zXH1?|d{?#(RA-)jWVM66IeJS6oH}?H1R4HwjB?!#_O)N%iuJ?r-YHt;OB1w#8i~~( zvZ0)M29sQC{1koIJ9>kRRIlIlzBHynn`GKtb;R>i32#CnD*VY`oYG6>L`Do)+dna& z8p>$|or$dpujYd#9B^3hzA_ zxtu%_7q(4Z&aM*{cIS_lwiVTai@R$F<<+8<_4R|5O|i3r27XAJvU5a6aAwW8wINVC zyI}CLcVM8hw~3uTF;G0Xh@I}2%hvXmH^55!>qQqg{HPx2gRP!%a$zO7xHe0%a4LlV zhau1oY&GaGY_+3of0TNJg$5zk#gtlzg$Kqr_v3^Gab>^lul}K+;@Xy6 z*ndc%KfPb1?H}@jiW)hKc>RjjB0rmg*Gt)}R^2ifQ<(5K&-#6DWHq*7rEP0a({8V! 
zT=JSjj98CrVfLDdt#fT zF+x@z3D7rA49)@qT__^z28MGi+DqV3vB7u335dxciR0Qbr6>aV`@~9GiYztWTiP~+ zI0QxSf)XwkZOYO+734kb5o=vmh=Obj%8JBwgc_OWx9=i6BCsY1Y4s3y$2kY&y!uj) zcc$SbGxEne3`)DzTH5DhN*9i#8m1(WSqV6XYTE+2Lja{^KcoW5X4g^ zTst$-M2yQa39NJDK!G76fTObFL&xr^$_xft-y4ubr4(jT&0_b?GX|zjs>Y-V%Ge+a zP8dkbV2$(1cvXH{LK!CwxpP;1_4qasi-QP~H%Oa@W__KmxrfN@aXH}YsT;{p& zAq^7=M#b}o9qt_+Nc%Y)zhOUgkLl(+<~DAeU+F-~D}OE+;PKyLN#;5 z9kJ*QTnc20Ru1oTIJkUUDGf(it{@Od+u9Uk;oC8QP8%`8**F$l#_G2CI5)7LnCag@ zop@KH_U0%k00b&Ad&V~mz26QWReB1SYZ{r6DC4-MguD5apoX zw7iY2p5%u{YeluaPM1p!EsBZA%mU1?uY^&fLjxWfHAStbVh|<|6rha?*g8f@}m*a+r8ZTr-Q@aR|rk_9qGJuz`SDx^WcD@mR*X5ViSvCa&;KZLof z=f3fguVH^{jqy1s;RFS?C$f==G;SRV_qCB~jjJ+CXpZH<1(Z8Q9o-?c6}Pdgmq+8Bnlx2lcHwV)SG7Z9 z)+X*P)+#6pio(W_17q1;Rzcgno>^q{%R<-30U%TBe`mMFk8O)^&RH%HSI#nQ z*d-tc-89_D%@0l1Z6|LOg!Fa(4hRbCr5Tffm9P5sftA#56r^ENQNrN6W88n6O8Gqb zQ&CVJwTU#8j71V?4ba&Ceiosfv4v~^{wb)bgI!>4=9z0~Rf|>_nhfvlA#2Q2;Bz2$ zHx9ks=#)pC6r$0@j)5isZ|=z@-`tW9HVif2-z8(02ZRJ+XhqN#p+cdInHQGo{5|s~ zb)i3&9fxCn4R%DGG{fXlAEA;LgU}-ydSRs*LsX$8V^swD*=JPLY*Z=<-}Ip)Z*4Es z7XsyIJod#&WnC|ExM^f@8OhOh>z(#DyAMr0371a==b7Nx2w z4ygsLi6ew-#FpVADyUV*)^}1pj;eqViWV@eH?cn!as9WdoX6L~XwtFJV-;s|ln&uH zU7kAy^1ACk0*vV4bYQS;PIPK_fu^e@*lkFZL4CuPhxaD05NkpQvOh~j*Ig0Z0<}-Y zJE4apx5`?MeNx*#H`%d>@FGr^b|C6lrtB6KBM;ASm{OqDAz{uh5i5r^w!db9TOPa# z=B-X&2<;D&Gu=O-eRXsFA;PN=U$yO8!D5PSFa+piSSAqBloW@P{h2H3Gd970uxZ!<$M)>NU zOT93O0=~ZO{8|e{$8Mshw=fVog#)oT74hn=o(y?6+#}HoNw<=41B3wSwLBWQNP_7! zSO0v5WJwyv*cO^>V@(o;>2w6O{JP*eaR1kX75PXj=TgpMfxDRUCQ7u_+VruGscsMynf@kvNMR;?p!xMnBWA1iBDh*T%{uj@nUY7B6T|iXW)*YSZar1Gv`FqaEE>~PASNW=`6M|`uG%py z406Wxm^joMqKZ+?iky6$6k#12dP*gNVKlVsr_p;J?pQ{=cIfC&%$0!8>1KyqjoYhv zJN#pQ4JBpl23PxT1l%Z%PgW~ zcYwgjNAdm>=>ljwAfN~R5VQWKQ_X0)2n)UN2Rn?~ku)~e_vKf*&Qco&yQ%{V_rV1(V|-t zV63K3a9l@|K1Hv#M2e(Bp4pVR#RkinO#{#+4X>4IRq4&&2$?ZGALJ~|9L7>Kw#$>r z$(v8{q5zI<=MzBkWD@}{(Ye9%`#We+|0J~db<~J$L8ye;F0NzD=IfB7V#<|dMzUVa zYV5CA%WNugu|BJ3<)}<;1}yg|g)$mL$__Ks9OYAn6^`(MaoBQb$9Ea;e;rCF$I#$$YTZiXMo7&`cIqD{tyBDz^5i`q%ruS>_D|g4! 
z$9#RZtX7fd^XP4{^QOk87e4V9-SKkse(JnYJBJW>Tps-q0;X^j{RurM|0E4%Xl3xP z0L9ncpRfk)e}y*`Ty4JM88W)Yf3J3iRt~syU)^%J)N+RQ){b`ihW5A&|1|JhTRD97 z+vENTk9-Y~G&C^P<+FCdRj2uCU}j>bVx^(SrKe-1VrF8}grb(ScK8aJe06Ko$ziNEmM)8aDzBaq*o z4wvQcEf?2UK;}!GuMtqx3a+%cO#h5faHYd#{`W0CF6+Ms|3?GbKf3-~1u0xwTx#Aw z@g1u_no?{BtXN_ zkTHO|;H{rTz5(!hxDX-Xauqu3J6*EVc_o77$i8f{;Yue1mDF=`yrunW8Y-UI={`NH z9~+miUfLIzHlK&AyQ-hJYruRf>MnAoRG@eX#H0y<#5OZUg@yG;kMO|gKPC6V7+l|g zNQtSaFp6+RI|M{J20yem1kgyEFZeDPozA?^WI=m?xW?E(D+Oe_Cix-KBfj?rP$we) zETH?gw{QzpNI(T#ksP1!9mWNDOcqBATF(HUDgnY!kBeMmfLu55_U6Wg!dcl_DT9Jr zKI0WmDL(vzyR%J$)s*?U-ls?_CLRscTDhi^uL~NlmU@fdL(Al+j&;|&;7$Ts7cYdY zMM0oE0)Q|O%8-2=t4}5%v(ZRT93VJA2YH-uj0*x^lWZvoDRK?u)=|bAiCrb~mc(#I zib_`w^rxTp_YBx2K$~=tAwqj2;HOXuZr_aEkni0{5d3_A{@4MHO)=>V5#-Qe2%@Y@ zqH83`pz%(ARAp{V)O4^l*gEXGRxG8c54l3((7QvNxzdzzdjl~oegN61C?s!zT#>RQ zVqpNy>{|tVstC;}XSs?-VZBX0y#SYAAvqD&CvnV}12Pvd+T7L~gWPT0;FPIw!y6Df z*e!g|WSKaL0Ynx)?+`hf9|(CK4B#I(u^^84C}cc(3I#l>A;C8}wOv5QCfhXPK$2ke zP%^MX^#G3^Sj4Y;Pk;(P#FH> zokDhKsJUKfsDNoCH8gN9-MERpDkxjiYHPlt7bH@;l`~*=8FMqt6@)lS5J)Wv zry|^mzd`-S5fq^G3oM~FA^H6zB?1nF1rR?#)N{WS5sL}u<5I=LoPHU$pK0ufRy*{o z%x}lRu8k$h4st>6nP}e1HDKWa)fs26VK;WUFE$}#sG^Hy7b#>U-)XKt>DIbx(_m5` z>uj^jX+KMW*IU={IswuN_z6y?jDUUbFvej&FKkGEOmuC{Tz)YWCnr@S&5xrvG z@S!?kJV8F;Kl#?@_eeb_Wk#LjKKFXH_G~g)j1`lC;IAz(t!?9TNkCcOP%@ z?C$YFO>8jtJd{vD7b|8wpZpE7aO5qv8IdZ|72SU%a~XX3{Y>-h{S5UiIE;kkX0W6G zBXWIs0}x;O`qvZu8_R}Y;|_KRXuHxI!$uy@>$U!s$E)r75@K$|^%5fQO+2>;kjN!; zFe}STMMM>p0oRb{kVVX`uwDwMN1d>JV#JdPY;5h@rYZy5fjN@#G zI%;_Z){UCZ&trV^h-3p$q-ZYV6tE0~LL$R&=+mqS%7Q_I3R4DC1VXo6u;_h>6|e{; zoE5b8Lo1R>lsX(tnz6m!TO{MjM$*uQ%e`ubzx1a0OyC=6`tq7%?N&h42y}4lP;{l5 z;5ACzYC{BVAmDzQpv5QJ&~X@GD+~+;LIg|${0z7vGT$???nW@Q7&d#J%!=X)v{@uU zLh1Dc&E4`a4VB0sBr~wxG9oarCG=1>PGVpE))BJD8J^)k74Fji4oQEsHEJj*hR)kj zB7%IO<2Hnrif{f@*6LHKH3ZjDpKD_sA}C7oZ5-%SAVwU$?>7u3Z(<2XCEw|^Q3Lil z-?tKYY(C*>S_h1GAF4d^62x_XY8g1UFt90ch5%hO?Aa~{TKzGg`u+?W=P^v^F1}h5 z3PAB*x*7@zzu{WCGx`b)b$`nNC!KFgjVjPi7_B~<{TDjxDmV||4@rLVbEIxa^kLYB 
zK>g`GxdO6rB=n@J3C#&|VxWeghWLe1b=flltdh+l#{~|uS#n^u0p;6ZO@tSI?z`PC z6j@QmJ9Hj+T7ipOjxP8LzD{p!8-b6z)({xE;x!PcLy{n|5?7(keSCVXwW&+|4DiOG zrvl8hJZ;5hGLofI3fg4JY>{b#nZlI?3o@sr!X<|}W{)!3GbRkcTY;!ogpo;->C);_ zt77at?1HUg?gE$vv&RDt#4X&MNfY^~GvvqVN3X}k$BRdtcZ)ELL7=^v@B<>oB53;& z7z0`ZkOPbZ`U5dxNnw>?)nXN4JcR;=g(FmnN{ORnoMilED`gL5A7!Ft%VpvU%w3CmyFIUms~1w7kUZ52L=TN zWeUXzwFu=7Kt+N@(qq6eYSBBUR!LCFcWLxh7&6gT(`K8NnXH)EF&LXBnJt+7SZJ$Q zt}e_oEJm*ApH(YSET7MF6gQVqS6-E5S7MiTZ+_4^hhAxCbHHZLl)?6u3Y?J2q-TqE zu+=nQ*U{{_Fm7pem^^+sn!Y=`Z*p69n|Iqh(wxDZTAa=<$j;NQ&@T7Z=o|%LuA$$; z*u$t`U@@?n-Kxw@#T%3{X&S4UzDnCB&$kug72?Y$%Lf+96ta#~9mpME9Vm~C#^9#G zlfaVPkvvGXpl3Cn;g+(_*kt)=Dzgn9SZ6FuVM`-QnP6TwUD0JTF|tCjB4S)Ma2lxC zT_e1^Q`g4PTNfUfOl3y(*UR(YX-B~Shp>zKr-V`_%3OWL@m>pE;VG_paq zT()-NGY@R8Xl{Kfaq75Wxd69csq1uJ(I>i3rS<$nG`QtMB z`Q<^?9gRbeQNi`VUAeuw$->q7F2w zvuLAh`|YxRvv3=At99YI5qv`5ZyY;Go6X#j)e+bc?BV@re?5A=db-HI>{VxTQ#Bm@ zR{7@o!T#0?qymTzs151>ss?rgbPD7RTL+5(G7rvq5OHW=|6rdbsVf~T$x64cSYv(V zw7%}dJ^`(;zAOc^ljWH|%WG4!?dIU@&hC!+ zuD7I(IElE4^qgcyW)t(Z;Y>n7qUd8BcHI83C21Ndy&Q&|R$)*AsxB%!u4`my=y(3u?)Q&{ew~4 zZvV}Ri@$x7{qHgKE%00Bt86GgkW&y(KJRUe?Oln_F~-z()elwk`WFG;JPg@R7L_*D zHxBln?(Pre8A}=KtT=Y^_HcW&H*_~@1Z;6w!^S;E4dzvsYxnlx2C{W8 zKovShy!N?2-sD&CoMXJHE~%hM$|&79sSP)__0IJ~ZG}VihwI`73v~;%Cy5Vk4;vFz zlPF0~wYpSm^znY%O!WK~TnHyaxu<~DZZi%&iOV45POKkYO!$0OJu$9CHbM?3|Ka)L zeC9-9r!Tk8VZ~0@zw3d9!!SqV0dHO=_f@0;X{rlNHsu|I|#b!o3Kc`<350mXWLDzm=`MUlX6oSJ``<8RnPg=g$hx zDOR3Os}eNNwcWkfWvu12H9r@vhA*l(A>AjKTo@V77}s0va166Ov^PJ{U#mS3ZzS#W z#do8C!NAMl^89c7r8~x-9u9I#_$`Cm;h4RQU)PiHTZGHfYKr2DeDm}2HOEcImDPW$ zuc*DNMSEPnykF=i4Vp!t;%K|_-3#=G!V!y0!)C>0op5J5zR0eg{dPAl%51h}`Zz1J zmpz>L{Ql6J>Lm3#eEV@2t$Ecxb(8-XEllmEWpQ$K#BviqC@oCos%6%4=GgokHkr6k z9$%f+gyM33cOP@gy|%T{w1htEdT(?yzS?NCcG5!GwC+y)IC0x5=p1}O85Hh04_pb( z2q%fN@NWLr3G3kMgnbEneDTD3edr$j6rJ@UcQx9Y6{gMTRq1MR-ITl7{-~m@vYALc zn|;TT$c24U{U-U^NI8j><@xA7b%_l;3+9$Jn&thZ{|vp0+WMJsXS}Mjw$+>LN*%1F z*v{nDei-?f{?J%zjRQCCF6`ZSU48BSt0UUY`;&ho8154c67EUwf4ZiBJiR~O*_Tr& 
z%FoZIYj0?P``2ZY!`1w^6G{8u{;SlNH~K$aI5}ay|K-8aG11cfzk6_*NzMxL%P7H< z?ab{{2tRQN?=ajTac9^?kU?ZaMSWy|f&z$~1VDw5;7OGX6uJO~gMaD^B0>B6=`BLu ze3u#MmK6}(Uym?{ysTur-AH>MRNY)YY+PJ6zVuvd0g!p~1L<*40rX+WRYAHq?bbj* z9PN7qLi-K~s{+`yZDa%@{3HN0_twstnpz?u^dk(Oxcw>I8Zl zLsmqq%tjjFc76m%u0{&smT|rZxM913Xd1x&BgyRSQ-%Zp4N|K1*dbA_MJSyh68c5x z4G;iCLLY0J9c=hBt4S7sOOf$j7)Yq7UJ?9ais}B*kkQL^z5KHtEqOw(fQdG+Bg~^f&)*EN z=mtWF0e8rOV(_J+0iw|PhUG(V1H6F?5a1*79XJV?N(9K12c(XdYY6~~mv#xz$=}Qd zV9wWV3Pj3}$p%aVM7ism4K$38Fb`Bvw>S;phOa4v-_>{IJl?VxlmfU^zJ>3YY7nmc zyfQ3E!2SH8G2i2Oo~MXP(8&DDa`|OA@9-TkS|Bw2Uvj{wa2def0X+lYQ-Ef5zbyg$ z_F1lB$AS><8Ql_aLCye2?ONRyYeM0IZR|na#=HWc2-NP0hu|540G<|7fxx^L;EPcq z0EEXgkNsHyR4SMj16zRE94QiOy9W#woU5CHhBAP@8>||!t*4~BpeIH%pQ4|lB!POs zWb~ch_t~#dZ@#ujg}8#-36cpuJ(99dNhexIy(YO9cuDt+9Se4_3w;O6#;XZUgU*Vv z3ZxNkF{0JywD;RK(X~q(*#^?3A6&Qnm7WJ)2b4FK_m9nJoPkc_7)Uh`b3Zt|BwPXu z0)_7w5Qh-&0q{Kxvc4tK4nKpwyY^}5k=8~n@wSO#5lO`(i3Sm%$`O#JHux%tn*G!y z;v{Oviyvk)5Vk{8<UMVl}5y zmeP``CfzRJDdtHn98aHoo4h+*8DAc2o_LzBhh7*FAJG~S zeamprJh?l$%2aKt_1Q+6_)KO+=8^a_(I^oxF}h4b(Rv6e5v{gM9nxV zb!Og1jYfHAK521k6>9Bj*~?rm+3d>ff==O1b?=d`DclvEaNhLplH9P|PTefs{B3>w zve@!zvV66C_5l|C-?m}bT)XN-HKCcJaiWpKlJZM)8wxrtlMRzF<_9K34CRcigLyS; zs;U)23puR4977)YuIbO@(465a;h8AwDB2X~6!Mg(6gaA+s-P;C4JS1(y(@ib=9>DR zgF$Ay`t=h<>j%FoODQ*jay9BqxJ}x;pk+cSQ z&cmM`Z8uF^N9=+VyA#*WiWYBXs!otje4EIJhC#hToufpvo3`C9oRjzJ~d(^u$ zyDa8n;|k@9L~Z5e$sshdUtwRzg@gKd|7;z24MlIfINaK01*OF0_TFJgG2yz z1nKPt>~8hLp`Otu(3nF3;(OzF3gm!fgwaHPu}7n#V5lHEb8S*|@p5qnq0|!;;O}b{ zi5{965)O|<5l3+r8W!#soEMf9bQeq(6i-&6P_5q+icpKdE#gGM!9;c^aI!0XoLLtS z-aXra+P$En*GQ^ATe17KUphKj6{Fp)YD@M569;?q`)4R(PwBoy)tz>;^FGL8d~5J) z+~ENEVdS;uli^z|NOC}3z_dVepJu4OXq70k=q`{n@2@S#I_(?Zz0!o3gnUx^(&^H1 z(}0GjhFLyB-LM*RO?pn^cIsY+d>a0i5!3k#MZ`Rw=~xL8dsU;>l&BAhdyxo&bV7es z=z6od4P#-$>=D^9_=CH>v^}d)?BUztS1(2*MrTLk4yTWxw{FG@bJ)tjg({8JW+`XQ z?w0NQj?-vMTuYJVt5KoRNJGp`tFVnZ*&NxL4jq&F%Yy;FWb5#?rqKq`4Rj@S?PAqt!t=!^1t-23(P;~snJa^< 
zi=CU6-m`#_!2RG{KajB6yoVlT%qw{<`*qj0D|#nS^G;eijlH$K1XcWno)-?#4h-4V-=W$5#ppFmjfSg&^mpKva-U^tdt!#vB5 ziQlsyeov`|spT%LErk9qU@~==aNj)kna#wWz^3F*|Ma+vct}mFA2Al6jGaKvNN3G< z_PXI42)qstMGa_ssNcKM#73 zT=>zNm85;N67)I2`Qdf~v#)Q)`X$ z*mL838oU%vi+jwC<=y06|7j=EN4+?vOdH(I;e@C`ZMvl zTDli>vUHMMs6psNxKemB;yGg7t@f_}YBVBwb~mt-^yBwq;&geom-B18vEFdsmHG5( zD!151NeAo8!h_|Z!O`BVv*~3~ao9$A2ls>Td+w|L-C*qTne>x%Vs<3Avj^ncjSG>b z(ZS|p!0qe*+`Rt5CjQ6f_0MMYzc#Obz?FZ)5C6el?EiN(<)d zIcrN@tN*kJncCSq@SEt`;WB(NkkI`XK}YwseOER$a4`AvZ1gNFe+twWn#9bEOZQc1 z{&i)fp~3xsD*x&EXU0FX|KL0{P|S>96b%b5Gu@w;#KiI!sbcv9L;bz`D}j~v&-?gy z?-w^l#=qnL&i$JI_x#f7zxDak_E(>;u?!3hf9L$E|I_+!UH;T8^z?s6{AL5^}my=|5*WlOzgkR>92W7 z=~{jPa{rszS?C&np^tyOxvv>~f1yW8W?EcIIu_b5FY(K;m|1AQfJf0U<2TjkwKBH& zgFaI8+Ux&;HUE6hFNE(Of|BkJ#jk54VrXh?;(*J<#Dq&N?_g-Dgv-Q=ORe%xGZX{; z7xAZA1egA6JxJ=h{IykF+OOjuExyvj4RLAzK}i3z!2UM;dxiazEdHh8zhKONXgb`# zX7`1o{zKE_(*6s%{8w^0Eh{bS{~3CO`funlqQy~VO&x(Lq_uUnsny)Z+1Z+)sTI`O z4=i}=0)e8bwUr&@+jk0!&1QOL`ppZj)=!7ejWV{&(lU9);@0eu-Sjkmr|F04pl6#wse#AQTiZn5>+-yu;y38hog#n4)0cJ`Hg%fEXNshY2Ls zd7Q$cU;$X2*#R)YeInDdBZIROk-H{FM!@x6l5_K+xp{i#dIx}VhG1gs9RON>%7n7B zxwbB;sJ8n4exAhlS;&Cx8WWN&ABkK0(Q9l zz=2ZZHD_if_Ci6CmXcULwNMkUu}6Y?pJOlRczb_C^c4WI__0fm-hUm4;VvbPf@JuRJmg*}zcWtBw*kizoJ2f&aN)QFjyS;(i|HbV}DINMhO^~x$R z_L>I#2b5gT0I12u!O_n6&ExP28;cv;(+8%?3X6)$M@c~a*QV_T+=PiKOKcPKCX(<>V*8z2QQFTlGcg>Mcs zh_t@)eAcTbl05qmOvk&^z$N=Cr9J>fi!yBeQ>}LM}bj~CcnG_R1 zi3Ff`77VYW$@Y=X;rrm5_TlJH%E~Iz;-M?&r%##CP*yiUk5m>_K#4Pr3IHaVNFWy; zxQ|cqd_A+w)?L*G=I zRmOR#Ej~NNv9A{6K8v9~!}mYShd!l)3)arhJ`(X)ya^9KifUft?$1EWO$hmq0Op2m{@XjP*M)eWAqn_Rj z!R-3^0DcXi()WqW1rWZ+9TNPl>*qVTugcyh?i5?U_jhV?AHjXN9e|pWPu>gwlt^xf zzPx8}Uytb*@M~Wd$xmD@fbeA;kemLo{<$?2h_tOFEAPFW_7BR>#ku!w0i_W`=i=5$i97xn5 zbaBinm#1g|2iqEzkK`2o;sQu4ceRrWNJjY+{PQ#q1gPET0HRzx z>HsJg2a}K8dp;DwjSgE1a$FwU%sW^AM$_@w&RfV=vao#oqv8ApH;rv2_;)R}s5^C? 
z(941QM1LB%Tv`tJ_@JTK&GhE<5b6u75q+2FtKUpg3At)rs8 z(Q*bAH-2JRVy|A_FH*hJJoyfd#7An)sBydLXL5(LV7K&GBIF5Ul7t5iG#$i_yh2%G z5PhMNLT9NymenDNiqWVoI39jSapaqqL>rwg&uXRa#(tbXOR{ZQVAF`Y&Wi5qSPdfIonGx10uSTyM+#E3O zGB@e-NbE{}jZKvUf7`K>*Riug)|q?n;GhXzyN5vPR-=EX=R6D-rwb1lcBZA+*C&x1 zRm$3v(;$V+Vm^sY@cMPx9aZKhKPZ135>ihDNa@E)U{SH}WrKd{hh6-jELkZ{zO0_V zu&Dl=yE$A4Pvw0Z4T3Sn1AiLeRTEcqD+sLn5l zvNiFvaB%taa>8lkItSb!QDUhEZ4k%CGKnTNe$`z{TeWFHQ&x(D6)F&9?IRndSObTo3wJBsX`7PU zx5f%6!ou~@V|ztRdAf@zFSw0oAR9Y|gGCxZtd6%nI!5+H#W2-M>Aw6m5HGO&R(&5m zK$s~6ERXXp# zGHc5-d;LaFwlGgUbvH?rdRPMtI=nhek@Wb^>ZGYa7Iy~vMxd7GaH0oAA?%0DlB|(} zywjW)xEjss1jN{}%Nf#ubWYAT1uI7*x@SYHqus^l>N&536+h(;&R}B-7xAkaZ%n`& zeB_mt>?ejk)p)UazV#>3%Ua->XO7Fw1hzt|w)Nb-$w`tp6mi~|lU7$8Fp(pF+Dy;3 zBwuW;byQhN69+zL$>Z-*I8*KWjSG5ga%2eLXkF!N1zF+Z#OTXihjlw+=cnbp;7D} zkMfM2D60JX?;H38pOAno11WoV?vc2Cgi~zK&6La7heWgt3jy5|$p9DdB`-}0CKP(1 zqvO1|^_6-jw5R^ntf7_~$;_dQ11&Si!RCIYTrS2S`X%Tdk_JfH)eJBfj=| zg@iS7Q>^H$(S8xK`C}Cst@^!7!=viW0tj}~I6sUMi4AZY@cF2I-BM8Bi|2Qp?T;Q> zRc^VktFf1-ZDcUkH=i^~*8PqT>u>0{^LwN;W!{7u>BY$^mp~yM3=i9hbPXT|w;EseEabBKD0FXH6R2kc zEox`7N8?++O=ML0afOM)9^VhPVdOZ0y+M(jBaP+j?9wHyP~<`z4UE;D{;b5kdAhn$ zgqdFA?kETw7k|W1&t;tZ0@a#o4CT+-RoQkV2PxuXEJ;8+T@%V8lEo{8U?W4?)wOFN zvv(XAo!d$rsnak_s7l{#8y6ADn_%2_O)1&(Z2S1a5RxQ-G+c#j{}4BB;J=vpJdwF= z=biZWETyuQMmuHPV44KgU7PR3z6os?a36k3d{+qX0b|KTT75hk7=LH!GjIJmX*;f^ zw(&S_epJu87lgzG=yF^r^`o(Uh_!D!oBoFq*l3Do-%}n*E%u5-e|i6M7Wf(&g#gKq zQI~Ebi^QKOdV8l`w}+rm?h`0+=YEA*rn=vi^%5`g#JfP9oWd!+l<(W$0Hn)=+}DdP zBT%r&NJZ9h`e6Did>qb6w-GIpqDOsX`gv^E6}|UWC&+ow?yz}tXUvF^0qLXrC0Ghn zw4)1+PaKEJS{?ek2sC$9c?K?THKQ{*E)!Y2$KkE*Xri`+9Nva6@a+Z&`>EbK$1c!7tfd|m?a;ImIUg{jH{1zD{zSZLd7Vclm zPgan@2MGHU%Q6juuh;hDNrH&au^yqex=rEs;pJDXWzw=zcCRYH1RP@E3Lk%yHsXyBrhOYwz(z}TaW3hizt3pyg3)HD)rPcPWuM`2ps;Q0 z;7(kyT_)qdICe=w54&8WhM&RDEZ|>Na3~M!z;Ah7V~KU2hPjkuf~^JY^BO2{FSCYG zW-1Z(#ux3o#ME+SHVBCqdho5Rd38`EuQAO>zZpD>fBmeB5K-W)UcxB7{u0&EOf!Il zmPC!u;6Ec?JXt@TuGT~czIhh&OwqP5l}G0uYfkXGR%C&!za4T6Xe)WYE+s2 
zpm^+2G8C{yT&P~&&6G%T!nS}uAXd!Zqu(3Ii}LvWSe09EIuQMU#`(VVn4^i3Pm@Dr zBv3(QWcK*JRzA(*t;!tS=__8_fMWJ~WQTo9hJixgk&p2!m8Rw_a9>|`X=2L^F!eJ@ zbY#C7raJZm_radsY znj5o-Id>x-L~+KFr?TSvUAyA4-FTf%``QJ~@;zO_!?M%bW^>a&|8kld#6Kn|1 zyJYA2QQV0Msa${9oHOe*;Anonj3k*hNT}UMZ8ScsNj-i!te5`4gYVpC1S&m?@CjmUcZZTE=-^hS z=`WJhM(0DW&v6c5r7Bj{+(kx*72v)p$ZRz|5UHOSMOMJv zm&3nYk#ZP7Fs)>8WvY6Va;j!S)V(1NM!v%+4!YxCJke?`y5q-bL?#1go2tzt^#vs2 zVVL~D_A)yx`L`PAM}T_iqZVYI@tSbI`rzU^WIGl8nnG~K!M z0wU?-VB#G{qkRL=z&brBoOLpw(~U}o0lmDtb^Ycz%Hqk;>b5np7i_`QJi>S6%Sn|} zTkp&?*0BjFUz2tMODTB9ceV!e;Ir$kGq{5(L16H9skrlr#hrXQ!`wcP2=8~52PHK< z`}NBPVtf85}(VK7eoC9s!a68=TRqP~*?zb+Qdf))G5Xtij-Bg^sgfESU$Vlsl4 zzO-YLc$&7>?C@PIY%o6J7FqPsOSG-lZR(?8xuj5OC394T4f;X(mP~yZx(rkN3~1*v zvnjOQJOF#5#O=3Pu*fbdq^>tHS`hvy6gii<*v?@cuMK}Jc|UPHa(=V*%nclnMk0-1 zR_`IU40s*YgmRuCR#IN}Z^voNwQ|1~Cb$>tR-=+qP}n zHqSQCHt%`n&hvlg%}gfu!~M|dWOXN9l}ak<)LOq~;x(Q1^;IqDNx}aiab;*F_d@FF zdN#`F+@pGaFRzOWFM|$i*5uP?+X}4%`iuYyY#q;*8r zZ8Gtc>%+*{L)TA%DHyw2mlG+*oN<{OKTUh}hvY1zmw}W}gzAp7u|t&sc4Xx4heOSc zhw4EI!{>}2<1EC}xnu_jHp}r0;Q;F=WF-J^1ifVSCP=SiPFLmvLoC{1i}4m-iJUA% z^CDpoeVp0@+0b878B;Y+a&v8J=C-I76@Qa=lshyUMz82BsXlF2Wz2Odd75SjW0kwB zP3RZy#KDo8H0uGkkJ)YYY&DrI{2}sAR`G*5Gsp*{OKuPQ7FDGgLGC#>p;$HEE+_w4 zJ@zxHQ|g3?obQZQ?+nXDj-wg?7n_OS!lxs;|D#5BGRqLLw;;QUa~k1Md^+@ko*M0> zu2gm21iKlv!vyoJJwNvv%Q$vqun&aJ%$1P~!-&PT^$D)>4n-G6C z=rreK_&IyFJGtPc<}&FdX>k@HA6Ld)OpGo%R2PsrNkS)!#&7qy_8C?gf^U|Rq2MSx zc(IltS)3P&C|2*SP!{(%%cSVz_)P+2mw0q|p7}wg4zw&Jsc9IQaE{uI)FprDHW`g9 zsom@@!mxY*+G24GQE41zYTMZ zE}F!AsUw$vRe`t;3^yDV%$Ukc`90*{@p#FqH!(i4ybj8zKhiUyh~2nPLxjYlq{-ZU(7 z6IFN0-I7`N7@+EgJ;gpF$vB{wov>3zs9grT{7w;rh2SD~InyLdWB`}{VHX>g;4NX3 zyux(6%WH`X?+kaen(sHu>kSFBj-BNM4o*dF!|(IEym87P*_Q&vF!y*G4H2;V{XQ~( zM#2g&9xN4=Z$($&I7NgNZx)Pa&jZo-5NCziClC!qpL(}2)%boi&5j-aQEM=y>u6E? 
z2D200i7#QQEhJZs7P3=ilNPfeNhbDC=zgPs<*$_@G7DBS>p=K zy@<8yfziOxwX}kmah@fC6v$J|B0tFI(UNx|12U{Q@`q61c6Drp{wm4uz<IODt+NC()`<8|P0HYj^${xe9-} zW7|!fO_Q)@;TTOonYHhc{J6&><7Co%z|bCBh-{gaEh(89vA4&J60Ggw-a69q z)ye^R;CplH_dxbRfK?gZcbL*xXgU|V{%c+QuGn3Bq|U>4Nx+p@6>`kd>Yd) zCTgzUwpJp3C-`QO46ITSMq*@M8Z5K;aN+X<1!F&q{dxt6Y9H=3D2|{O_I2V_o}A7y zC>oq{HSC90GKa@(C{oY{&sKH<*Wi5ziM-W2*K730uPW?a$O??xpMJ<#t5pE!N-K^E zH0p};G%R;*%+A!B@$l>w0^ksSBScuE}cvHlyHsNhh3LUmGG$Aha0{9io z&6e;P=g2IP8)9S%>(+HmXB8}9cGk?9kf?~=AR8tL*5%r3o4T|I5}<{!aN%>5h*Pd> z=_WCkjr&uh7SR>J8-McK7Cm1VGt^^yobn2l;MUoowG}oN3gNBb34p0o^|Jb$s2Zwe z0#~l{Du%q6V1<0^IHYFU;2GLm&dM6Hcky;wL&jV%A%IeF7{!+Pt~bc(){7wT52xtlw?;TU|3#W zJhurWr!KV56E`OX=KKz>2W#A{Sd|5Li*{Xescrt7D&mW*tAZ3M_EheEO^eH`L^lQ= zFxzD;36&BD41up5`4Q|Rxce!1)~4crWW$U*@ha?-09O$&L(A_Bwn{wq1Bq|=gX1)FY_(8rz));hE+M0K8=CAZiLl7z_e&IenTWSN z0P+uer`S8R`}sx`Tj(%)IW~LPgGj+juLTDS-*qhe1vuF zLYLw5H~J*BC}^Oa&!nPT5a214!r#4#HXPDP95fhFO-ZQwb*YM-VC<9*r|hOdooi*+ zXl22HzQNQQfMl%5ni5fbNBi9kywDG1T69I=cRb~ujI}gmkQQW?$=zlW(o|zyCZDJz@_DxE zToffSLpU;im7>bpFK*13DF^LG{vqyztQGUdyn6whT4D zb1;PB*bEABoZ%f#$z*D`#MghT@RO~i>UVYMw*`6G&q=uB zqy+F5?-Q@W7R$0_GBOv`WG>w#Asjp!cEslrEFaB5CV$69$QnF-8bERu>>}CgJeSMomF)ZBYNr8UHDmF$MvQaKZYo)l!vR+- zoPuA!^e0dd536XKu$b>n&&v}s8$@gw0AwY!n^3`SQ!cCa8exyt<5W0e(L=lhnG>59 zd+^_g+J#xUC|ny?LNaEF&sQG)2$a)<41`GTKqlIr+zi5_KB6q>7xV=Hm!yVESEJn!3?gtLt6AKI^PqgWiZTP zoj|JC0<5b`^5}QorA742`XS0H#~+OmfTQg0`rECOsrdyara{= zgJE1JS&7(_DH5BT1Ok#@2LFI9E|WUryN}uLB?*-!;a71z#@grK_Jay@q|qkGZF+mk z?sB)=OVmaV>QSkbmCAYeB(B7b9#GGccrsgsC4}f|D3Cs7@VN)(tKjF|hl*S|i}35b zlU#w9eQ5(ImA6UA@19|yf|WSh66RWDHRKuKiLC@4c5H|OlC3FXtit51tkee5$} zhax>$S3=ICQ(Vr|arX_5Pv;x|JPmthcVU+P(PvZBgtw614RwZxS8u7XS{e2Yd)0sK ziK!9t6>MB!w?QF!=-Nk-gB1B}zN#wqgLw5GAdq)wf(fZRQ_3t^73A2w&*o`!zh|}R z;jw1X3^)|B77>gf*q%v&XH~M-A*p7VMPH%jMLlw0>0`yQwZp`o8BRZC3-A$&6H>4o z@;GaVP1q;A1`eH;8P<7XvLnH-ZoFK|qQsf&OoFdF;g}9<`tXoeV&zr_%_*^pi$ncr z@~I;$HphP_5DV?8Bub6PB6+BEN*<$f6ysaE_jjqGZouf)&Xt)?#&Eq8(5R6MTSdKx 
z@@&StV4nbW+lO1=vp^v07CjYRr4laR*J~B*pL6P5;tcAAjC_TVZjh$09ksSiD?lQQ zTPcw_t+-88E^#_+h@XX)<*=qoz5iR1tkUM?F8Xz`oO| zzb>B<$vQim1t~BIJ17d$vEel_bLLDTs1FUN)JZ{7;4M}A(?sW^L@Llaw>gP%GoHh= zz8DaB4uruIr3bBC^=s!V+}WA5!6A3zZB37 zSO~;cOVL@4(mRY7ljgBQ=@(i<_v$Zmr^XD#jv%8wmtJeMOJaT^k}{=?N2WY4)bYHr z^+LtG9BxLj$g|Oijj>z8X`^{WuAU$pyR0${wj-(@X`hsb>`%Tz0~02C8)83%yfv+# zfzlC8v1@2j7Hx<+E{+3$3Yqa)V`FI)zPL`tjh0LO%2Y>?H_zzTnw_OkE+wiN#xY`G z7=4FU=G2B%(#p6vL(RBE=F8!;4P2_ROmpDO$K;A8awg_GT&tVwwB?txLy))psi;e- zE&Poo7HViSc~{~Ruyr{kqc~oa_8i*UPNlr(4Mvf9H8|njaZRt4r0uA^tUr+P!jTL; zmvp->Wv@nD_Vw`EL)lVn!8k*BEZfo0It~TiaLCR&J)e}|nr!tRv zWAqn2KuD=rI206meu7_^>%Q&q(qk_WFC$qdd)2IPs*Hkdi z{Z8Y(i>tj3@61T$OxZLjdC6KaOLceC(W~aQl_RC1tIWwlJL z?(7=J>OP<%OJ*bMs2*a!Cr7#fQ24MCkiORKx+ms9 zY6RUKK|h8@i8HZ<;|wZoBo$qAi?%6CDIZ#$n7H`z&AONg@VV(&#sJUm|nZ|0SL_$Z98>kQ4) z^w^g`rcVm?%3+bx@m^F4^Tz{ivi-R`zFgwyV_A)7rrOJ$oW4t4Br~pD@1gh^5ZXps zw@ODKz(axWN1UO`-@_vXBcGS0!MO)!JVp`zX;Uq3wa#6|^&W(#$El^_bXRZ^tvCtv zUAH$FTiaRb7IY!#N`4o$p5ti!itj^NJJP~VNfmnS{sJtMNZG#UMR|H*lnWL!&0e>L z%d41NJ({hM4bJaexGsV{B%eGByp3?X*dNxXc13@72BAv1G~k!6$DWUx zByQ`Px$aCf>k^nK6RVP*2T;f%aZhK4^=iaA^FXtnN@EE+y1cerB6v4_1#fWs zBs;_~?Q<^V4!RVK@GzCrIR@32PdSi194N8Z-!70Iz7s^mwldZZQjhvRe&G963kN+;SY-Lw>eAB_Y!afWvN;ie10+9~^& ze%F&$3`+0Z$!x4SWMH6}7Qi!3gcg|G4R=Jj11QwR*^_!HUrSDA8s#HigXgJWe65~G zYUkdJjK9Z*1Y2pJ5MR(sm7clAJM(Z_Qtu2G3}fMW@4XXfw6weI z%GRO!$qZV@{6QeBx@jRd_uC`YoKX>I`U=3KH82aWjSi#{zaJclK7Y4$(ix|P$=Fl2 z5p8%VZRF8^$#1E@-@pD#Fb)iIm_By1ls|IBq5Kc4n*+zxbO zC2F9A8sV2+n*J0%N8lohz4yu)rSQkR)N$$z(KTk46{(N;?$MZXmbMjamu z%iJmU1BaG#bF)N3>{E5jD@VvwRn4(?CfV7<+cURf*!A7xhf&$st|)*Q9bw@+g%VjZ z&z7-nu$c}wywYfqcY7`*?H41r;6K^%=U5T~7#k_l25o06TnV5nWVuWS>ni0*EQ z325+8Y`IKt(0iZ7)|mv!2NScl>;h~LF$KJ00YkYkS+6@P0O-76PU z$3AIv9Tr-d!$zinS!dgiqMVg@%=%gx9(c?(>XdL?5eXvmS68s-UHwgf|97rcJ-$NI zoUfQVUNrP`3t8VC<+Bn>&f14qG1Ne%^skBJ*14P?6R7NICQDuBU06 zb`iRb-}S>nL>t46%FB10d|{)F8hsbnTC%8%HN_*>Esa*4yU>YjTiIXgSWs09bhgbf za8gjgQLK}p)kSpORJ1YP*`rWqKDBywCT~cZsp1#Q?0(wy$?53d>Ds5~d1Rfv1(WkI 
zQsyBrEk#7&RALHBeeCgiOMKA8ZVU8~d!VYbaCRp>dDVL%?FsWr6-piW_-saqPub`& ze6I?u#c;b>df$KtaE1u0N(!T|$h5XS4P5dt7C(tY|MIMFvW_T4tOesz;*Me%m+I5I zQi%R)$XdGPPQJs$6Q+Bv15TOPl**XRF-!pU)}}l{h0B5aZ4+2$`s<{iWjWHnWHUJT z5GZxqULOXvOiH;4Z=E$;>PR0ChC2+R;eDNsiyLEH@k0)Xzqn)-t(a-MK6SGKjc}83 zMJRA?Eq;c=xXp^$+r0fy7T9m%sfRTA_t9E1=Mo+3)&p0Sctd>jS#bSx1T%v(RKUAk zSeqd2O*}uU)-F=Q4edy2f1^f+Roexk4NT2I!41lASrt3XmPAFywimveb7@tZ==0#A z+xHTfdI*u^D~9ylYi;x>Z5CaGh_Q88g;d#2%-WiV2R<<-V}%NZ8IrhO8nu-chrlL7 zkcK4#t76tbWW+Aj$qH_EMzyYF-*y_&WG{wVSj_$(D#C}%uz3}#Fg2^_hJf0p6k~1P zT}}IA^z;YR?WgxSNwcPf!hW{-xKhfj163a+sZfa=TbYK7Hx8$J5D@ACF-y8Us8gi7 z>b7F)!LZN|N>nG28!?REnA|RmI@RF2kPljAfiTlX5PYoRFOZQ+l_BnCcy<)gO{ z{#D%Gh{avN%*Lur`!mL!5N{XqqSq}R9e0?7dB{AgN?4+|9Rp?^3Q1oQO5#O%jl0@I zwdgLJI0wX`VU*GeP;p`nXs**v@*6TzVqz!mBSJ62_CGRc=j@F|T|osAh&eF7qc5~N zsOIF`zC!ziZMuQ*^PqRo))TWOm-q?t(5yDWIfj(skAO!z&4el`H>amYN$+V^IY=hZ zM?BKi=>2MJHsq&J9BZ*(6&sP~EzLOq7R$VY{M1@NOyVF3YlcY~2NA_!M2R`%b-bCW zaB8Z5&-IUuE`20q{LJgOBAi*>3%xdQOl@~zuVD=3g39&GHhwIHhKcxJOc~wwQz!+r zwK0)Dg9|<+w7Wr4mBk0o~_Z(!s|DyB=CUbZXM<| z9FjI2qb#(Wuwhq6kSy>Pjt9#s`I~%2^!7F!luj^@^Q;MfWN0~=Go+e|(UUDNU>N7- z&l^I&(kNi~a%|JajwI6Ph#47`oc;KM?Wa}Ne7lQr`{-Ph2ne3#RbJbuCik7(ttr9G zVn!YAY!lgS^El3*^EZ$?A8%^sZijm|p_6BpL*SM-EzxQ*^%#?Du*Kf`V+i!8OtSq>okuFdM}!?}XgCmWK6zpDs9QIRG| z@ZQ~K`2Acv57V^jw6ffZSiUMz1=8YGLk7;bN=Qog4Q;=~8eq;>&MHH+NUJj`MlK{8w}{n@ zvsO|bVbUqHA{XmJ$i1C(_F!|wjWj$(qt|irn=CE)^KFeEWk2Z4agKkiWQw zUk2Le8BXS}l9Ff6&}@INT`*cAL+;L#)>zMAGWSf{Tf`l#J4*6_o1y6gqhCj;ue{aG zE0Iy#KbM-&`QEgTG39IO%oVAhH~4bcIPS*R{LOs^@0a8Bof{&;7@dOV!jw)c`tdhJb-AHJ@ffu8r=swm#P zB5gQw&PvRSZsLfWbM|wf1Z<+HFOvJka%sS2l!FmCI+=_{MzWrr8Kv6&Ftgo|f(YPw^VbW=sJto9Us{uCC?dW1J_@rF;D*|;W=fnM=gJo zYnTQr5?Y9e$qAj*QhPs`X^G>4+tIE&=lBbLWj2jPn-q70g5ATyF#OFHQ!DWXT;*}j z?Vo>2$OEtJyad+6%C$K=Eolm8`KbiS53!Kn3rh+@Q*OVx>gzcN)%zWd_V)oH9 zbS;6|oal5$q{L38?%l~zrnxTNB986hqcSQLMootz>Zjz>xO3lR;cmJ5@@;?d&@=J& z7Fep}*tpgn@&@aUlJ`1KWiumI1?>A+uY^(@-^-*)i&^vRtg^ z8o5t-jrB=1>a$aE^R6V|1OC+S}*+-Q4=ZVpb*5Y7EYb 
z&n~0i!qWp%3r=4HctOPjk?zVVzoivMYl_w{Z^D1!n3hL(4&;GOsOl%396iqZr>HsE zhVX!use3jhHK|9f_VSsS&=o4N$}o$bHIUh+OSu6>Ot5bzV42h(t6ft$mj;^JujU5h zAug4vA$wkh*}~>^;x{CRlcT1g^nXen?$nN8C=P{AmeH^Y#V_Q6NgZFzW^fvmidyIY z+O#=w^D<}&S{#vvx2XdJB{QFDlsE?_^hUkPcDxBXr(MU`4ep}zp1u+XFKSU(lW=G= zVQM(48%X)s6MM*&xo~)nxZ(}Z^tFUC-Zgh5N9AI7DK5CJY%qWrLyuhS45nZW36=QR zmcH3LRVfmD)B^ppZiSWb`@jI7tQ1Bb4U>B-e*>uU@MyJm$AaILReLW--FWR*;&;h! zaTt|NrN{3_!RAgQka0g0zw!)utL^p$3vSkJQAzAP?k|(IPP=pZ=rdo7iG{ZoN*Azp zC$37;+qvx=>B+8BVE4N^A zz=6a?eQVn#@90moUe>)-Fa#t9g5}ESKRk^X5M5{BGX8{v;VuT%y;i0{`fy8elbz0_ zoE0#^oIl^GNXSeH2%a?DBP&SWVmYO}+J&e=ME~@!cx@wp%waOP3Smn z26%>&)JQx(?Sg|tC#fb!I`x1*au=u%L60}>TW*OoJ2ZyBxIo%R z={L3wpXEJvU_i0z2WW5uR{gBCi zhdQn~$uEd)^5#=XlA@cFaO`TZ7iYrv3NW_mQBdDlsJHrdM=rI(-!y6#37g8@3RRd0 z_t^AXPoq6w3`xFIUK)&^m#s?Im9TDWxJ-xZs1pf(Y-!G)1$*q0O3d3QM5`E2I*A*h zaA1yMYcd!hTS2S&VH#CSB0*}xl{_ah`o~eE+FAKz`>L!Ha+WSKln`{+&Bt^Icq5d@ zcpg!WIIbJ+oayCQ9W-7N%ieP2{Q2CXt4sqj7OxU7JG)C^xz6G0m1H@%fKCuQfOw@3 zOowq2uZ~h0(}zFUYJ_KRg{}6$lb_$S!(S=au^TNA_)2U;R;8QuNqUnnyqx`ZYUsE*a z8YiLdm+n?CG!x&q|5}UfJ#n(nwi}a0Cb}z|nA16$c`EXi2j}psn{AP9XE}JJq znlsI^%YG{zzC6Pkd}NJauu!rr8V+~!JXT0RTz}UksGf$7Yry-6q>w82*8`$*9t>c8 zIXLTlE~4%t+rmG^6x}CJX*-eu)&%V%Ox|uWsUus@T+}e8$q|3>j)%v&RdsHFr~@di zE@zQ|e+3CSkv5qP<7g3kabxGfkDoQCL@;KnRXEbbLw{nEvgK zZMX|4sR+jl`6oA0u`bY8Zb~4Vc3-WKh8x^XtfV+gUJwAPF73&8vyM ze@Xhcm+<;V^PyVqr|uC#aRgz&yUel#qk5D`G9+kGavZ7-gZQyei!p`@gp}w4K*7&K z!3RJz&1n>u(dcatvz9z6rp;$E+w9zGfylR+^ct8RHYY^g79j&<4Yy44r04Ze1& zMA}zI*Nh|Kd`U6DP?71?fuyNQ^~v)9?O#o1K(x*E4d~^!(8SM&O{|t9uD2JpjWEJQ zqFIg8gaN8mlCX8jv^!Vn1k7E~@#&7w_&Uf8`_>*mco`pF9&Sl(w7;}ay7<7PkE%+{ zHGn~Illy@S_GGsBY_8DQ-k66sEFX;B;U!hx#T$U(Px_urXj?;xebKXZOZ_Quv`9WS z!7l>VMOq2cu1lzg8zl>ogSE7&_d(fXqhpdds1e>^G<~_*U$yXH#g~MrrGC+ec}V_6njt5|7BEDN0NyXs!rd3lQ`1_^vG2|$UW z>1|c+&mjE52s2JhZMD#;H`q^_%*w}wF=<+6+BuV|P`EDWoI8nX5$^1neOjyW2U+a6 zUb;+}s8Mk_IjK`ivh3O>3BiR5J4k-C<%;j`QpGRUjg~K@g@CRw6Yn?Sp?6MYw;Cl- zaF2cr0p!rG(A3-_YG!BM4(w85n%M?&nM}3}45qzb8P$bukhOVJ(cFLg2W=EK_mWK| 
zl_jV0pdp@7!IW`#H9T{FD#a|Vz#MHJR{V0dLMggK(FZL3Tc1@?pkr2c6*o+chN!{M z`3E4Qb_C#MN1fyH7v`@>2~P_Ol0C{4npp4I&&T-+OG1wQJJ^kJ6B~Eq=js+$3fr#e z#KJ_F)r&I7X(_I9doT%bR5lS(ZAwQ=2N^3ck5_v{6)?-6f=J4oVHx2Txm?Oz(#6Ra}KyH41J}GZm40&u3kY_5~F7>A||Nw2>q!IM;3p z%E-=SbtR`Ytju2!{b}O+Zd(-fEdhKefKVg)Z5L1u@~ErTo|q^b=f>2y{Y9@z_wQ&O z#86`MM5TTC*K(Mz1Er3}1F|cmh4_Ag$)u-S3*V%9U$g%iiJ4D4cY8Z?YnS|l4qIjKswyvy_&*2wsA{-Opz*M-T+mg-?p}XWgIRQfVv1k|7MHc^;Mv=y5({t$wUV4tBzgd#BNi( z8kRvH@T;cZxIk4>zh@oB?sJcUdn~gM)tAx2yUTX2oX|arkgKq(BHe?#=Ua%DtoZI> zHELqw1-G$>4+@u>jdBmSOyS_nLYLi9iarJH=ItJXkEg2jrKiafJ%f|p9D>F}Xv;;? zoN>0BK$g?F`D1PM%6r3wQdSxsA)_7f?dl%R3 zlYr9H;GNr*pvLp9_YV%upJ>az!_i$uyvc60QOP(mrPyU4g-Pw3c8BzHSZiNDGfr9k zgh$#1z~B#B^-#9H$Wc)x`_xbf8Ft^elIe6#>O0a?a z+3pgag)L>o`Pox{4|wUGhxDv7rHw`0551+;Fg)l6aWy-}BPlK_1jo&#^@s)_vOi!*x$+_7*T9?>jY+3NEEJu)Es`=bs)@GECa zu2)*L?UdGLWk?AR7;1LM`=q>n)UJITwsmp(=QV1&2KZpd_Xju?!Pf8?UW&C<0{0kT zt0pLNdUj;3w|u#+74-2X8v5ZNxq*4bLe?1xC#5`%Nfy_EtG*f4VwcLUoA{n4_BZJx zA8A^W=dTq=^tM-`HqD@>Vl$3pJweAv@q5^@(-63MBPSA|lVEU`M|sd*NS9{HGwBj| zkng{U%RGW!3h6hw8=>{edYPZAA$6x~uHg9+!?N+7MgtP$3sO5zJ35ef-6P@B`UYt7 z{omZs-Oqn5D$^_S+UmHZIdyU+RP)_*6zqa!rRh^alpQ)}YJthqWmov&>g-FyCapk* z*V-Gq3F7r!F&Mf+02m!0G}&Q@@copdo@7ukMTi9oP&y}CliW|#ewONo-0mQ9d`iAYD(i_t<5zMR@Sdd zHoSpA1=FGB=0F?2J0~G-!t5wLF&fC}qhc5f=~Xl!PdF-)9{+D z#1tE+ozdF2kuQHS+9#36Jb_9VSzzF%FF*&!oUG6^CZGQi8{0=w6@5IS!|+w?Y|wwastoSqOs1{C6MaTO6+U z61T&;py99vmYwQde44GVf*_Ss!AI>(A7&5W)6aA_=O)7N*{&5lT3+X9dTsH_D(`lE zsi?0w_{?IMa)$DGopE_nN7Ubd6G+}et(?*pH*T_6_W4-`r zv9Ma=%fQXm(`cj_M65XcrfTSnW%MPHSSCjw{Sb-Vxe$GBoS>0>( z)9YwP`$JJv0U-j@n(Y0{t#!WN5j|!1Xy#2id+{SK=i>Z{x4xYOwklboYVKXD?QP<) z>0k}b9r$2hj~%ey=+eUmOnf94o0auN2cz>%szqJ$PZx}eLBbWspDo>CS&k!FZ%1u> z&wIl_&zRb7{Zka+S&HzeWSMs(YZg1DzW%DunKh^I+k1E{A}cth-)u`a_eH3);Ax{H z53%dmT7ASI60vFWY)`NRIG)8tq)&bKQ==+l>8L3H&z{_TDv)?QpO|DbrnAltOUhsF zjkz+wj;y=vzOJ;`m|Ayny(!H5bDh(5)WeWStnHVudb&>YuqRbEckohtzN&P!I5)y* z)cs<9kM(y|?AaPHZEZLXl2w2F=R4}QS2*=@h|&ex5k_DPp2a*ExjbCD+uKR7YW~zJ z*XdHsAY3?8g+Itw}mEc 
ztafWgEgnqpqA*R?*~wRE8c%b>F?~0b$L@h1*4ePPpn2#TM&Q~(hAsL}bn4zi1-vWF z5^@?u67S#MWyjmmeR)(a&TilI-N3#A!8^uW_ho<6kVrr49^pUx4*DR!;*R%&ME77u zkAUpJ_RzlJ?Xb+}{&#=xe*jzms9Q`-Oq~DGqW{J27}?nwng0|0$J+YmRaO>;e`EiK z{^>gOUohEU|Lgy*;tBtw;<5klDBeFBpwjy>Uqk-CSZe zl-j>NoBmxD#(x6;&hcMLjp-lt?mu)I69Lozt0Mk~Qu`m;-+!JHu(SM~(Eq8E_kWYr zu07mTmbO~y0t18flBq-@of8#6OI~)Xnh}s7;z5MU0Ywxt^Xo0CBvSN1DgqY~5iFI8 zCF-P50V^H?Z<<~)&_RPn0J^(NC9=6c*`fAX!;E|4F`BRXHdN~!C}RC9gd9QGd10{1udm^P z!+{Bv8WAit889foz@`wweL0Ql8wFI$;TIvx$)T1<^AGd@^iZS2_|eX<0*QG}^rZ>- zWdI-&5)#n9U2w{S1C0|HD6q=GM>_fygUJW;Zv>EGU?9ideu4sjTA|@i388`B-rpmI z6!i)*s++p~8RpR=egR?)_3UflwHn5uWFSEf}}5AV-CP0u?{%1~}!cBx-QLfD3v8|FWvW{#zbD`lCl2 zZ$UonftlUSD_~|(T*j^=jrzJ31OWIwnti~LPl5rHnhFSfkSoxRKMm?EXLoT62%d{g zc*;QdP0$TK3LH*Q3;-`gv>!|lunT5w@iS6f9eodc zC^}pmKxC9?KmbBS29``l6Zk!01R?sJLHug3Mau#NocxAMQy}#NwSM|y0PyyMg9EtF z-|@%@GBOzO=?A?HMgtj&$Vc$A#{`~>|HfbOk$v(*c=r=6zO%LSr5^G+C;g{Svw)(w z>nG1OP_M@kL zAtI>bGj;OzFf#`}of`?3eb}9Fjvgf+BG|#>(`r+nZ7_at8zL6G{roFWd|I0BYbj zS48EJlYZqnfj?e$Bz5#GmVeAVJ@<;)rp-9+Y}f9pt|&4;5%@Yynm?7K!VZhiiX^Hv zqi1?P4R>kAPKl>=ia8Rj(?HMLzAQptp2N&UE`eFo8wAa|u z&xXg%m?yHRF1(foaia~<+sH$O!~G5uywSu-3i6$v*qZL9?B`hv(R-%4jx0HSu_|0; zu3ko3B-Fz1-dQU0wz0JFju;<~gCR=(fr^HyL>*UN9FY-AkrKHYb=2(ufC7$nkfwO} zx229+VrXq904~Brn1^IsPV!?p$iYq5krnuL4LT_Mm#xrEQ(z#q^mGk=sK$g_6=>1U z4iq@VF09wu-mj|;Iho#TS78pr4L!o+^RT>~1O~WTkmf;Oy}fYJzejVPLJ#A@a-n9c zz1Aci#|?J26H^P4aM%s2B`p!Q`8#7|k18k=fjv-?E5&Yx0VYuq(rCD|>7%bRuS%WDta zHsn=;!ft@Z-bvD>@^c_iL3MW)_s=;Vv)_j0Bf6A@;(7CZRyU`s;yV{>4YeF!X6{Te z1D;~qlcXk9&b~skqLzi;rMJKpG`7fYIHQisWxkn+F!s%yC)*_xo9*+ND~_<)^F5xH z@Mf<{oB(?RnK1;3S)ApLfg*l6i|(Ug_CelOYR(tRu+MG7Y-AOwm>OfKmgFINRBfLz9&YYM2@JsqeWmRsb;$3m|97h*oiRC@pnR=oVIpBz!%>uc;Z z)-9jO;az>&DyH28mt0s0FIftfv{qV4INcbj?OiTA?AGHq(m*GKM;cx?>GDMU6F2$6 z^%8Ov-A`Lt1)%QR%2M-SdVX5Yk zAwuK#hvH6FpWXFdPa_v(ZceIZuc8@5tJ+=u zl~Gb@Ai0?D%E}VrU_Px?VjfKu_N6f{k+`Dfq^Zdtphd-6lJ;3!=j}-j&!Fl2&p(RK zfzq@uWHDgZf~!trMqajFMNY+++7~J83$|qPIEOY@78-N8xX>q?rGEstY2IK|b&$zf 
z#*_Dj05GZX(8i7mGv$4NCFNVoEt&Xg8m#HK{e-gTn;|Dq7vAi1c7|x5yd|1s5q=($ z4l6Y00e*8xGlqVC;9A8{YWHQ+NoJ_eSp~-9z0K=39!&=s)V|2}LxPpTIdOSyJqo%b z@olk8@HrtZjLll@wc+NGyA*9G1-n3ZA*e2-ITQ40$P7_B9yN-E0kN%-DNwcezgwo& z0&KUj#&LfC+CnxZysCG?u`y-hRsIx=f+t`#efCt1puxDEDTLm)n8H&yB*=1x9rTLS zUZ+Gxec;VI)aC1mqBN62j8!f6r z6Wg7N>8r^azw#!kPxY^R_M~i!Y=rDV&s;v1J+&)4DBK(26svUaW}BMJMLHEdacek` zxR4s;@EQ>vBkl?J@_r1|r(e5EKsj-dprxdtt0~AzX3{@eTAa#rf@0?h5jj*|c_v;H z6_5E+ct@gnvSXh5j27vIsi@l-8So5i7A29zOq%+|W+k#flcLeTLFKqIeNyIySBBlw z4*Nstw2C zN0_bJM3SeVbv1t7H}z*|n)9etyTzO@flgYx#OeOw%u8@BvA~lH{X($@gHI|~Iwlgi zkwS6d|Do(1gKP=2b=|5}HdfiTZQHhO+qP}4vTfV8ZLG5ERQEpJyU&Sz?~R)?a(-Wq z962Iq=8uew_nqUhi!+6^jSuCNV+S4-F%EpGKdgc|5ObHTN~~-AoFt}S9+^QA&LR_U zt0Nc<-umeZX&upK?C2;t&*|oP7DH7M7a|dMc}?kzPvfo8+Y5iO)dYS)N|u5|oK+z1 zyZh$j3u>VtR^2(;na!WLNU&h1r{YC=o5r>P^oTUM6t`h^Ow^@tUlw0<5z}V;rjlTW zbA2WG_#Dl@8jtb4qSBYnt~?nQe%=B^*s0py5BrPl8OQ8&3Ihagf}7!TP2tLwRAY0g zi1G2X(c>h)t5$Xl(Ht(5#e=4UUGOmD);M2J#N`ltRW&OIUASZ2kdU0x(3}O_DVl^u zCfokFd5fU=wo+!JzM3*j*9_c!c|H&aV_XT}y}SKrCK<=s{VJKVMefAQG<9l0-b=uC zU;Fz{asknXe+v^Rn0sYnx5X_l7fmLvFE1=`H#FB=eUl4?x;T)ksv$;d_T*2KmWf|2 zI1ZgKrV;t@9~!;uY3DZ`-j7KsF$$!C#ouo-PvFe0R?gBNeuE1q1`(wm>7UG!MTb<6 zd7q&PDC1u)gN^+3D0c&b27sXAITNV&_ctCXp+|$Y_C-LrAA=UoV6MK$hhwSF8YG$9 zs}uNLz%(riRBV+u6meV>T18bumR;UVB2+WCec$sEr<9X^5=)C7Xe9}0S`Byk{U+0y zC^QFR?GAjA4c;wwc2j@hWaXRq@G$Q4d54tFQ55e<}j{U^aI!^Um^Dko1G7AL-4VAp{D_& zP;I`Z0)paVIPcy5?oSb?)&Vak3GLDm)hvAqLSy;f)r63D4ef3HW~pjbzd_asvTD`M zn0uyfqV^XjH2GGKFduS+xvUH0?nPJnbdkA%zpX3pm!Sq$xl*LFeVGNAL;Ah%(%e?! 
z{=!e#SkN1jjP}mFTW_zDOEYo+5B<`855HX~3H}Pw$eR}I((aRt8xpU0D!O=d729^> zqn>M~L6%6!2=J6)eiu-Dw!fF3ix23pxn38gdL~YxH5qaX-omdI!fQY5}(`A=m6d&ooZFGEO2S!uje8i!}GS%o4D+%{8hY|Sf8-mA98&1PPvo9 z#>iC|exH^GZ5p)9Ef#I2}48jFf5ULac6L~@JBxtMg(coyp^lu)f`m02O~)I(!Jv4h~t!17{~uHa!kM+^HMoQVmR4$W&)YB<_t|vkWX~BWrej`6q@vw)9g9P#~>ad z`jlzqq8wAO*DRJ@ZE^wbV5NZXTY-bxTl&yuG6Tp&}!>2>(r!$z2 zBbV)U5WTmxZsqXr99&qeKPyct>K!L9$sQ@qH}+T!d^|1BvToDoi&SgnyH{2kNamvO zm}lssa*G2Su2mHl_>06HVB*d5%;Zp~OJF796Vi(gjSBMbmbH4YHlg#8XxyP3Rck}M zOiBgnde{vbb;DI3;O$V#N$?oKuYYLo1dLQTjI1b@qu|^k5xdYpxOkqv)1EfAc^8=U z6qvsb9dw`5*eFO05^TC_shYVO*1=2xoU8k1)GK{pT{}1_e~a(CJ)8T z%}i>D^vvUmE=-dOH@S}WZs(TTf2sYP_XDXOx~18Xx}s-vK60N6v7yu^bWW-b(a{@N zI_s6m;-f|R3)kirF{QS?O;Zi8cfW^-?1c?Q&@373=9hqLhyvNBMaQ>Qchj{~>6is$ zaGAQt;50PW+^h6VzWt_xTc;rfx8A&t(6WaJ^F`EN?ZGt8Z0oCTFDduJUje(eccnBe=%SL!Q1!ldKzB0wMy<$@9N zhVh{t>FMd#ieVmXq_(q85CVClWH3;Aa4de8v;MU(KKGh;=to$zCcuycRB~Aku8q8b zN=8FtxIG4!<@&*u=Kusb=+YgBz;o$p@FHnXq{8jF*Y+sSyAeIVAopM6_5L?R;{?{I ztuU5%x4{<#L!HftGbk~SN2uMT4-^+>D_&-ILXP8UZS&N?d_Ydw^+zEm_9*f&7j+;P^t|O=xz`Bla4UaXky+zC1_09^@p{6f!0>qlbAGPL7bILDaE_Kl$N zV~3wn0z13*7i1xHQl2ow)kHxvuMko8v+i`ue4^DGR!okttxP&@T^pGO0mek>?E(1& zq|ePcilQP zd3iwEcZl-!lkSxHG1|1331>5za2b2QPY=5k4zT?^zXwZR@s3r{c-*t2+0!o0FdSY+ z*sMINQq>ox+^@%4#M$L$vM7*;5o8A)2PR}@zvlKVh)?q;{!A?{QR9#-!dRkydK$Hi zQdqWDBr*L-LbhPz;n0a_SmA;v2c}Bvsp9lko_tBxmb+Wy>T-!z=EbSf?<8a zJ;?g|%SEH+ewLs2O<6x)qj%7HMAdspF5>13@Brc7O5mXeyvKg}*IO=}Y``Kh;=T&& zz>$t*vK|GbM4Ah%Zj@ZAwjlKLI_b}Yxyg^38Ooh_JotfxS=_E0bF$YzEhUsN^)(Q7 zOR&2|w$mS!@+52hwQqlD&Mp7f$M2D^8}Z%d08``H+Yt@uIN_B}kj#q42O^r9u_;jP zF-KO+JC@C$XZID{ZEwQ9Rv6^5n8ID;B>xVReRUKl1=Ui*=)NPUigS@;HG$ zjpok@Sw)O=CgZPm^F!>HVp7wb()hqR7V*d%=w9mjAq0jC0h}3Sged%$D{CUx(qAaA ze$OGplw2yGhvH{dN~}|=x)G1s%EAJpw!%d-tVTXs5osuOpfIzYp*J$y(;y1drb`v< z23u8WS=*YDy&cudqed7Z7U=IFY{mG)5g?h6r6yPasH{YZA!iPRSt~x5t3eNGIq>IF zEen89^}I^45UT;U%Zj-S2WR}Y$#VeU(i1S?*)lzc<#UaH;4^xGn@U`kUb!PRKb4*A{rl?4=M zqAQc`D4T-^IER51C|iH<@3_V{cFJb!9X$sj?dXd&mQjWjyPw7Dgd)kie`A{LjEaV; 
zkXZ;##nEM38_jc#w@Ch0>zCd;QTk0g9AYE{jk=xJi<@lKrT^r!*fq+dQPPspo`hXq zce$u}1bIhU67x;V0id)4+Lbuibdwn=9~Yr@|1qQ-U3@blaJA`7GJUrT>7#bD+XlE9 z%9F5xSECtWj+p7&VUhNcb_Bcq2(^x}if|4AcS(kGPUeTD4g?&`j%M#&<#e<0Py*tb<)>gT4PkEnhs2=aa=1RHv zaR$E_hrch#&jq+$#l>+Hwo$RpwZZ(A0k6h?VeJvUAs@j6pXZH0Y2u83UE9e!Fe@3# z6W=zDCN+_AztrvJ?zd-E&c~`|E#$bW6nCmKcAx6~acl@!IM(xYBp?R{bXl9I1PIxr z;rtQ=y>WWD*}|i_I{%vi`BM_!p7 zBwEN4>E_1YvJ|Lj){J_jByuri!T+0(a?8V5@SQV10|hntY$G}gVS+F8O5{n8r9D9O z!u5kkDuD}Bo9O9T4Pfxyh=Bb3P!UM&V|yXinZ_ON!ELpPm4fH!=?S})x^7pD(hGtZ zP3cg6{f&T!^j}6^@c_YXb+aA)2@tt8G^P@#md@=WCY2@#x|? zg){P-+?mvJIAuU+j*c;@qltZw2hE>_D#3i;TwUisE(M<45Q4|{`qqFJpfazge{?%?-g(4ilI+^X6JaNKKMkE7eR#a&KqP?<) zL)ja*XZDuO=7;5bO2Y3!dHM#3qtSzKY=d$FM=nnAJi0#KGYL!vw%v((qrytBV-*EX zD=1BIL-Xtw>36(PToiKeg%6M(?h{320NC=uSF{5?7KyVR6?M-f<(j_o#>@_K;+b* z-rgKuo!@M5nRy+gGilI=xYrkz%&Jo6+w( zkQdr2;Dc|rkLp?9M2_CgJvl1aAGYxpz&DhKYTf-m#8^is2V;Hff621{RAB!h%hLZa zvi=?X7nzxbiGlUs!GGcD{zGd2Pfho~b?g66sa@ztYFASF9|&|m6uJLDV*8I6{BHss z{f~D1A8I?pKOkMq49x$VhxRl3ukD}j|NHY_`~EBXkNJPD|E%$URQj*TzjFRn>t8kg zHU2C9kMUo*|1m=TYyPkL|6|tweC&Tj|IG3|B=VW!v13e_-`KD4)@DHQNzdZwtc>kpE{d@Zda>&m350{6z zm6Nf<&uFFZWGrO-13WbTKTQN3%350H&op5joPCh4|5!@wTwC<|wzjf?TXwH)`T{lm zcHz-|cyt930hHPPD#3>&W~ zqh|#C5!ujx*4@#urAjSp|L^#LcuRn6?Mkdm_b+4ob7&?P@6>Sgj&96x&Q&0zO)J33 zN&pjB91|HF5)uHY-@l(dOL?KOv)lm;hObpj0Ng?Sq5`>RL1MHP`{pLr#{1_VKY6~} zfaTH^fYRF9*w4P~z`!>Ev#X?{uK@3ct5f(oo;foxHhw8#OI7(*>wF^zM9j=DEyah% z&kYO&_N`6__w}s__+`NE>K|D^$N{zkWN-PK`0Wq_Aw^yJ`6f>Sh{DP>G&;O0=AY|b zpX*;j19w7MRZ00j(cIA3JelyB_P%ccmd`r?2#e-@<_ zDkuQ65C9KMO-3KyK+LqFc)KkLuPrZKpuD)EV4{Anp6GnMl#aXaU|F$|b)4|K=a--l z_pdGcJqTEgjJ=cNi^zJv8f)uA-$~y((9FI_eRzFMDP{d=7C8CS0dJH*J{*g_WrP{Q zJT+^4C%@SYM9}5wVT6s>doXTYsSpsrWc}c{G5vCa{|VoN*xfw_c=>ciA~vk(u`qfg zt!(US1M+*HG1}>P$PL>1#)1P_(ghNNdAU)8guUqm`BnOr--h#>Gx6*l^Zwhm9>}NT zgV+AUHR)S6<9jBGXw%C2`xId*cj~q0Ec&X_s_U!X?W8L6;vIUFvr{(U^P6c2_$x;r zAen1+a(wFA*P=)}ddHN1b$dU1#+P}1T)2Pyw^^~#k%h%880*(%&D#xoLsLB%1$(R7 z=Qb69$MG}u-ALJIm1nN9V&?PJ-kr=CuTt;x+Ut!;NmPi#^kP%qh?<^$G^THdZeb0%( 
z2b{-B`bXflVa8Xq&vW$LrS2^^>+Vk60jq<9=T6~wSwea^Nbid%?iX1``Wd%pWKWaT z4ZFGryGA!{w=dSVvC4PA&DaO$rYDQbx7kiERn+&Lcdm;_=s)tBZ)O5}=o#-lX<~c@ zo1c}ithcnjQ0N#N0MW9)L3S8lTb@2&w7!?SXHW|(iWqNq_I5f>pIQKajyS&l=1X~2 z;{Hkd?E3Qiro~cp_SW?po;LAKyk^yw-pb0}dA`M^yvok~?(?Pc#+&_Z2HDbns5g#j zpp85KdaI5$6X@d`;hW>7%b1<9ZMyVYvGE(x3DQ~HI$+iu(pcl^!OzFae(xj>hv z#5#I*?@D|ed#0N<1cIXB9ovqQQB81RsH7hz5V-c`!9oD6yC=(wIpC-3EMJH5=zaEP zYg`iK94;3q5jQ3%Wsh_a^%W)VqWx%pkJ9wqNU7)rNHTRCYT6;yejH!h{O{h`JZ%pG z+Rg2$!XxP(gfXJo{@3iG+LV`5tA;@y8V_5gin`W^-GsgKU3kBp0_jPGmoTiNaXs4i zq*M{xzqBU|@{E2~($I*nF|NOJ?@!0Y3-uY-IBFauW#V3)ndm^x9|?O+ZYnU?=NCD1 z2&fNcGd$hI#84=jQ()1sqJjptMcEcI#yUp$DcxSNE-M8#EpoTVb8=Q>FFaz;GH&|& zuMDbN!mPqRAzkkV;zIj-*muwWQa4YR>d-b_lU-*7y|IFpR0OIv-v>ZsJJGC}+QPwokkD=% zo++5S{BA@cuv`do#GbUE`x7PBPBh)QUo9SFxVR&a$jP{E71B6DY(Wna3 zsc%NqiDI@Yq|KGPU`z$gUQgBLQg-i+tj5x})OjvXKI2YB!#?vdy8@~9DqPyVRBv4J?z z&C^b=39jf8?F)ise-~?v;h^3%4uZgY9uMcv!Zx@>wOHCvxh4H=d)S&U2t)1EqMHZk z0m1YLy20a9(=`yg4yz5$KxfFQN50F_XnoxNl>P}_7epXj79S;qL#ATqgDPs1sQO{a z(KkuM;AA^U{W5#Vnh9P?3VdOP+hLnQQOW2zjbL-{v7^TdS+CI{wpyIU<#uFUmkNS& zDg)eWyAKpvf-UuSMfftvUXQZT6DOrFjDcYzNQ&gT+EJIYV$YD%{Ldy{W=2=u&?3bZ zXA>JID(5P)c6!a(I}*bBB_hw0WRv8dzYD@oTa^xyB|zsPMZjk~hLeH^6R~}l2XHj^ zt9n0;$PJk{va>82BDbq4)?EON2dHc4s`L0dtFXKu&P)SnbRO?RSx)QRDN^RQFjZus{aGn?eEArQ5$po)beW0V|$4 zDmYZXOJvsyM~fuA)ia-)pFM1ed!!@;G|h9yu7mTLa}Y}H})t$>5aYBDIk$Es@v4(Ddyh8+5*@&v|SIDF~>prI6KEQmhh5B zLo6+rsT6uE@j1w*nu^am2{qSwioP@;=G+TLSoexPqQ8l*D;T6wq$;q(B2)olCTis( zc4doz%jNaG;VzSz?NOBP8eV3aN|JvC)h%=L^~ z$}I?u9?AY2Zhwm@C|oAhKaakt%_t6|4#zHWR~|>2)Zi1KBL7!x3{hcA;XWkzsoD4^ zPocFWP$>^Ea}(Pcg=d*8-$vK5z+z2e_)6XL=c|YrrmN&m;5^Vx3M)X;QBVwmrkK(P zyz=^_HJGb%?%h&{X~hKk3{o!M((Y(9+4QZqK&hP(m^+%#OeKkN8&n5(5$M_6mg|r% z$|@Jk8U*3f!JX=cE&(Ihsg1+Lq>J?$x%99!OV+7E(KQ-Cgm`XjE2yg;GScOZGbm9F zetz~Q34~@y57iiHp!3#)1{W@E8ht<@Js)S*`vSC=wD5-cjR4LCr~w;K6l)*7?&3Q; z9@uz5mVD9IEy5Sw0l{~6uNb^vMBE0`b!g;%q)T=*Xn=|hU5kpyW9RVZRSqd!`C~nS z6}D_8;NSZ~7|!o8c;OV6zI5ch@{yTZd)-TitSma$xpp$*yk}R4FxpJ|5fMaOSK~b7 
zaL;m`u29I3_sBu?(zb}82JL9Gg<>oj71;!R3rl#QA4Vn`HvD@x=px#}qD%&M`M`)E zpCV(c#DOLUF;$NU{5+li38yx2voA#DZ#HQ8UM!VRY$I#}jPQ+PAsp~S6({Ru^$l@; zjfmxRl-ZZ(cds393Qa5paBHYS+Yl1u94C-_B!j)3EVY&r#o{B7;TZ3Bgl3dpVa{8tM201iZ~m`;&z zM;Tgq|=inOmj=wgn=+ z3#X#cLJ_4zB-f4}QY*^u(r49UY6e(_NtB)wJn3#~lMC+(ZJC$>T_bHPUBA3sOiX}K zWQTXBYef(Xn{iQCYc9-{4^HX+~Bc&bXkzvwww1oBxgLKxiy{VkK0-l5~ti+Iv z!z*y5`4N$Kj&-M{kfLAfpBZ-!N%J%?GO1Dw1Zl~MNKVgNLPWM67Kg^^6Rw|`hfdeW z!*JZ&F66*$@A=qdiYZ}L9I^yeNRe@3 zOE!7ct|mBkh(wEUbj`-%*-u0+Apkbg-$;N8|IwM z21Z{oF@hau@OcOl9@Zd`G_Y(oH8S%7D5T&C8qUMZ(GOw3r@L9!gi8^e?RBTFE&-+) z;EaftgV;k@^GiFMDIj1$cP$C#z|EfQw1uireDt#J|Vx%6RjY9x1VW~H1YZY8t zsD01{nl4H*h|0jFq+Z^JUiZAN1^)I16Ibp+VoA5gNd0tb+c!ZKuZ3X5wP zeU2m-SCv)o{#J&zJ)$dCJ zE5<&&P*ZG`JVmn6y%TXz-;rt5t0)29kZA`$(*Bv8Ru>HV0PS5Z&iV72sg*gUC6LqF zhDy=ix`?JVQ!lc&^*UM!=4+5aMhe!qODE3Fqf57R4c;Tr9D05&(^UaoCl~$a(_d=N zmaJOx7*pOMLoIty=0p*kuYEwE9Hj^y6(%9uAaf#^U?5pzmfw$x`j6+`qfpzetQ>AnVL6)2Cj~JI)IH*<* zX08Gli0<7GWc^Mx2S?hLHX!EQqYwB`XcI#)Mjl;%Mn|~W1rk9fb=om{FhLjl_4pbw%;pm+apyv!>0}Gvt_wW; z5@auh3|vH*mes`2P+@(srwgdI{L*3j42_rvMrcfkM&9Wv#yIs+u;e+?U5_vsek5$M7rQgp~5yxw@N*fch*S2;i(Y?>0SGcp7z z^#(k^V>*#(KKdh!zbV>KJ>Ix|DEQ3b5(QeD_+;gRckGwe25B8h-jjg5nkl)&S_MoK zure8p&e>7gJ89dw&+P0#3iP(7(9K*f?w#8c)B+%8E83plF+qbn41qE61jqaA1dXq{ zjzh4ue86qu1s1iLsvBDBfTzzbf}`~-lGdXmT%8|CU#M}3N^TxMZ#l2~MZXNW6um@N z7+CaBSvoM?(1cp;FxiEwQ^z71W4@h)5w%thiWdLXDw+4=#pcca)zL>lxvd2;`p!)_ zO#talod-$H#8CunOR)A`_JL zK66W;l~Tf74J52TM!QemiS%N;0@|XrZgpYu)nk40_?Ld;o_YZKN&i+1BrIfEx{>@f zF{i26j*rZVM`Lb6>>=0Q3d3~Glvk#bGD-rdH&8UpmvfH9h#aVYeMcr#>uT4=y6k$s z!GSmtT-BHnTEH=NR*<=6x~5O5b-2cXC#)z!=uZoRq>e_5d_LW2&2O$5Fq`n|LE_Rn zB0k6v%+0~r2cwD`x)eOSO~~rfR^agb%|2`!m0~cOL{A2P9J^wU?x`-cIHLl>_J$yp z_=(<$*jq=UXS|um4Q;_rTs80D-<`xRjceCT`Kn5Pi-zZoUB(?^%RYDLMjIxioI2!@ zLEdXX-RbX@xcXbxLxwC@BZ0*on4i6Y+)*uy*l7%U6PkL-9w!N6jc{$XKGX{Ay_15= zxOqmiS_LevM+`LT+J!6f>H^A)+3QHtUJ&1Md5G_@8@QhB_A|iWgF%msy~yGhG~^E1 zyiA1hff@JlGjv+tAeUFS(&S`TVI$m!j1j0|BwQ%PQ}0;tZ4Zw~-4!yTxk<#VGg@KW 
zUKyG1qa8(2a?&J?E-fpmPss*2ex(gi?=h(?6lexv3tZcO$kU}=D-l>fy?T$B2%jHoVu}dm(hATxz$@XStMpKO zt7v^ky?_<&NRlJDoK$Xh1&HRmleL1@-*DZ39{`9;?@i9QX6n**DDAGKWbWawS~+BN z1_gg?$2f1rgnkVb0e!}HD>qN;cg7#NT4kb`0gIp{w|<8)=?o$5*+A#sV~%Xy0{xr# zHf0y%c~Vf3Ul7V2(*-*eFcc<=`{S-NB-ZixRe$$uRr4gl`xg`~rF2Omf|EPh>(bc{ zwUx+Y-(cIUCUD=vy-HLRAre|NR{nS#T!~Apx>{xLM6T9|ei@oGSWx%-T||nF|0yP? zqnGYtyD;>eRT0nyQgj3`p!{I_`ya0-@r}}tS+QZcKQ@`O^;Ugn=V;kb{%NjBdmbKW zHJK)PnlfMA9xo&MDdMgUWt~(u8FjJYfmSIG{-2c|VS22r*F5~;ua z@y?Brj}$@f087so5&|C*Lbd@OSiLKVnIq#%(XUFE`e_ov&*=eJCB&Vs#SUUmyhHFZpLrZ zG*s(TKrRqP*3IPiAp z+HUn{-0HMLoOlN00=G$Er<1P(?B&>ntPh$)5{$7xxL8l4Ns``kyKp)|=Y4+9y*ZEV2?tHl&EU;~$5nSxNUEyczu*PlB#S?eVp3e~P^IoCp0(u+9U@n{;)FU!Xa%-fl!hV#2H( za&On+u=4Dkil+TEyG>}3M!_EipPVgWL~g)^J*lfRW|8noOSeyB1U> zWrwU#WXf#8*OOvwm&8D`Uw55#;v*B`l%Qo|{D{+&usmzBoOD-*)0AS%1PC4^vl9i$ zcc%a#zi(BPdEIf1d5I&XuT$ba8GPj~OjP&T`Q}exqN;-GLerb^coo=NPwh+m{(2#& zwMt4kan@0=nj$y5>Q$>W7KmZ+PB6CZ_0?mGaUR!3s!Y}Sl+hH-=g z9%cJ95@(@N50LIEZ>YziQHQ@mj`{1sCoumWm8VEhjO62DE8-qYgf|ByRyf{T5njN& zzFRG|HRe)ZC`eiKUqbUXuU=+s)r_9d=;1+Wx(1n=S#d#94k;nx8dd{M9VGL}iDL8{#m}goF@1 zLDL(oek<}5T-Z5N#F@Ignkh8e&U(i{+axVfy#Dr6Q0j;v9C|dwg&9mQE+KjDMquST zD{H^jK@HVO8#D^Ou0l=XxE5I1vL9h?r^nE%^+aK^kp^meycK+7ps)TVw&7If^)L0) z++WPQ_655bu#(0>^mHJyT0Y?B`19txi=51kb5{Y9T9os#d?(0zHLNM5qt>) zZ4;&tNZtFIFn84Lh}>M+uUR!vQ}ORlHTA~Jn%>>0Ws}~RU2u~4{P%!hE2=?pyQOJe z*Q_LreY}9dzeedSJqi2)!pbWWC8yir$n1ipUAJkI*=81ZDhycV_R1Gr1qailw)kdw9WrHo6zgco;a|QAM_=rJ0JiH5eQoy( zv<@9$3&b(((JLBs6EP>J<(C2r*uRH=cSSI<-Xd*auzZC8jT`{9{vmq9&2E*&ip%pA<1{7|SW}EUirep#`H%R2nWk!u(p>T(qr`x?%imZABk|$1xBOlN=~4$&2c<6N8!9oJ8p-^m`#QJ`l!w#QL>} zb&lP|x#Q0twX(O37=6{>hGR@H5Sk{cLA>VVmN{&Ykq!Cq$9T^Fq{GlY zzF1N6uP*}tZW%{t2IzAHZspRgRXmqWHK#H4%WacuIZcy%Bs{PE!=AjG!ki7wyRu}p zTpTJSHWYm|&+BWtlbveo`Bsmmy=Xe6o5!%CgwHOu^AS#@1!@1{m>nbZx5$%IeL7C#S6 zj%E3hELMd8+{o#D^TnfSE_cuZX``e=LX^RjWWygrhLL9KH~eS*j)h9*Jj|r3jAr-C zht*iZgz|XrY1KLh5#1bjR&}44yjH53aJR$+(J{YP0!f=wJA`5|4RbV)I*s0p4MLH+ zv3kE7N^O@UhWYNSr~>y9D4f}+j`sl<0LtghCIiNzHAe$(Aq=fvqSoTMNbp5mCr8-4 
zNdX?w2C*1AohwMT3*{}S*<$iIh2=XI5^%K8I{R5ZeSL)nfBOtbD- zaQ5~FifOcuPm`@Z-L0;jyi3>I7j+|?mJs^|?v*W$7eU-DbPjXPl)&xHzEkNJ;I=a2 zbf$j#YGLJVqe}~6vB*@vZzI{swYK-w51eJVWOm5$JJ^T7K2^FGl^$E+T_LhCl+3Kd3VH^uI&?@V+{0 zxs9l06^WgzDG?H_@|G=%zdy+>ROW2Xjy!`M0Xl-P2dX2ST;h}q@g6PegFkfc92+it zFh{MRWZ+p|H!Hf$2@ulc=Lb8o&6^35u0&QQi|I>5kA>cd2p5Cy;LwcX19eV${UIu= zR%Kj6ZHE`)^-k_Fg~X0Agq*gm{X=-mKe`ihkY+|?PnzR8y8|=Ynnb7&+G!XgHyT}T4sMxRx*bbYm%kJ{~5k9 z3kKa}!OMK50S;}-Kg|eRIY$%Gx`AZ>Nt-*g{PEgG+z`2i--sv_{oC2S6#+|d9l3Zb zjka1|1Dn(JXyj4QywzS`6Pl&Ytiwqx-E)QRHdp^vR}%Q+S_-Ba$6Q7x?6M}C)wWXK zK+?YUMVw0eu;@s|>|cVd)L+2ab+I62?{Mi)-t| zpdLuITUFuNRoo%LM`5=jj zyRZWL2Xz)s#S!Uah^#9&V%-Kg7mglqgBmctGjVA#pEP9=zC7fNzZ$??uEQpT#!lWY zKKdFFVOrnPaY_EuCupWA&H-vP)kD`?oU~$yM-)dGwTa^b^O7C~L`7^E6d{1di8oPg ztxlzTdiUD3T#ifbKs3D28qLm?P?{#FtKO%LBlO&2M15u5-(!4-unQ2P8P5B=J#4kM zFSUl9hCI$HzJn(a*f9c*FMtc56{1MEJm3D5bTd`iWY(<6#53!{a$`-E5>OuLlzVy! z8W^0TXbrSKmXLX~E@Mw)vt-x%#@JlCdeeQbdlJ$6Zx*a3Gw*aJyY(t4wFz5I*v&X$*KF_;^F;r^Q;=b70~T9j zb-tu5Jp2*HSt#YcWw^b|U+m|gbk_*W_CSZ1xS?n4vV6Wq@O(P$6t+loR;M&bL*jXv zh)4H_2lP@JM(|E#6}DH)<7bL!E0%i0r-Z-1vm_{bADs{c%@e>5<0sKiLSIbrU_mvw z#D@~!q_(1eJMgMFbEb=5dF>`l{w)$d6@nx=cJz`nHh`LXEe`eJOGDuH4lMlRnD)Ro z&|{o4SKfaYm2QQ?|2>I;d^bOv;>gk7Hy#Tu4qCty zLAkWmR`7_&+eoG{Im)zOI0aPRyhpE7i?U|xx@A%ECfn7#wcV3aW~u&~9KN7Mnht!M zv*Fe)V+lL7^7OGyMrgs9`Q1U<$4M~90M(Q~LY>~g<{9xL5f<8mW6KF+FDe;OC+f5T z{CbU1dC6w4UJ`~{>f9vI_17QZW@lJ)zBJij(I~nAVQayy5-h)@6W(S3<_^}`5Wcb{aSP~zrpODdH0H_$JGmOQ(;J*_1 z8k2HvR@`BK5CDCAnY6|M1@xNY7uA@57DSVXKe8pwJ_Aj*kKk=Tpxhm};sXSI+_BU% zO0&FQV>mA5BGr%Ao6#!PQYIqH1s;+8K;53{%>1*Qg6QT7o|*tF)BNU zE0%_dBDcs`qLOsZOTj^fyn8MBFg}b|BNJI|0+VL&47gAHB18U?0Hs}arY+JiKAn1g z9L|Qxy8eBxJOl1t9Jt#w$>vCEmmY%36xAOV8}}s)vVfE_t5>{+s^=u%4Ud>LmG&av>TSEbQov`3MC^F{D{FWAgv`OU zCm$j*qG?o*X?0J+i@*Z{S%$6WayS>2}aazz-#sf8sP(zYM<4ge@w< z=3bUECL|@TAb6AA$ifG%M$<I=q* zv$!ipH0`ItOK+G3p%f_gw>fb)GS&nXSc|Z3{tF(bf7;dr{_j^e2OQ2(->#K4GD@Bv zhGBr4#J^{i{cMX`Yx+g=!%QbWRDp-$#6WF*TD1D{ysTL5cXke^eRY6S4-&4@nxiUO z`IZE&Uel+IZ?(Nre0% 
zFohuQaUp+B9AH}=RBD;0!ZQoWbny8l>9gXNs77tBH}YG|d?DY!lS)>nUEx#CM`^Ja zy5mxTrarp524--l;~lUhov!20U15HUJLAO&#Mh73%8Pfv$5%Wj?J@d@8C#y}3L+e0KZNxnfQNUhnk$)n={=K$zS{YrpoYnE$nH0iBv#Ylzu!#PcO%83v=R`(JWDGyAA z50u6jn8YAB8c^`dYG${nV}6|f*A^9f=0tCV6~dtXV`3y3921Mh;46vJhd?G;kMzh5 zHl!Lb#!!K8`59eaq5jE0&C*e7astin4WC0p`e629&n7`zZ=dZ@k6he>+T=CHu*!|s zFkIK7o*$SwZ$jQB6tNb~alYDG`!t)6clxrG*jXdG^)QU&4)gFz{fsS9i}|BD;^*Ij z6F{IL%xToD6oF2KI|#?AXg@GjFR4HM&Nm%hgl=Bf&Cnhm6AGVh$-Wc2T?p*;uKZFZ zN8rM)cn~W#^<6yGi)J323#5M+L>4v3X|#u4bH(;6Y~i|}`#y=%*o^EuhjbGs^4Lkzo){^;^?;=!x$+-KG9++as574%C~Aj=Npv1aRBuTJc=!O_jE zqgX(4zN(-1#lUUl?hY8ILbUTXw0jGqjq6oBr4RZh&{-!uf#%i9W>unfXHDz?s7Hdl zlDT559Y;XOHk$5>*izQf{6!Z7c3VtlE*V9J2Bd-bK`YFlzw%ClG_M3QY`{N9_PgA@ z5yO(&6FN5meqcXYn@{dC$&v=iQe?7Z7aAEw zmU^iaPY+qklA%eKi0s59ChzT8Iv#qC_q~tn%l$vj`#SIQyw2<6?|G1V>7M%XJtyuW2PdoTReP-5Rc-G`h< zPQsz4wvJLs47#*;Z(4NOOivGj_+Dtjtkh|Fx>55ex%&hj-}>9WexH(ujgq$)DU6ay zrd8A1>%oB&HatQ33mCx|qGpiNdlxod4EIVSpquo#v&X)~>h(Jgdpu-U&yJAd6`tEJ zUW4~sWPQEGi}aEuC2w3`y(N5V=+cz5wxIQZmAVnii7#A!Zl|L7TB5n=;g314aH$qA z&?VhOiMJD+lgZ*_$wZ@JVf%#>A?0QXas#T&c+*5p**TM%wa$sW`u52%DIvadOUC6s zEiWWbaPx``hHr>+p%i;<;q|5d z%5s<}dG%116K2&Cv8Mbx=}33=k7{4>E~Zq7NQ^=A0Wt_;IASq(Qu_3RJ~qa!^)BG& zcF~%6xI5Ue(}Eu4SI)>z8VDu5Bx1C$)V`(8yYxQi! 
z6(^D6;v!!EGZ1?A9ksmbS}Vgvowzlp$)T%9mK7v?I&&nIXPjTq8dvr()T6OjkivHo z3Su55y*yrBAC~^z<4bPF@!8eLvJiHe0~{kbn$w;|I}(L$GlAMh@ZGh~z5IYBE)k_6 zw`AJuJM|<;gzdgSI5`GO*0UopYg(+>UR3;Y>C7A1+SY3+l{I)dMFE4%3~ay=ud$1t zPDeVW`MbAYX=j&%Wa67$Z8gTUugj-pgvTctTjRW}eV~iVyh|OdYH9y&;hnLjLGt3} zZbgiFoRmdzY3KMw@|n1&C|fPbZ6rZ~14+7zulEnAdqw5Pnm}a!bKm}5iZ9dKKN&~U z`5;l9Axke)>ykUSPml)3>usyp&EiTajKbWEt;(W_os=4f4oPPNjsvZjvupDx5l53J zmOH18KRv1N+hyy1<=X2pjm7g^V>D46#>31*9n{*9QcvDy*4X!n)*Z%Ev5^iaHpr9A zbH|tDT4z`srtpLK9wFdSF(NUvQowwnA5SNoU(^qT5suyC^pRAx(4%euvLQ zoq*(d$Evp#$vn!<2srI6VFV8?N^%idx^LLw(%LNyq zeLl5Z48E$gj&Yw55TRn~!SB(iC#m!vsBDQ{(?z4pq65`6C<9pR2Px`y99*MkG8Q`v8>2iKr| z@}HW!-tlRvSqbQxMv|))Z+9Aob2;<6ElwUYV1CHf|Bz+#qSb3+*CU$V6uYLb^({py zkp8|jD7{OrvtfF7T0^M1eej`ouK2tiCzz?E6zTk1iTk#7JmZ;=cq!!U$?yw&X?KI& zI%{%J#BNqIy5}?RYsWK|^H9M8ucOej#&E5hd)SdT*D%xG$1lGWV2|7oZOcbUB==Ix zboa~Z`sJEC#+r>5H6EH4gQ~};jc)H=e0UU5Gnp4|6&fe#qk;ZDmA-?W z5!L<<$j(7HW4WRZ(i&u%|E;X0 zAr&|smDQSU5{}y~hR}JOGhG}A^^X(3;r+%_KH9?5N60dRy*5*j8#d@P)j3s|O9_dy z0j+y}JP~ox>D;1CqPvW?jBmEML8H2SI=F95NT0dqtMyAKC{57amrCC|X@S~X64_gY z{1Q9b@xsvTR)c-`B=VKmuIB+6U0*rmDXYwE-viHiVkIq4C0M)GW$sK#{AN>fWtRIR zVm-#j{f2_Qd)jlYTcKLUB4-AXKW+~Sb9Wa&ip^{%6XfI@U@_Y#Bf8r_Z(%vDen!Ia zsk34R*J?6d6nLsK6;c~Q?e1h%p8t|$Dw^s;a~KKILQol42Ma;it)*S{ub#W~acz6A zZkw^}T~##Ear!@PYu1<};|{tUHA`;exOGZ$+89f_immMAvu-?HyRg1*tuJrg^!qta zeq=y^f0;@^b;N2NheWfT|G$8l8(7;PV5S-Z`IEu?X@ZPE9YgO0W25-jzqv=2zwap2(W-SQxf5S zhio8s0Ft^08~O$}(>CQSv7KmsV@CJo4o0O;3Enu;0%4IITL4Te-z z`!z2d4jkmBI3O=_YhEBq<(Is0^e=S)kR^I+Ux0sxZS4V|Z4i{3@~Qx5_G=tUb!%UM z+BOi{&EEo(xwQ@;4z;xhDrzY7FFDlI0RDILeyS>J$NoiA+gcki=J2gO1vG#>{kb36 z5AWek^yAy`0Jri8CIWK>R<}NCxiE9}}&*aV7ZV ./bin/performance_benchmark -o benchmark.tex +$> ./bin/performance_benchmark -all -o benchmark.tex [...] -=== Memory copy (native C array) -... 100(50) -... 1000(33) -... 10000(22) -... 100000(14) -... 1000000(9) -... 10000000(6) -... 100000000(4) -... 1000000000(2) -... 10000000000(1)[failed!] -... 100000000000(1)[failed!] -=== Memory copy (gsVector) -... 
100(50) -... 1000(33) +[memcopyCarray] Memory copy (native C array) +100(100)...400(66)...1600(44)...6400(29)...25600(19)...102400(12)... ...1677721600(1)[failed!]... +[memcopyEigen] Memory copy (gsVector) +100(100)...400(66)...1600(44)...6400(29)...25600(19)...102400(12)... ...1677721600(1)[failed!]... [...] ~~~~ -By using the `-o` flag the output is written to the file -`benchmark.tex`. If this flag is omitted, the output is written to -\ref gsInfo. +By using the `-o` flag the detailed output is written to the file +`benchmark.tex`. + In default mode, the performance benchmark runs each benchmark for a sequence of increasing problem sizes starting at 100 and increasing -the problem size by a factor of 10 until the total system memory is +the problem size by a factor of 4 until the total system memory is exceeded. The latter is indicated in the output above by the trailing `[failed!]`. We will explain below how this case is handled by a `memory_safeguard` mechanism that detects insufficient memory without -trying to allocate the memory in the first place. The value in `()` -indicates the number of runs the particular test is executed. For very -small problem sizes it is advisable to run the same test multiple -times and average the result over the number of runs to reduce the -influence of inaccurate time measurements. - -The outputfile `benchmark.tex` is transformed into a PDF file using +trying to allocate the memory in the first place. The value in parentheses, +e.g., `400(66)`, indicates the number of runs the particular test is +executed, here 66 times. For very small problem sizes it is advisable +to run the same test multiple times and average the result over the +number of runs to reduce the influence of inaccurate time +measurements. 
+ +The output file `benchmark.tex` is transformed into a PDF file using the command \c pdflatex (see https://www.latex-project.org): \image html figs/performance_benchmark_memcopy1.pdf -\image html figs/performance_benchmark_memcopy2.pdf - Each group represents a different problem size. By default, each problem size is run with 1, 2, 4, ..., `omp_get_max_threads()` OpenMP -threads, which is represented by the different bars. +threads, which is represented by the different bars. The above plot +shows the speedup achieved with multiple OpenMP threads for moderate +problem sizes (e.g., 1 and 6 MB) and the saturation of the memory +subsystem around 60 GB/s for problem sizes larger than 100 MB. The +figure below shows the same benchmark but implemented with the \ref +gsVector class instead of native C arrays. + +\image html figs/performance_benchmark_memcopy2.pdf + +When running all benchmarks (`--all` flag) the output file will +contain additional plots that compare benchmarks of the same type, +e.g., memory copy of C-style arrays and \ref gsVector. + +\image html figs/performance_benchmark_memcopy3.pdf + +Since the values can differ by orders of magnitude it might be useful +to replace `\begin{axis}...\end{axis}` by +`\begin{semilogyaxis}...\end{semilogyaxis}` in the output file +`benchmark.tex` before executing the `pdflatex` command to produce +plots with logarithmic y-axis. A list of benchmark results for different computer architectures, compilers, and operating systems is mainted at the G+Smo Wiki. -\section CustomizingTheBenchmark Customizing the performance benchmark +\section PerformanceBenchmarkCustomizingTheBenchmark Customizing the performance benchmark The performance benchmark can be customized using various command-line -arguments. One or a subset of all available benchmarks can be selected -using the `-b` flag, e.g., +arguments. 
Individual benchmarks can be selected using the `-b` flag, +e.g., ~~~~bash -$> ./bin/performance_benchmark -b1 -b 4 -o benchmark.tex +$> ./bin/performance_benchmark -b 1 -b 4 -o benchmark.tex ~~~~ will run *benchmark #1* (memory copy (native C array)) and *benchmark @@ -110,81 +119,115 @@ will run *benchmark #1* (memory copy (native C array)) and *benchmark The problem sizes can be defined by either providing a list of values, e.g., ~~~~bash -$> ./bin/performance_benchmark -b1 -v 100 -v 500 -v 1000 -o benchmark.tex +$> ./bin/performance_benchmark -b 1 -v 100 -v 500 -v 1000 -o benchmark.tex [...] -=== Memory copy (native C array) -... 100(50) -... 500(33) -... 1000(22) +[memcopyCarray] Memory copy (native C array) +100(100)...500(66)...1000(44)... ~~~~ -or by providing the smallest (`--vsizesmin`) and largest -(`--vsizesmax`) problem size and, optionally, the factor (`-V`) by +or by providing the smallest (`--vsizemin`) and largest +(`--vsizemax`) problem size and, optionally, the factor (`-V`/`--vsizefactor`) by which the problem size should be increased, e.g., ~~~~bash -$> ./bin/performance_benchmark -b 1 --vsizesmin 100 --vsizesmax 1000 -V 1.2 -o result.tex +$> ./bin/performance_benchmark -b 1 --vsizemin 100 --vsizemax 1000 -V 1.2 -o benchmark.tex [...] -... 100(50) -... 120(33) -... 144(22) -... 172(14) -... 206(9) -... 247(6) -... 296(4) -... 355(2) -... 426(1) -... 511(1) -... 613(1) -... 735(1) -... 882(1) +[memcopyCarray] Memory copy (native C array) +100(100)...120(66)...144(44)...172(29)...206(19)...247(12)...296(8)...355(5)...426(3)...511(2)...613(1)...735(1)...882(1)... ~~~~ -Here, the `vsizes`-family of flags refers to all vector-type -benchmarks. Similarly, the `msizes`-family of flags (`--msizesmin`, -`--msizesmax`, `-M`) refers to all matrix-type benchmarks. +Here, the `vsize`-family of flags refers to all vector-type +benchmarks. 
Similarly, the `msize`-family of flags (`--msizemin`, +`--msizemax`, `-M`/`--msizefactor`) refers to all matrix-type +benchmarks. The sequence of runs can be specified in the same way, e.g., ~~~~bash -$> ./bin/performance_benchmark -b 1 --vsizesmin 100 --vsizesmax 1000 -V 1.2 --runsmin 4 --runsmax 80 -R 1.3 -o result.tex +$> ./bin/performance_benchmark -b 1 --vsizemin 100 --vsizemax 1000 -V 1.2 --runsmin 4 --runsmax 80 -R 1.3 -o benchmark.tex [...] -=== Memory copy (native C array) -... 100(80) -... 120(61) -... 144(46) -... 172(35) -... 206(26) -... 247(20) -... 296(15) -... 355(11) -... 426(8) -... 511(6) -... 613(4) -... 735(4) -... 882(4) +[memcopyCarray] Memory copy (native C array) +100(80)...120(61)...144(46)...172(35)...206(26)...247(20)...296(15)...355(11)...426(8)...511(6)...613(4)...735(4)...882(4)... ~~~~ Here, the smallest problem size is executed 80 times (`--runsmax`) and for each larger problem instance, the number of runs is successively -reduced by the factor 1.3 (`-R`) but not below 4 (`--runsmin`). +reduced by the factor 1.3 (`-R`/`--runsfactor`) but not below 4 (`--runsmin`). Finally, the number of OpenMP threads that should be used can be specified globally by providing an explicit list, e.g., ~~~~bash -$> ./bin/performance_benchmark -t 1 -t 4 -t 8 +$> ./bin/performance_benchmark -t 1 -t 4 -t 8 -o benchmark.tex ~~~~ -runs all benchmarks with 1, 4, and 6 OpenMP threads. +runs all benchmarks with 1, 4, and 8 OpenMP threads. 
-\section ImplementingAdditionalBenchmarks Implementing additional benchmarks + +Instead of writing the detailed output to a LaTeX file it is also possible to create an XML file using the `-o` flag with a filename ending with `.xml`, e.g., + +~~~~bash +$> ./bin/performance_benchmark -b 1 -o benchmark.xml +~~~~ + +The XML file can be opened as shown in the code snippet below: + +~~~~cpp +std::string fn="benchmark.xml"; +gsBenchmark benchmark; +gsFileData<> fd(fn); +fd.getId(0, benchmark); +gsInfo << benchmark; +~~~~ + +This will write the benchmark output to \ref gsInfo. + +If the `-o` flag is omitted, the output is written to +\ref gsInfo: + +~~~~text +[memcopyCarray] Memory copy (native C array) + memsize | 4x (#Threads : Bandwidth in GB/s) + 1 KB | 1 : 6.10e+00 2 : 1.17e+00 4 : 8.77e-01 8 : 3.17e-01 + 6 KB | 1 : 1.17e+01 2 : 9.47e+00 4 : 4.70e+00 8 : 2.62e+00 + 25 KB | 1 : 3.15e+01 2 : 8.28e+00 4 : 1.35e+01 8 : 1.08e+01 + 100 KB | 1 : 4.54e+01 2 : 4.48e+01 4 : 5.06e+01 8 : 6.72e+00 + 400 KB | 1 : 3.48e+01 2 : 5.94e+01 4 : 7.62e+01 8 : 8.33e+01 + 1 MB | 1 : 2.96e+01 2 : 6.92e+01 4 : 1.17e+02 8 : 6.41e+01 + 6 MB | 1 : 2.34e+01 2 : 8.51e+01 4 : 1.17e+02 8 : 1.12e+02 + 25 MB | 1 : 2.78e+01 2 : 5.67e+01 4 : 7.02e+01 8 : 6.77e+01 + 100 MB | 1 : 1.51e+01 2 : 5.85e+01 4 : 5.72e+01 8 : 4.59e+01 + 400 MB | 1 : 1.51e+01 2 : 5.95e+01 4 : 5.69e+01 8 : 4.59e+01 + 1 GB | 1 : 5.48e+00 2 : 7.31e+00 4 : 2.05e+01 8 : 1.76e+01 + 6 GB | 1 : 4.43e+00 2 : 5.71e+00 4 : 6.26e+00 8 : 4.03e+00 +~~~~ + +The above output shows the bandwidth in GB/s of the memory copy +benchmark for different array sizes (rows) and 1, 2, 4, and 8 OpenMP +threads (columns). 
+ +\section PerformanceBenchmarkImplementingAdditionalBenchmarks Implementing additional benchmarks To implement additional benchmarks, copy one of the existing ones and adjust the constructors and member functions accordingly: \snippet performance_benchmark.cpp Implement benchmark eigen dense matrix-vector multiplication +Make sure that all tasks that should not be included in the time +measurement are performed in the constructor. Furthermore, make sure +to instantiate the `memory_safeguard` object `_msg(n)` first as it +will let the benchmark fail gracefully if the estimated amount of +memory exceeds the system's total memory. The implementation of the +`memory_safeguard` class is given below: + +\snippet performance_benchmark.cpp Implement memory safeguard + +If you are unsure about the exact memory consumption you can return an +upper bound, e.g., expected memory consumption + 10%, in the +benchmark's `size(index_t n)` function. + +\section PerformanceBenchmarkAnnotatedSourceFile Annotated source file + Here is the full file \c examples/performance_benchmark.cpp. Clicking on a function or class name will lead you to its reference documentation. diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index 23d51646da..a6ab7783a8 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -508,12 +508,19 @@ template class benchmark_eigen_dense_matmul { private: + // The memory safeguard will ensure that the benchmark fails + // gracefully (i.e. without trying to actually allocate memory) if + // the estimated amount of memory exceeds the system's total memory memory_safeguard _msg; const index_t n; gsMatrix A; gsVector x, y; public: + // All tasks that should not be included in the time measurement + // must be performed in the constructor. 
Make sure to instanciate + // _msg(n) first as it will let the benchmark fail gracefully if the + // estimated amount of memory exceeds the system's total memory benchmark_eigen_dense_matmul(index_t n) : _msg(n), n(n), A(n,n), x(n), y(n) { @@ -537,6 +544,8 @@ class benchmark_eigen_dense_matmul return size(n); } + // This function will be called by the memory_safeguard to determine + // whether the benchmark will exceed the system's total memory static constexpr uint64_t size(index_t n) { return (2 * uint64_t(n) * uint64_t(n) + uint64_t(n)) * sizeof(T); @@ -957,9 +966,9 @@ int main(int argc, char *argv[]) index_t subdividemin = 0; index_t vsizemin = 100; real_t patchesfactor = 2; - real_t msizesfactor = 2; + real_t msizefactor = 2; real_t nrunsfactor = 1.5; - real_t vsizesfactor = 4; + real_t vsizefactor = 4; index_t msizemax = (index_t) math::min((real_t)std::numeric_limits::max(), std::sqrt((real_t)(0.8) * sizeof(real_t)*gsSysInfo::getMemoryInBytes())); index_t vsizemax = (index_t) math::min((real_t)std::numeric_limits::max(), @@ -968,10 +977,10 @@ int main(int argc, char *argv[]) gsCmdLine cmd("G+Smo performance benchmark."); cmd.printVersion(); - cmd.addReal("M", "msizesfactor", "Growth factor for the sequence of msizes (only used if '-m' is not given)", msizesfactor); + cmd.addReal("M", "msizefactor", "Growth factor for the sequence of msizes (only used if '-m' is not given)", msizefactor); cmd.addReal("P", "patchesfactor", "Growth factor for the sequence of patches (only used if '-p' is not given)", patchesfactor); cmd.addReal("R", "runsfactor", "Growth factor for the sequence of runs (only used if '-r' is not given)", nrunsfactor); - cmd.addReal("V", "vsizesfactor", "Growth factor for the sequence of vsizes (only used if '-v' is not given)", vsizesfactor); + cmd.addReal("V", "vsizefactor", "Growth factor for the sequence of vsizes (only used if '-v' is not given)", vsizefactor); cmd.addInt("", "msizemax", "Maximum number of unknowns in matrix/vector 
benchmarks (only used if '-m' is not given)", msizemax); cmd.addInt("", "msizemin", "Minimum number of unknowns in matrix/vector benchmarks (only used if '-m'is not given)", msizemin); cmd.addInt("", "patchesmax", "Maximum number of patches in assembly benchmarks (only used if '-p' is not given)", patchesmax); @@ -1042,12 +1051,12 @@ int main(int argc, char *argv[]) nthreads.push_back(i); } - // If empty fill with msizemin*msizesfactor^k, k=0, 1, 2, ..., msizemax + // If empty fill with msizemin*msizefactor^k, k=0, 1, 2, ..., msizemax if (msizes.empty()) { for(index_t i=msizemin;;) { msizes.push_back(i); - if (i<=math::min(msizemax, std::numeric_limits::max()) / (msizesfactor*msizesfactor)) - i*=msizesfactor; + if (i<=math::min(msizemax, std::numeric_limits::max()) / (msizefactor*msizefactor)) + i*=msizefactor; else break; } @@ -1065,12 +1074,12 @@ int main(int argc, char *argv[]) subdivides.push_back(i); } - // If empty fill with vsizemin*vsizesfactor^k, k=0, 1, 2, ..., vsizemax + // If empty fill with vsizemin*vsizefactor^k, k=0, 1, 2, ..., vsizemax if (vsizes.empty()) { for(index_t i=vsizemin;;) { vsizes.push_back(i); - if (i<=math::min(vsizemax, std::numeric_limits::max()) / vsizesfactor) - i*=vsizesfactor; + if (i<=math::min(vsizemax, std::numeric_limits::max()) / vsizefactor) + i*=vsizefactor; else break; } @@ -1293,12 +1302,19 @@ int main(int argc, char *argv[]) else if (gsFileManager::getExtension(fn) == "xml") { gsFileData<> file; file << benchmark; - file.save("result.xml"); + file.save(fn); } else { GISMO_ERROR("Unsupported file extension"); } //! 
[Execute benchmarks] + + { + gsBenchmark bm; + gsFileData<> fd(fn); + fd.getId(0, bm); + gsInfo << bm; + } return EXIT_SUCCESS; } From ef05a6e882e07912e7beb704dcbd863c25742ab9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Wed, 2 Feb 2022 16:37:31 +0100 Subject: [PATCH 164/174] [ci skip] Updated README --- README.md | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4ccc6bdeb0..95ddaa73c9 100644 --- a/README.md +++ b/README.md @@ -118,14 +118,27 @@ Release, RelWithDebInfo, MinSizeRel. * GISMO_COEFF_TYPE *double* - The arithmetic type to be used for all computations. Available options -include double, long double, float. + The arithmetic type to be used for all computations. Available +options are float, double, long double, mpfr::mpreal, mpq_class, +posit_2_0, posit_3_0, posit_3_1, posit_4_0, posit_8_0, posit_8_1, +posit_16_1, posit_32_2, posit_64_3, posit_128_4, posit_256_5 * GISMO_EXTRA_INSTANCE *not set* If set to one or more of the options available for GISMO_COEFF_TYPE the G+Smo library is compiled with extra arithmetic types enabled. +* GISMO_INDEX_TYPE *int* + + The integer type to be used for all indices. Available options are +int, int8_t, int16_t, int32_t, int64_t, long, long long + +* GISMO_SHORT_TYPE *int* + + The integer type to be used for all non-index integers, e.g., the +spatial dimension. Available options are int, int8_t, int16_t, +int32_t, int64_t, long, long long + * GISMO_EXTRA_DEBUG *OFF* If set to ON additional debugging tools are enabled during @@ -168,7 +181,12 @@ compiled. The location for installation of the library, e.g. /usr/local on some Linux systems. +* TARGET_ARCHITECTURE *auto* + If G+Smo is built in release mode optimized compiler flags for the + selected target architecture are used. *auto* determines the + architecture of the host system automatically. 
Available options are auto, generic, none, native and any value CPUID, e.g., skylake or apple-m1. + # Directory structure From 9b8a34067f100b785e73df8864576241adef1e63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Wed, 2 Feb 2022 16:37:57 +0100 Subject: [PATCH 165/174] [ci skip] Removed obsolete code --- cmake/gsOptions.cmake | 3 --- 1 file changed, 3 deletions(-) diff --git a/cmake/gsOptions.cmake b/cmake/gsOptions.cmake index ca4a563f88..5bbdc6dfab 100644 --- a/cmake/gsOptions.cmake +++ b/cmake/gsOptions.cmake @@ -19,11 +19,8 @@ if(EXISTS "${CMAKE_SOURCE_DIR}/.git") endif() endif() message (" CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}") -#message (" CMAKE_C_COMPILER ${CMAKE_C_COMPILER}") -#message (" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}") message (" CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER}") message (" CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD}") -#message (" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") message (" GISMO_COEFF_TYPE ${GISMO_COEFF_TYPE}") message (" GISMO_INDEX_TYPE ${GISMO_INDEX_TYPE}") From 4c009960bfe030a5f3edb97d130bc8c46e171af8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Wed, 2 Feb 2022 17:09:53 +0100 Subject: [PATCH 166/174] [ci skip] Updated README --- README.md | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 95ddaa73c9..cb17b14417 100644 --- a/README.md +++ b/README.md @@ -183,10 +183,23 @@ Linux systems. * TARGET_ARCHITECTURE *auto* - If G+Smo is built in release mode optimized compiler flags for the - selected target architecture are used. *auto* determines the - architecture of the host system automatically. Available options are auto, generic, none, native and any value CPUID, e.g., skylake or apple-m1. - + If G+Smo is built in Release mode optimized compiler flags for the +selected target architecture are used. *auto* determines the +architecture of the host system automatically. 
Available options are +auto, generic, none, native and any value CPUID, e.g., skylake or +apple-m1. + +* TARGET_PROFILER *none* + + If G+Smo is build in Release mode compiler flags for the selected +target profiler are used. Available options are gprof and vtune (on +x86/x86_64 systems). + +* OFA_VERBOSE *OFF* + + If enabled the OptimizeForArchitecture script will produce verbose +output which might be helpful for debugging purposes. + # Directory structure From c0b9b0687a3f6410cda08769e72784ccb9e95ed6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Wed, 2 Feb 2022 22:25:13 +0100 Subject: [PATCH 167/174] [ci skip] Removed obsolete code --- examples/performance_benchmark.cpp | 93 ++++++++++++++---------------- 1 file changed, 43 insertions(+), 50 deletions(-) diff --git a/examples/performance_benchmark.cpp b/examples/performance_benchmark.cpp index a6ab7783a8..cdf7017e84 100644 --- a/examples/performance_benchmark.cpp +++ b/examples/performance_benchmark.cpp @@ -322,7 +322,7 @@ class benchmark_c_array_dense_matmul { return "densematmulCarray"; } - + static constexpr gismo::metric metric() { return gismo::metric::bandwidth_gb_sec; @@ -379,7 +379,7 @@ class benchmark_eigen_memcopy { return "memcopyEigen"; } - + static constexpr gismo::metric metric() { return gismo::metric::bandwidth_gb_sec; @@ -492,7 +492,7 @@ class benchmark_eigen_axpy { return "axpyEigen"; } - + static constexpr gismo::metric metric() { return gismo::metric::bandwidth_gb_sec; @@ -560,7 +560,7 @@ class benchmark_eigen_dense_matmul { return "densematmulEigen"; } - + static constexpr gismo::metric metric() { return gismo::metric::bandwidth_gb_sec; @@ -589,7 +589,7 @@ class benchmark_poisson2d_visitor benchmark_poisson2d_visitor(std::tuple args) : benchmark_poisson2d_visitor(std::get<0>(args), std::get<1>(args), std::get<2>(args)) {} - + benchmark_poisson2d_visitor(int numPatches, int numRefine=0, int degree=1) : _msg(numPatches, numRefine, degree), numPatches(numPatches), 
numRefine(numRefine), degree(degree), @@ -645,7 +645,7 @@ class benchmark_poisson2d_visitor { return "assemble2dVisitorAssembler"; } - + static constexpr gismo::metric metric() { return (gismo::metric)(gismo::metric::runtime_sec + gismo::metric::speedup); @@ -674,7 +674,7 @@ class benchmark_poisson3d_visitor benchmark_poisson3d_visitor(std::tuple args) : benchmark_poisson3d_visitor(std::get<0>(args), std::get<1>(args), std::get<2>(args)) {} - + benchmark_poisson3d_visitor(int numPatches, int numRefine=0, int degree=1) : _msg(numPatches, numRefine, degree), numPatches(numPatches), numRefine(numRefine), degree(degree), @@ -730,7 +730,7 @@ class benchmark_poisson3d_visitor { return "assemble3dVisitorAssembler"; } - + static constexpr gismo::metric metric() { return (gismo::metric)(gismo::metric::runtime_sec + gismo::metric::speedup); @@ -751,46 +751,46 @@ class benchmark_poisson2d_expression_assembler gsMultiPatch geo; gsMultiBasis bases; gsBoundaryConditions bc; - + gsExprAssembler A; typename gsExprAssembler<>::geometryMap G; typename gsExprAssembler<>::space u; gsFunctionExpr f; expr::gsComposition ff; - + public: template benchmark_poisson2d_expression_assembler(std::tuple args) : benchmark_poisson2d_expression_assembler(std::get<0>(args), std::get<1>(args), std::get<2>(args)) {} - + benchmark_poisson2d_expression_assembler(int numPatches, int numRefine=0, int degree=1) : _msg(numPatches, numRefine, degree), numPatches(numPatches), numRefine(numRefine), degree(degree), geo(gsNurbsCreator<>::BSplineSquareGrid(numPatches, numPatches, 1.0)), bases(geo, true), A(1,1), G(A.getMap(geo)), u(A.getSpace(bases)), f("0.0", 2), ff(A.getCoeff(f, G)) - { + { // h-refine each basis for (int i = 0; i < numRefine; ++i) bases.uniformRefine(); - + // k-refinement (set degree) for (std::size_t i = 0; i < bases.nBases(); ++ i) bases[i].setDegreePreservingMultiplicity(degree); - + // set the geometry map to boundary conditions bc.setGeoMap(geo); - + // setup boundary conditions 
u.setup(bc, dirichlet::l2Projection, 0); // set elements used for numerical integration A.setIntegrationElements(bases); - + // initialize the system - A.initSystem(); + A.initSystem(); } uint64_t operator()() @@ -801,7 +801,7 @@ class benchmark_poisson2d_expression_assembler , u * ff * meas(G) //rhs vector ); - + return sizeof(T) * (A.matrix().nonZeros() + A.rhs().rows()); } @@ -836,7 +836,7 @@ class benchmark_poisson2d_expression_assembler { return "assemble2dExpressionAssembler"; } - + static constexpr gismo::metric metric() { return (gismo::metric)(gismo::metric::runtime_sec + gismo::metric::speedup); @@ -857,46 +857,46 @@ class benchmark_poisson3d_expression_assembler gsMultiPatch geo; gsMultiBasis bases; gsBoundaryConditions bc; - + gsExprAssembler A; typename gsExprAssembler<>::geometryMap G; typename gsExprAssembler<>::space u; gsFunctionExpr f; expr::gsComposition ff; - + public: template benchmark_poisson3d_expression_assembler(std::tuple args) : benchmark_poisson3d_expression_assembler(std::get<0>(args), std::get<1>(args), std::get<2>(args)) {} - + benchmark_poisson3d_expression_assembler(int numPatches, int numRefine=0, int degree=1) : _msg(numPatches, numRefine, degree), numPatches(numPatches), numRefine(numRefine), degree(degree), geo(gsNurbsCreator<>::BSplineCubeGrid(numPatches, numPatches, numPatches, 1.0)), bases(geo, true), A(1,1), G(A.getMap(geo)), u(A.getSpace(bases)), f("0.0", 3), ff(A.getCoeff(f, G)) - { + { // h-refine each basis for (int i = 0; i < numRefine; ++i) bases.uniformRefine(); - + // k-refinement (set degree) for (std::size_t i = 0; i < bases.nBases(); ++ i) bases[i].setDegreePreservingMultiplicity(degree); - + // set the geometry map to boundary conditions bc.setGeoMap(geo); - + // setup boundary conditions u.setup(bc, dirichlet::l2Projection, 0); // set elements used for numerical integration A.setIntegrationElements(bases); - + // initialize the system - A.initSystem(); + A.initSystem(); } uint64_t operator()() @@ -907,7 +907,7 
@@ class benchmark_poisson3d_expression_assembler , u * ff * meas(G) //rhs vector ); - + return sizeof(T) * (A.matrix().nonZeros() + A.rhs().rows()); } @@ -942,7 +942,7 @@ class benchmark_poisson3d_expression_assembler { return "assemble3dExpressionAssembler"; } - + static constexpr gismo::metric metric() { return (gismo::metric)(gismo::metric::runtime_sec + gismo::metric::speedup); @@ -1001,7 +1001,7 @@ int main(int argc, char *argv[]) cmd.addString("o", "output", "Name of the output file", fn); cmd.addSwitch("list", "List all benchmarks and exit", list); cmd.addSwitch("all", "Run all benchmarks", all); - + try { cmd.getValues(argc,argv); } catch (int rv) { return rv; } //! [Parse command line] @@ -1032,7 +1032,7 @@ int main(int argc, char *argv[]) << " with increasing number of patches" << "\n" << "#16: " << benchmark_poisson3d_expression_assembler::descr() << " with increasing number of subdivisions" << "\n"; - + return EXIT_SUCCESS; } //! [List benchmarks and exit] @@ -1044,7 +1044,7 @@ int main(int argc, char *argv[]) for(index_t i=1; i<=16; ++i) benchmarks.push_back(i); } - + // If empty fill with 1, 2, 4, ..., maximum number of OpenMP threads if (nthreads.empty()) { for(index_t i=1; i<=omp_get_max_threads(); i*=2) @@ -1073,7 +1073,7 @@ int main(int argc, char *argv[]) for(index_t i=subdividemin; i > @@ -1199,7 +1199,7 @@ int main(int argc, char *argv[]) nruns, nthreads, " with increasing number of subdivisions (#patches=1, degree=2)"); break; } - + case (13): { // Benchmark: expression assembler-based Poisson 2d assembler with increasing number of patches benchmark.create > @@ -1239,7 +1239,7 @@ int main(int argc, char *argv[]) nruns, nthreads, " with increasing number of subdivisions (#patches=1, degree=2)"); break; } - + default: GISMO_ERROR("Invalid benchmark"); } @@ -1249,7 +1249,7 @@ int main(int argc, char *argv[]) { // Memory copy ratio auto bmA = benchmark.find(benchmark_c_array_memcopy::label()); auto bmB = 
benchmark.find(benchmark_eigen_memcopy::label()); - + if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { auto bm = util::ratio("memcopyRatio", "Memory copy (gsVector : native C array)", *bmB, *bmA); @@ -1260,7 +1260,7 @@ int main(int argc, char *argv[]) { // Dot product ratio auto bmA = benchmark.find(benchmark_c_array_dotproduct::label()); auto bmB = benchmark.find(benchmark_eigen_dotproduct::label()); - + if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { auto bm = util::ratio("dotproductRatio", "Dot product (gsVector : native C array)", *bmB, *bmA); @@ -1271,7 +1271,7 @@ int main(int argc, char *argv[]) { // AXPY ratio auto bmA = benchmark.find(benchmark_c_array_axpy::label()); auto bmB = benchmark.find(benchmark_eigen_axpy::label()); - + if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { auto bm = util::ratio("axpyRatio", "AXPY (gsVector : native C array)", *bmB, *bmA); @@ -1282,15 +1282,15 @@ int main(int argc, char *argv[]) { // Dense matrix-vector multiplication ratio auto bmA = benchmark.find(benchmark_c_array_dense_matmul::label()); auto bmB = benchmark.find(benchmark_eigen_dense_matmul::label()); - + if (bmA != std::end(benchmark.get()) && bmB != std::end(benchmark.get())) { auto bm = util::ratio("densematmulRatio", "Dense matrix-vector multiplication (gsMatrix/gsVector : native C array)", *bmB, *bmA); benchmark.get().push_back( give(bm) ); } - } - + } + if (fn.empty()) gsInfo << benchmark << "\n"; else if (gsFileManager::getExtension(fn) == "tex") { @@ -1309,12 +1309,5 @@ int main(int argc, char *argv[]) } //! 
[Execute benchmarks] - { - gsBenchmark bm; - gsFileData<> fd(fn); - fd.getId(0, bm); - gsInfo << bm; - } - return EXIT_SUCCESS; } From 68cbeae3813c4da5939bcc327f306be03dc31719 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Thu, 3 Feb 2022 18:11:35 +0100 Subject: [PATCH 168/174] [ci skip] update OFA --- cmake/ofa/ChecksArm.txt | 112 +++++++++++++++++++++++-------- cmake/ofa/HandleArmOptions.cmake | 7 +- 2 files changed, 89 insertions(+), 30 deletions(-) diff --git a/cmake/ofa/ChecksArm.txt b/cmake/ofa/ChecksArm.txt index ecbaed6472..cd68fa3294 100644 --- a/cmake/ofa/ChecksArm.txt +++ b/cmake/ofa/ChecksArm.txt @@ -59,42 +59,110 @@ # # pop_enable:SunPro -aes;cstdlib;exit;0 -bf16,sve;arm_sve.h;svbfdot;svfloat32_t(),svbfloat16_t(),svbfloat16_t() -crc;cstdlib;exit;0 +# ARM (aarch32) 32-bit + +# armv4 : no options +# armv4t : no options + +# armv5t : no options +# armv5te : no options +# armv5tej : no options + +# armv6 : fp nofp vfpv2 +# armv6j : fp nofp vfpv2 +# armv6k : fp nofp vfpv2 +# armv6z : fp nofp vfpv2 +# armv6kz : fp nofp vfpv2 +# armv6zk : fp nofp vfpv2 +# armv6t2 : fp nofp vfpv2 +# armv6-m : no options +# armv6s-m : no options +fp;arm_neon.h;vcvt_f16_f32;float32x4_t() +vfpv2;cstdlib;exit;0 + +# armv7 : fp nofp vfpv3-d16 +vfpv3-d16;cstdlib;exit;0;vfpv3_d16 + +# armv7-a : mp sec fp vfpv3 vfpv3-d16-fp16 vfpv3-fp16 vfpv4-d16 vfpv4 simd +# neon-fp16 neon-vfpv4 nosimd nofp vfpv3-d16 neon neon-vfpv3 +# armv7ve : vfpv3-d16 vfpv3 vfpv3-d16-fp16 vfpv3-fp16 fp vfpv4 neon neon-fp16 +# simd nosimd nofp vfpv4-d16 neon-vfpv3 neon-vfpv4 +mp;cstdlib;exit;0 +neon;cstdlib;exit;0 +neon-fp16;cstdlib;exit;0;neon_fp16 +neon-vfpv3;cstdlib;exit;0;neon_vfpv3 +neon-vfpv4;cstdlib;exit;0;neon_vfpv4 +sec;cstdlib;exit;0 +simd;cstdlib;exit;0 +vfpv3;cstdlib;exit;0 +vfpv3-d16-fp16;cstdlib;exit;0;vfpv3_d16_fp16 +vfpv3-fp16;cstdlib;exit;0;vfpv3_fp16 +vfpv4;cstdlib;exit;0 +vfpv4-d16;cstdlib;exit;0;vfpv4_d16 + +# armv7-r : fp.sp fp vfpv3xd-fp16 vfpv3-d16-fp16 idiv nofp 
noidiv vfpv3xd vfpv3-d16 +fp.sp;cstdlib;exit;0;fp_sp +fp.dp;cstdlib;exit;0;fp_dp +idiv;cstdlib;exit;0 +vfpv3xd;cstdlib;exit;0 +vfpv3xd-fp16;cstdlib;exit;0;vfpv3xd_fp16 + +# armv7-m : no options +# armv7e-m : fp fpv5 fp.dp nofp vfpv4-sp-d16 fpv5-d16 +fpv5;cstdlib;exit;0 +fpv5-d16;cstdlib;exit;0;fpv5_d16 +vfpv4-sp-d16;cstdlib;exit;0;vfpv4_sp_d16 + +# armv8-a : crc simd crypto nocrypto nofp sb predres +crc;arm_acle.h;__crc32b;(uint32_t)0,(uint8_t)0 crypto;arm_neon.h;vaesdq_u8;uint8x16_t(), uint8x16_t() +sb;cstdlib;exit;0 +predres;cstdlib;exit;0 + +# armv8-r : crc fp.sp simd crypto nocrypto nofp +# armv8.1-a : simd crypto nocrypto nofp sb predres +# armv8.2-a : simd fp16 fp16fml crypto nocrypto nofp dotprod sb predres i8mm bf16 +bf16,sve;arm_sve.h;svbfdot;svfloat32_t(),svbfloat16_t(),svbfloat16_t() dotprod;arm_neon.h;svdot;svint32_t(),svint8_t(),svint8_t() +fp16;arm_neon.h;vabdq_f16;float16x8_t(),float16x8_t() +fp16fml;arm_neon.h;vfmlalq_high_f16;float32x4_t(),float16x8_t(),float16x8_t() +i8mm,sve;arm_sve.h;svmmla;svint32_t(),svint8_t(),svint8_t() +simd;arm_neon.h;vaddq_u32;uint32x4_t(),uint32x4_t() + +# armv8.3-a : simd fp16 fp16fml crypto nocrypto nofp dotprod sb predres i8mm bf16 +# armv8.4-a : simd fp16 crypto nocrypto nofp sb predres i8mm bf16 +# armv8.5-a : simd fp16 crypto nocrypto nofp i8mm bf16 +# armv8.6-a : simd fp16 crypto nocrypto nofp i8mm bf16 + +# ARM64 (aarch64) 64-bit + +# armv8.x-a : fp simd crypto crc lse fp16 rcpc rdma dotprod aes sha2 sha3 sm4 fp16fml sve profile rng memtag sb ssbs predres sve2 sve2-sm4 sve2-aes sve2-sha3 sve2-bitperm tme i8mm f32mm f64mm bf16 flagm pauth asimd crc32 +crc32;arm_acle.h;__crc32b;(uint32_t)0,(uint8_t)0 +simd;cstdlib;exit;0;asimd +aes,crypto;arm_neon.h;vaesdq_u8;uint8x16_t(), uint8x16_t() dsp,sve;arm_sve.h;svqadd_z;svbool_t(),svint8_t(),svint8_t() f32mm,sve;arm_sve.h;svmmla;svfloat32_t(),svfloat32_t(),svfloat32_t() f64mm,sve;arm_sve.h;svmmla;svfloat64_t(),svfloat64_t(),svfloat64_t() flagm;cstdlib;exit;0 
-fp;arm_neon.h;vcvt_f16_f32;float32x4_t() -fp16;arm_neon.h;vabdq_f16;float16x8_t(),float16x8_t() -fp16fml;arm_neon.h;vfmlalq_high_f16;float32x4_t(),float16x8_t(),float16x8_t() -fd_dp;cstdlib;exit;0 -fp_sp;cstdlib;exit;0 -i8mm,sve;arm_sve.h;svmmla;svint32_t(),svint8_t(),svint8_t() -idiv;cstdlib;exit;0 lse;cstdlib;exit;0 memtag;cstdlib;exit;0 mve;cstdlib;exit;0 mve_fp;cstdlib;exit;0 -neon_fp16;cstdlib;exit;0 -neon_vfpv4;cstdlib;exit;0 pauth;cstdlib;exit;0 -predres;cstdlib;exit;0 profile;cstdlib;exit;0 ras;cstdlib;exit;0 rcpc;cstdlib;exit;0 rdm;cstdlib;exit;0 rdma;cstdlib;exit;0 rng;cstdlib;exit;0 -sb;cstdlib;exit;0 sec;cstdlib;exit;0 -sha2;cstdlib;exit;0 -sha3;cstdlib;exit;0 -simd;arm_neon.h;vaddq_u32;uint32x4_t(),uint32x4_t() +sha2,crypto;arm_neon.h;vsha256hq_u32;uint32x4_t(),uint32x4_t(),uint32x4_t() +sha3;arm_neon.h;vsha512hq_u64;uint64x2_t(),uint64x2_t(),uint64x2_t() +sm4;arm_neon.h;vsm4eq_u32;uint32x4_t(), uint32x4_t() ssbs;cstdlib;exit;0 +tme;cstdlib;exit;0 +zcm;cstdlib;exit;0 +zcz;cstdlib;exit;0 # SVE sve;arm_sve.h;svwhilelt_b64;0,1 @@ -105,13 +173,3 @@ sve2-aes;arm_sve.h;svaesd;svuint8_t(),svuint8_t() sve2-bitperm;arm_sve.h;svbdep;svuint8_t(),svuint8_t() sve2-sha3;arm_sve.h;svrax1;svint64_t(),svint64_t() sve2-sm4;arm_sve.h;svsm4e;svuint32_t(),svuint32_t() - -tme;cstdlib;exit;0 -vfpv3;cstdlib;exit;0 -vfpv3_d16;cstdlib;exit;0 -vfpv3_d16_fp16;cstdlib;exit;0 -vfpv3_fp16;cstdlib;exit;0 -vfpv4;cstdlib;exit;0 -vfpv4_d16;cstdlib;exit;0 -zcm;cstdlib;exit;0 -zcz;cstdlib;exit;0 diff --git a/cmake/ofa/HandleArmOptions.cmake b/cmake/ofa/HandleArmOptions.cmake index b1866b9c89..758160a60d 100644 --- a/cmake/ofa/HandleArmOptions.cmake +++ b/cmake/ofa/HandleArmOptions.cmake @@ -18,7 +18,7 @@ include(CheckIncludeFileCXX) macro(OFA_HandleArmOptions) - # Special treatment for "native" + # Special treatment for "native" flag if(TARGET_ARCHITECTURE STREQUAL "native") if(MSVC) # MSVC (on Windows) @@ -761,9 +761,10 @@ macro(OFA_HandleArmOptions) message(STATUS "[OFA] CPU 
microarchitectures (-mtune): " ${_str}) endif() if(_available_extension_list) + list(LENGTH _available_extension_list _len) string(REPLACE ";" ", " _str "${_available_extension_list}") string(TOUPPER ${_str} _str) - message(STATUS "[OFA] Extensions (available): ${_str}") + message(STATUS "[OFA] Extensions (${_len} available): ${_str}") endif() endif() @@ -782,7 +783,7 @@ macro(OFA_HandleArmOptions) set(_mtune_flag "-mtune") foreach(_flag ${_mtune_flag_list}) - AddCXXCompilerFlag("${_mcpu_flag}${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + AddCXXCompilerFlag("${_mcpu_flag}${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) if(_ok) break() endif() From aa96027a938a2faec376dd5e794d469717142bec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Thu, 3 Feb 2022 20:14:51 +0100 Subject: [PATCH 169/174] [ci skip] fix OFA - macros for AMD --- cmake/ofa/HandleX86Options.cmake | 160 ++++++++++++++++--------------- 1 file changed, 85 insertions(+), 75 deletions(-) diff --git a/cmake/ofa/HandleX86Options.cmake b/cmake/ofa/HandleX86Options.cmake index 0ddd97a45e..05eb13e34b 100644 --- a/cmake/ofa/HandleX86Options.cmake +++ b/cmake/ofa/HandleX86Options.cmake @@ -177,7 +177,66 @@ macro(OFA_HandleX86Options) _goldmont_plus() endmacro() - # TODO: Define similar macros for AMD + # Define macros for AMD + macro(_k8) + list(APPEND _march_flag_list "k8") + list(APPEND _available_extension_list "mmx" "3dnow" "sse" "sse2") + endmacro() + macro(_k8_sse3) + list(APPEND _march_flag_list "k8-sse3") + _k8() + list(APPEND _available_extension_list "sse3") + endmacro() + macro(_barcelona) # amd10h + list(APPEND _march_flag_list "barcelona") + _k8_sse3() + list(APPEND _available_extension_list "sse4a" "abm") + endmacro() + macro(_amd14h) + list(APPEND _march_flag_list "btver1") + _barcelona() + list(APPEND _available_extension_list "cx16" "ssse3") + endmacro() + macro(_bulldozer) # amd15h + list(APPEND _march_flag_list "bdver1") + _amd14h() + list(APPEND 
_available_extension_list "sse4.1" "sse4.2" "avx" "xop" "fma4" "lwp" "aes" "pclmul") + endmacro() + macro(_piledriver) + list(APPEND _march_flag_list "bdver2") + _bulldozer() + list(APPEND _available_extension_list "fma" "f16c" "bmi" "tbm") + endmacro() + macro(_steamroller) + list(APPEND _march_flag_list "bdver3") + _piledriver() + list(APPEND _available_extension_list "fsgsbase") + endmacro() + macro(_excavator) + list(APPEND _march_flag_list "bdver4") + _steamroller() + list(APPEND _available_extension_list "bmi2" "avx2" "movbe") + endmacro() + macro(_amd16h) + list(APPEND _march_flag_list "btver2") + _amd14h() + list(APPEND _available_extension_list "movbe" "sse4.1" "sse4.2" "avx" "f16c" "bmi" "pclmul" "aes") + endmacro() + macro(_zen) + list(APPEND _march_flag_list "znver1") + _amd16h() + list(APPEND _available_extension_list "bmi2" "fma" "fsgsbase" "avx2" "adcx" "rdseed" "mwaitx" "sha" "clzero" "xsavec" "xsaves" "clflushopt" "popcnt") + endmacro() + macro(_zen2) + list(APPEND _march_flag_list "znver2") + _zen() + list(APPEND _available_extension_list "clwb" "rdpid" "wbnoinvd") + endmacro() + macro(_zen3) + list(APPEND _march_flag_list "znver3") + _zen2() + list(APPEND _available_extension_list "pku" "vpclmulqdq" "vaes") + endmacro() # Intel if(TARGET_ARCHITECTURE STREQUAL "core" OR TARGET_ARCHITECTURE STREQUAL "core2") @@ -256,83 +315,34 @@ macro(OFA_HandleX86Options) list(APPEND _march_flag_list "core2") list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3") - # AMD + # AMD elseif(TARGET_ARCHITECTURE STREQUAL "k8") - list(APPEND _march_flag_list "k8") - list(APPEND _available_extension_list "sse" "sse2") + _k8() elseif(TARGET_ARCHITECTURE STREQUAL "k8-sse3") - list(APPEND _march_flag_list "k8-sse3") - list(APPEND _march_flag_list "k8") - list(APPEND _available_extension_list "sse" "sse2" "sse3") - elseif(TARGET_ARCHITECTURE STREQUAL "amd16h") - list(APPEND _march_flag_list "btver2") - list(APPEND _march_flag_list "btver1") - list(APPEND 
_available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "f16c") + _k8_sse3() + elseif(TARGET_ARCHITECTURE STREQUAL "barcelona" OR + TARGET_ARCHITECTURE STREQUAL "istanbul" OR + TARGET_ARCHITECTURE STREQUAL "magny-cours") + _barcelona() elseif(TARGET_ARCHITECTURE STREQUAL "amd14h") - list(APPEND _march_flag_list "btver1") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "zen3") - list(APPEND _march_flag_list "znver2") - list(APPEND _march_flag_list "znver1") - _skylake() - list(APPEND _available_extension_list "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "zen2") - list(APPEND _march_flag_list "znver2") - list(APPEND _march_flag_list "znver1") - _skylake() - list(APPEND _available_extension_list "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "zen") - list(APPEND _march_flag_list "znver1") - _skylake() - list(APPEND _available_extension_list "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "excavator") - list(APPEND _march_flag_list "bdver4") - list(APPEND _march_flag_list "bdver3") - list(APPEND _march_flag_list "bdver2") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "avx2" "xop" "fma4" "fma" "f16c" "bmi" "bmi2" "rdrnd") - elseif(TARGET_ARCHITECTURE STREQUAL "steamroller") - list(APPEND _march_flag_list "bdver3") - list(APPEND _march_flag_list "bdver2") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c" "bmi") + _amd14h() + elseif(TARGET_ARCHITECTURE STREQUAL "bulldozer" OR + TARGET_ARCHITECTURE STREQUAL
"interlagos") + _bulldozer() elseif(TARGET_ARCHITECTURE STREQUAL "piledriver") - list(APPEND _march_flag_list "bdver2") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4" "fma" "f16c" "bmi") - elseif(TARGET_ARCHITECTURE STREQUAL "interlagos") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") - elseif(TARGET_ARCHITECTURE STREQUAL "bulldozer") - list(APPEND _march_flag_list "bdver1") - list(APPEND _march_flag_list "bulldozer") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4") - elseif(TARGET_ARCHITECTURE STREQUAL "barcelona") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "istanbul") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "sse4a") - elseif(TARGET_ARCHITECTURE STREQUAL "magny-cours") - list(APPEND _march_flag_list "barcelona") - list(APPEND _march_flag_list "core2") - list(APPEND _available_extension_list "sse" "sse2" "sse3" "sse4a") + _piledriver() + elseif(TARGET_ARCHITECTURE STREQUAL "steamroller") + _steamroller() + elseif(TARGET_ARCHITECTURE STREQUAL "excavator") + _excavator() + elseif(TARGET_ARCHITECTURE STREQUAL "amd16h") + _amd16h() + elseif(TARGET_ARCHITECTURE STREQUAL "zen") + _zen() + elseif(TARGET_ARCHITECTURE 
STREQUAL "zen2") + _zen2() + elseif(TARGET_ARCHITECTURE STREQUAL "zen3") + _zen3() # Others elseif(TARGET_ARCHITECTURE STREQUAL "generic") @@ -456,7 +466,7 @@ macro(OFA_HandleX86Options) else() set(_extension "${_extension_flag}") endif() - + list(APPEND _check_extension_list "${_extension}") # Define USE_<_extension_flag> variable From 9369705adaa5ca3173ff3b7a84b5e11ef57b856d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Thu, 10 Feb 2022 23:04:55 +0100 Subject: [PATCH 170/174] [ci skip] update OFA --- cmake/ofa/HandleArmOptions.cmake | 47 +++++++++++++++++++++++++------- cmake/ofa/HandleX86Options.cmake | 6 ++-- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/cmake/ofa/HandleArmOptions.cmake b/cmake/ofa/HandleArmOptions.cmake index 758160a60d..e7f23e095a 100644 --- a/cmake/ofa/HandleArmOptions.cmake +++ b/cmake/ofa/HandleArmOptions.cmake @@ -746,6 +746,7 @@ macro(OFA_HandleArmOptions) message(FATAL_ERROR "[OFA] Unknown target architecture: \"${TARGET_ARCHITECTURE}\". 
Please set TARGET_ARCHITECTURE to a supported value.") endif() + # Clean list of available extensions list(SORT _available_extension_list) list(REMOVE_DUPLICATES _available_extension_list) @@ -782,17 +783,18 @@ macro(OFA_HandleArmOptions) set(_march_flag "-march=") set(_mtune_flag "-mtune") - foreach(_flag ${_mtune_flag_list}) - AddCXXCompilerFlag("${_mcpu_flag}${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) - if(_ok) - break() - endif() - endforeach() + # foreach(_flag ${_mtune_flag_list}) + # AddCXXCompilerFlag("${_mcpu_flag}${_flag}" FLAGS OFA_ARCHITECTURE_FLAGS RESULT _ok) + # if(_ok) + # break() + # endif() + # endforeach() if(NOT _ok) # Fallback: set -march and -mtune flags set(_check_extension_list) set(_check_extension_flag_list) + set(_disable_extension_flag_list) set(_enable_extension_flag_list) set(_ignore_extension_flag_list) @@ -885,7 +887,7 @@ macro(OFA_HandleArmOptions) # Define USE_<_extension_flag> variable set(_useVar "USE_${_extension_flag}") string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "." "_" _useVar "${_useVar}") + string(REGEX REPLACE "[-.+/:= ]" "_" _useVar "${_useVar}") # If not specified externally, set the value of the # USE_<_extension_flag> variable to TRUE if it is found in the list @@ -924,7 +926,7 @@ macro(OFA_HandleArmOptions) _ofa_find(_available_extension_list "${_extension_flag}" _found) set(_useVar "USE_${_extension_flag}") string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "."
"_" _useVar "${_useVar}") + string(REGEX REPLACE "[-.+/:= ]" "_" _useVar "${_useVar}") if(${_useVar}) # Add <_extension_flag> to list of enabled extensions (if supported) @@ -939,7 +941,18 @@ macro(OFA_HandleArmOptions) endif() list(APPEND _enable_extension_flag_list "${_extension_flag}") else() - list(APPEND _ignore_extension_flag_list "${_extension_flag}") + # Add <_extension_flag> to list of disabled extensions (if supported) + AddCXXCompilerFlag("${_march_flag}${_march}+no${_extension_flag}") + set(_haveVar "HAVE_${_march_flag}${_march}+no${_extension_flag}") + string(REGEX REPLACE "[-.+/:= ]" "_" _haveVar "${_haveVar}") + if(NOT ${_haveVar}) + if(OFA_VERBOSE) + message(STATUS "[OFA] Ignoring flag ${_march_flag}${_march}+no${_extension_flag} because checks failed") + endif() + list(APPEND _ignore_extension_flag_list "${_extension_flag}") + continue() + endif() + list(APPEND _disable_extension_flag_list "${_extension_flag}") endif() endforeach() @@ -960,6 +973,14 @@ macro(OFA_HandleArmOptions) string(TOUPPER ${_str} _str) message(STATUS "[OFA] Extensions (${_len} enabled): ${_str}") endif() + # Print disabled extension flags + if(_disable_extension_flag_list) + list(LENGTH _disable_extension_flag_list _len) + list(SORT _disable_extension_flag_list) + string(REPLACE ";" ", " _str "${_disable_extension_flag_list}") + string(TOUPPER ${_str} _str) + message(STATUS "[OFA] Extensions (${_len} disabled): ${_str}") + endif() # Print ignored extension flags if(_ignore_extension_flag_list) list(LENGTH _ignore_extension_flag_list _len) @@ -999,7 +1020,7 @@ macro(OFA_HandleArmOptions) endif() foreach(_flag ${_enable_extension_flag_list}) string(TOUPPER "${_flag}" _flag) - string(REPLACE "."
"_" _flag "__${_flag}__") + string(REGEX REPLACE "[-.+/:= ]" "_" _flag "__${_flag}__") add_definitions("-D${_flag}") endforeach(_flag) @@ -1027,6 +1048,12 @@ macro(OFA_HandleArmOptions) set(_march_plus_extensions "${_march_plus_extensions}+${_flag}") endif(_ok) endforeach() + foreach(_flag ${_disable_extension_flag_list}) + AddCXXCompilerFlag("-march=${_march_plus_extensions}+no${_flag}" RESULT _ok) + if(_ok) + set(_march_plus_extensions "${_march_plus_extensions}+no${_flag}") + endif(_ok) + endforeach() AddCXXCompilerFlag("-march=${_march_plus_extensions}" FLAGS OFA_ARCHITECTURE_FLAGS) break() endif() diff --git a/cmake/ofa/HandleX86Options.cmake b/cmake/ofa/HandleX86Options.cmake index 05eb13e34b..d0c875e249 100644 --- a/cmake/ofa/HandleX86Options.cmake +++ b/cmake/ofa/HandleX86Options.cmake @@ -472,7 +472,7 @@ macro(OFA_HandleX86Options) # Define USE_<_extension_flag> variable set(_useVar "USE_${_extension_flag}") string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "." "_" _useVar "${_useVar}") + string(REGEX REPLACE "[-.+/:= ]" "_" _useVar "${_useVar}") # If not specified externally, set the value of the # USE_<_extension_flag> variable to TRUE if it is found in the list @@ -511,7 +511,7 @@ macro(OFA_HandleX86Options) _ofa_find(_available_extension_list "${_extension_flag}" _found) set(_useVar "USE_${_extension_flag}") string(TOUPPER "${_useVar}" _useVar) - string(REPLACE "." "_" _useVar "${_useVar}") + string(REGEX REPLACE "[-.+/:= ]" "_" _useVar "${_useVar}") if(${_useVar}) # Add <_extension_flag> to list of enabled extensions (if supported) @@ -625,7 +625,7 @@ macro(OFA_HandleX86Options) endif() foreach(_extension ${_enable_extension_flag_list}) string(TOUPPER "${_extension}" _extension) - string(REPLACE "."
"_" _extension "__${_extension}__") + string(REGEX REPLACE "[-.+/:= ]" "_" _extension "__${_extension}__") add_definitions("-D${_extension}") endforeach(_extension) From 6fc133bcb7b0677a2b019ceef9ed465a67edf252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 17 May 2022 07:43:42 +0200 Subject: [PATCH 171/174] Fixed ambigious naming of enums --- src/gsParallel/gsOpenMP.cpp | 40 ++++++++++++++++++------------------- src/gsParallel/gsOpenMP.h | 4 ++-- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/gsParallel/gsOpenMP.cpp b/src/gsParallel/gsOpenMP.cpp index ac8ff244aa..192b2a0873 100644 --- a/src/gsParallel/gsOpenMP.cpp +++ b/src/gsParallel/gsOpenMP.cpp @@ -189,7 +189,7 @@ int omp_get_max_task_priority(void) void omp_init_lock(omp_lock_t *arg) { - arg->lock = UNLOCKED; + arg->lock = OMP_UNLOCKED; } void omp_init_lock_with_hint(omp_lock_t *arg, omp_lock_hint_t hint) @@ -199,16 +199,16 @@ void omp_init_lock_with_hint(omp_lock_t *arg, omp_lock_hint_t hint) void omp_destroy_lock(omp_lock_t *arg) { - arg->lock = INIT; + arg->lock = OMP_INIT; } void omp_set_lock(omp_lock_t *arg) { - if (arg->lock == UNLOCKED) + if (arg->lock == OMP_UNLOCKED) { - arg->lock = LOCKED; + arg->lock = OMP_LOCKED; } - else if (arg->lock == LOCKED) + else if (arg->lock == OMP_LOCKED) { fprintf(stderr, "error: deadlock in using lock variable\n"); exit(1); @@ -221,11 +221,11 @@ void omp_set_lock(omp_lock_t *arg) void omp_unset_lock(omp_lock_t *arg) { - if (arg->lock == LOCKED) + if (arg->lock == OMP_LOCKED) { - arg->lock = UNLOCKED; + arg->lock = OMP_UNLOCKED; } - else if (arg->lock == UNLOCKED) + else if (arg->lock == OMP_UNLOCKED) { fprintf(stderr, "error: lock not set\n"); exit(1); @@ -239,12 +239,12 @@ void omp_unset_lock(omp_lock_t *arg) int omp_test_lock(omp_lock_t *arg) { - if (arg->lock == UNLOCKED) + if (arg->lock == OMP_UNLOCKED) { - arg->lock = LOCKED; + arg->lock = OMP_LOCKED; return 1; } - else if (arg->lock == LOCKED) + else if (arg->lock ==
OMP_LOCKED) { return 0; } @@ -256,7 +256,7 @@ int omp_test_lock(omp_lock_t *arg) void omp_init_nest_lock(omp_nest_lock_t *arg) { - arg->owner = NOOWNER; + arg->owner = OMP_NOOWNER; arg->count = 0; } @@ -268,19 +268,19 @@ void omp_init_nest_lock_with_hint(omp_nest_lock_t *arg, void omp_destroy_nest_lock(omp_nest_lock_t *arg) { - arg->owner = NOOWNER; - arg->count = UNLOCKED; + arg->owner = OMP_NOOWNER; + arg->count = OMP_UNLOCKED; } void omp_set_nest_lock(omp_nest_lock_t *arg) { - if (arg->owner == MASTER && arg->count >= 1) + if (arg->owner == OMP_MASTER && arg->count >= 1) { arg->count++; } - else if (arg->owner == NOOWNER && arg->count == 0) + else if (arg->owner == OMP_NOOWNER && arg->count == 0) { - arg->owner = MASTER; + arg->owner = OMP_MASTER; arg->count = 1; } else @@ -292,15 +292,15 @@ void omp_set_nest_lock(omp_nest_lock_t *arg) void omp_unset_nest_lock(omp_nest_lock_t *arg) { - if (arg->owner == MASTER && arg->count >= 1) + if (arg->owner == OMP_MASTER && arg->count >= 1) { arg->count--; if (arg->count == 0) { - arg->owner = NOOWNER; + arg->owner = OMP_NOOWNER; } } - else if (arg->owner == NOOWNER && arg->count == 0) + else if (arg->owner == OMP_NOOWNER && arg->count == 0) { fprintf(stderr, "error: lock not set\n"); exit(1); diff --git a/src/gsParallel/gsOpenMP.h b/src/gsParallel/gsOpenMP.h index d36f5bca87..69a48911b8 100644 --- a/src/gsParallel/gsOpenMP.h +++ b/src/gsParallel/gsOpenMP.h @@ -170,7 +170,7 @@ typedef struct omp_lock_t { int lock; } omp_lock_t; -enum { UNLOCKED = -1, INIT, LOCKED }; +enum { OMP_UNLOCKED = -1, OMP_INIT, OMP_LOCKED }; void GISMO_EXPORT omp_init_lock(omp_lock_t *arg); @@ -207,7 +207,7 @@ typedef struct omp_nest_lock_t { int count; } omp_nest_lock_t; -enum { NOOWNER = -1, MASTER = 0 }; +enum { OMP_NOOWNER = -1, OMP_MASTER = 0 }; void GISMO_EXPORT omp_init_nest_lock(omp_nest_lock_t *arg); From 1b7231aae0a357213286117cf8186afe1f0d56f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 17 May 2022 
07:44:15 +0200 Subject: [PATCH 172/174] Changed some attributes from protected to public as they are needed by the p-multigrid solver --- .../Eigen/src/IterativeLinearSolvers/IncompleteLUT.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h index a5e4383df7..155b3f5900 100644 --- a/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +++ b/external/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h @@ -197,14 +197,20 @@ class IncompleteLUT : public SparseSolverBase m_P; // Fill-reducing permutation PermutationMatrix m_Pinv; // Inverse permutation }; From c5600d4c59469649f6db0102b7183e0126f0f911 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 17 May 2022 07:44:54 +0200 Subject: [PATCH 173/174] Migrated the XBraid p-multigrid solver to the new assembler --- .../gsXBraid/examples/gsXBraidMultigrid.h | 46 +++++++++++++------ .../examples/xbraid_heatEquation_example.cpp | 17 ++++--- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/extensions/gsXBraid/examples/gsXBraidMultigrid.h b/extensions/gsXBraid/examples/gsXBraidMultigrid.h index aec574d164..a87d169592 100644 --- a/extensions/gsXBraid/examples/gsXBraidMultigrid.h +++ b/extensions/gsXBraid/examples/gsXBraidMultigrid.h @@ -309,7 +309,8 @@ namespace gismo { w_n.setInterfaceCont(0); if (typeBCHandling == 1) { - w_n.addBc(bcInfo.get("Dirichlet")); + w_n.setup(bcInfo, dirichlet::l2Projection, 0); + //#w_n.addBc(bcInfo.get("Dirichlet")); } ex2.setIntegrationElements(basesH); ex2.initSystem(); @@ -369,7 +370,8 @@ namespace gismo { w_n.setInterfaceCont(0); if (typeBCHandling == 1) { - w_n.addBc(bcInfo.get("Dirichlet")); + w_n.setup(bcInfo, dirichlet::l2Projection, 0); + //#w_n.addBc(bcInfo.get("Dirichlet")); } ex2.setIntegrationElements(basesL); ex2.initSystem(); @@ -543,12 +545,14 @@ namespace gismo { space u_M = 
M.getSpace(*m_bases[i]); u_K.setInterfaceCont(0); u_M.setInterfaceCont(0); - u_K.addBc( m_bcInfo_ptr->get("Dirichlet") ); - u_M.addBc( m_bcInfo_ptr->get("Dirichlet") ); + u_K.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + u_M.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + //#u_K.addBc( m_bcInfo_ptr->get("Dirichlet") ); + //#u_M.addBc( m_bcInfo_ptr->get("Dirichlet") ); // Set the source term - variable ff_K = K.getCoeff(rhs, G_K); - variable ff_M = M.getCoeff(rhs, G_M); + auto ff_K = K.getCoeff(rhs, G_K); + auto ff_M = M.getCoeff(rhs, G_M); // Initialize and assemble the system matrix K.initSystem(); @@ -569,6 +573,7 @@ namespace gismo { } real_t Time_Assembly = clock.stop(); + GISMO_UNUSED(Time_Assembly); // Resize vector of operators @@ -607,6 +612,7 @@ namespace gismo { } } real_t Time_Transfer = clock.stop(); + GISMO_UNUSED(Time_Transfer); // Obtain operators with Galerkin projection (TO DO) clock.restart(); @@ -631,7 +637,7 @@ namespace gismo { } } real_t Time_Assembly_Galerkin = clock.stop(); - + GISMO_UNUSED(Time_Assembly_Galerkin); // Setting up the subspace corrected mass smoother clock.restart(); @@ -647,6 +653,7 @@ namespace gismo { } } real_t Time_SCMS = clock.stop(); + GISMO_UNUSED(Time_SCMS); // Determine ILUT factorizations at each level clock.restart(); @@ -686,7 +693,9 @@ namespace gismo { } } } - real_t Time_ILUT_Factorization = clock.stop(); + real_t Time_ILUT_Factorization = clock.stop(); + GISMO_UNUSED(Time_ILUT_Factorization); + clock.restart(); if (Base::typeSmoother == 5) { @@ -807,6 +816,8 @@ namespace gismo { } real_t Time_Block_ILUT_Factorization = clock.stop(); + GISMO_UNUSED(Time_Block_ILUT_Factorization); + // gsInfo << "\n|| Setup Timings || " <get("Dirichlet")); + w_n.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + //#w_n.addBc(m_bcInfo_ptr->get("Dirichlet")); } ex2.setIntegrationElements(basesH); ex2.initSystem(); @@ -922,8 +935,10 @@ namespace gismo { u_n.setInterfaceCont(0); if (Base::typeBCHandling == 1) { - 
v_n.addBc(m_bcInfo_ptr->get("Dirichlet")); - u_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + v_n.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + u_n.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + //#v_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + //#u_n.addBc(m_bcInfo_ptr->get("Dirichlet")); } ex.setIntegrationElements(basesH); ex.initSystem(); @@ -950,7 +965,8 @@ namespace gismo { w_n.setInterfaceCont(0); if (Base::typeBCHandling == 1) { - w_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + w_n.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + //#w_n.addBc(m_bcInfo_ptr->get("Dirichlet")); } ex2.setIntegrationElements(basesL); ex2.initSystem(); @@ -979,8 +995,10 @@ namespace gismo { u_n.setInterfaceCont(0); if (Base::typeBCHandling == 1) { - u_n.addBc(m_bcInfo_ptr->get("Dirichlet")); - v_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + u_n.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + v_n.setup(*m_bcInfo_ptr, dirichlet::l2Projection, 0); + //#u_n.addBc(m_bcInfo_ptr->get("Dirichlet")); + //#v_n.addBc(m_bcInfo_ptr->get("Dirichlet")); } ex.setIntegrationElements(basesH); ex.initSystem(); diff --git a/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp index b19a14255d..bb60bc8900 100644 --- a/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp +++ b/extensions/gsXBraid/examples/xbraid_heatEquation_example.cpp @@ -213,12 +213,16 @@ class gsXBraid_app : public gsXBraid< gsMatrix > space u_M = M.getSpace(basesH); u_K.setInterfaceCont(0); u_M.setInterfaceCont(0); - u_K.addBc( bc.get("Dirichlet") ); - u_M.addBc( bc.get("Dirichlet") ); + + bc.setGeoMap(mp); + u_K.setup(bc, dirichlet::l2Projection, 0); + u_M.setup(bc, dirichlet::l2Projection, 0); + //#u_K.addBc( bc.get("Dirichlet") ); + //#u_M.addBc( bc.get("Dirichlet") ); // Set the source term - variable ff_K = K.getCoeff(f, G_K); - variable ff_M = M.getCoeff(f, G_M); + auto ff_K = K.getCoeff(f, G_K); + auto ff_M = M.getCoeff(f, G_M); // 
Initialize and assemble the system matrix K.initSystem(); @@ -230,7 +234,8 @@ class gsXBraid_app : public gsXBraid< gsMatrix > // Enforce Neumann conditions to right-hand side variable g_Neumann = K.getBdrFunction(); - K.assembleRhsBc(u_K * g_Neumann.val() * nv(G_K).norm(), bc.neumannSides() ); + K.assembleBdr(bc.get("Neumann"), u_K * g_Neumann.val() * nv(G_K).norm() ); + //#K.assembleRhsBc(u_K * g_Neumann.val() * nv(G_K).norm(), bc.neumannSides() ); // Determine MGRIT levels a priori int numMGRITLevels = 1; @@ -276,7 +281,7 @@ class gsXBraid_app : public gsXBraid< gsMatrix > gsStopwatch clock; clock.restart(); - sol.setZero(M.numDofs()); + sol.setZero(M.numDofs(),1); switch((gsXBraid_typeMethod)typeMethod) { case gsXBraid_typeMethod::FE_FE: From 4c7898b742feda69d72ec88865148e3f438b2225 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20M=C3=B6ller?= Date: Tue, 17 May 2022 07:46:21 +0200 Subject: [PATCH 174/174] Updated configuration of the XBraid p-multigrid example --- .../filedata/pde/heat2d_square_ibvp1.xml | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml index ff89b61e9c..f1de030b22 100644 --- a/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml +++ b/extensions/gsXBraid/filedata/pde/heat2d_square_ibvp1.xml @@ -98,17 +98,33 @@ - - - + + + + + 0.00000 0.00000 1.00000 1.00000 + + + 0.00000 0.00000 1.00000 1.00000 + + + 0 0 1 0 0 1 1 1 + + +