From aa1deeab86b70b75f9940b203c3226a39e60fa84 Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Thu, 23 Jan 2020 18:44:40 -0500 Subject: [PATCH 01/12] Replace GPL CRC with BSD CRC (https://github.com/zlib-ng/zlib-ng/issues/42), for validation see https://github.com/neurolabusc/simd_crc --- CMakeLists.txt | 12 ++ Makefile.in | 12 -- contrib/amd64/crc32-pclmul_asm.S | 266 ------------------------------- crc32.c | 167 ++++++++++++++++++- 4 files changed, 177 insertions(+), 280 deletions(-) delete mode 100644 contrib/amd64/crc32-pclmul_asm.S diff --git a/CMakeLists.txt b/CMakeLists.txt index 74f16925b..6bfd38c5a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,18 @@ else() endif() endif() +CHECK_C_COMPILER_FLAG(-mpclmul COMPILER_HAS_M_PCLMUL) +if (COMPILER_HAS_M_PCLMUL) + message( STATUS "compiler has flag pclmul") + add_definitions(-DHAS_PCLMUL) + # set(CMAKE_C_FLAGS ${CMAKE_C_FLAGS} "-mpclmul") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul") + +else() + message( STATUS "compiler does not have pclmul") +endif() + + if(MSVC) set(CMAKE_DEBUG_POSTFIX "d") add_definitions(-D_CRT_SECURE_NO_DEPRECATE) diff --git a/Makefile.in b/Makefile.in index 23c09dd43..85e2153d6 100644 --- a/Makefile.in +++ b/Makefile.in @@ -122,18 +122,6 @@ cover: infcover ./infcover gcov inf*.c -ifneq ($(findstring -DHAS_PCLMUL, $(CFLAGS)),) -OBJA += crc32-pclmul_asm.o -crc32-pclmul_asm.o : contrib/amd64/crc32-pclmul_asm.S - $(CC) $(CFLAGS) -c $< -o $@ -endif - -ifneq ($(findstring -DHAS_PCLMUL, $(SFLAGS)),) -PIC_OBJA += crc32-pclmul_asm.lo -crc32-pclmul_asm.lo : contrib/amd64/crc32-pclmul_asm.S - $(CC) $(SFLAGS) -c $< -o $@ -endif - libz.a: $(OBJS) $(AR) $(ARFLAGS) $@ $(OBJS) -@ ($(RANLIB) $@ || true) >/dev/null 2>&1 diff --git a/contrib/amd64/crc32-pclmul_asm.S b/contrib/amd64/crc32-pclmul_asm.S deleted file mode 100644 index d9ba1c1bd..000000000 --- a/contrib/amd64/crc32-pclmul_asm.S +++ /dev/null @@ -1,266 +0,0 @@ -/* GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES 
OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see http://www.gnu.org/licenses - * - * Please visit http://www.xyratex.com/contact if you need additional - * information or have any questions. - * - * GPL HEADER END - */ - -/* - * Copyright 2012 Xyratex Technology Limited - * - * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 - * calculation. - * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) - * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found - * at: - * http://www.intel.com/products/processor/manuals/ - * Intel(R) 64 and IA-32 Architectures Software Developer's Manual - * Volume 2B: Instruction Set Reference, N-Z - * - * Authors: Gregory Prestas - * Alexander Boyko - */ - -/* This file is "stolen" from linux kernel 3.14 with following minor changes to - * make it self-contained: - * - remove all header files it includes - * - define ENTRY and ENDPROC macros - * - prepend '$' to some immediate operands to make assembler happy. 
- */ - -#ifdef __APPLE__ -#define ENTRY(name) \ -.private_extern _ ## name; \ -_ ## name: - -#define ENDPROC(name) -#else -#define ENTRY(name) \ -.globl name; \ -.hidden name; \ -.type name, @function; \ -name: - -#define ENDPROC(name) \ -.size name, .-name -#endif - -.align 16 -/* - * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 - * #define CONSTANT_R1 0x154442bd4LL - * - * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 - * #define CONSTANT_R2 0x1c6e41596LL - */ -.Lconstant_R2R1: - .octa 0x00000001c6e415960000000154442bd4 -/* - * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 - * #define CONSTANT_R3 0x1751997d0LL - * - * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e - * #define CONSTANT_R4 0x0ccaa009eLL - */ -.Lconstant_R4R3: - .octa 0x00000000ccaa009e00000001751997d0 -/* - * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 - * #define CONSTANT_R5 0x163cd6124LL - */ -.Lconstant_R5: - .octa 0x00000000000000000000000163cd6124 -.Lconstant_mask32: - .octa 0x000000000000000000000000FFFFFFFF -/* - * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL - * - * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL - * #define CONSTANT_RU 0x1F7011641LL - */ -.Lconstant_RUpoly: - .octa 0x00000001F701164100000001DB710641 - -#define CONSTANT %xmm0 - -#ifdef __x86_64__ -#define BUF %rdi -#define LEN %rsi -#define CRC %edx -#else -#define BUF %eax -#define LEN %edx -#define CRC %ecx -#endif - - - -.text -/** - * Calculate crc32 - * BUF - buffer (16 bytes aligned) - * LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63 - * CRC - initial crc32 - * return %eax crc32 - * uint crc32_pclmul_le_16(unsigned char const *buffer, - * size_t len, uint crc32) - */ - -ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ - movdqa (BUF), %xmm1 - movdqa 0x10(BUF), %xmm2 - movdqa 0x20(BUF), %xmm3 - movdqa 0x30(BUF), %xmm4 - movd CRC, CONSTANT - pxor CONSTANT, %xmm1 - sub $0x40, LEN - add $0x40, BUF -#ifndef __x86_64__ - /* This is for position 
independent code(-fPIC) support for 32bit */ - call delta -delta: - pop %ecx -#endif - cmp $0x40, LEN - jb less_64 - -#ifdef __x86_64__ - movdqa .Lconstant_R2R1(%rip), CONSTANT -#else - movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT -#endif - -loop_64:/* 64 bytes Full cache line folding */ - prefetchnta 0x40(BUF) - movdqa %xmm1, %xmm5 - movdqa %xmm2, %xmm6 - movdqa %xmm3, %xmm7 -#ifdef __x86_64__ - movdqa %xmm4, %xmm8 -#endif - PCLMULQDQ $00, CONSTANT, %xmm1 - PCLMULQDQ $00, CONSTANT, %xmm2 - PCLMULQDQ $00, CONSTANT, %xmm3 -#ifdef __x86_64__ - PCLMULQDQ $00, CONSTANT, %xmm4 -#endif - PCLMULQDQ $0x11, CONSTANT, %xmm5 - PCLMULQDQ $0x11, CONSTANT, %xmm6 - PCLMULQDQ $0x11, CONSTANT, %xmm7 -#ifdef __x86_64__ - PCLMULQDQ $0x11, CONSTANT, %xmm8 -#endif - pxor %xmm5, %xmm1 - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 -#ifdef __x86_64__ - pxor %xmm8, %xmm4 -#else - /* xmm8 unsupported for x32 */ - movdqa %xmm4, %xmm5 - PCLMULQDQ $00, CONSTANT, %xmm4 - PCLMULQDQ $0x11, CONSTANT, %xmm5 - pxor %xmm5, %xmm4 -#endif - - pxor (BUF), %xmm1 - pxor 0x10(BUF), %xmm2 - pxor 0x20(BUF), %xmm3 - pxor 0x30(BUF), %xmm4 - - sub $0x40, LEN - add $0x40, BUF - cmp $0x40, LEN - jge loop_64 -less_64:/* Folding cache line into 128bit */ -#ifdef __x86_64__ - movdqa .Lconstant_R4R3(%rip), CONSTANT -#else - movdqa .Lconstant_R4R3 - delta(%ecx), CONSTANT -#endif - prefetchnta (BUF) - - movdqa %xmm1, %xmm5 - PCLMULQDQ $0x00, CONSTANT, %xmm1 - PCLMULQDQ $0x11, CONSTANT, %xmm5 - pxor %xmm5, %xmm1 - pxor %xmm2, %xmm1 - - movdqa %xmm1, %xmm5 - PCLMULQDQ $0x00, CONSTANT, %xmm1 - PCLMULQDQ $0x11, CONSTANT, %xmm5 - pxor %xmm5, %xmm1 - pxor %xmm3, %xmm1 - - movdqa %xmm1, %xmm5 - PCLMULQDQ $0x00, CONSTANT, %xmm1 - PCLMULQDQ $0x11, CONSTANT, %xmm5 - pxor %xmm5, %xmm1 - pxor %xmm4, %xmm1 - - cmp $0x10, LEN - jb fold_64 -loop_16:/* Folding rest buffer into 128bit */ - movdqa %xmm1, %xmm5 - PCLMULQDQ $0x00, CONSTANT, %xmm1 - PCLMULQDQ $0x11, CONSTANT, %xmm5 - pxor %xmm5, %xmm1 - pxor (BUF), %xmm1 - sub $0x10, LEN - add 
$0x10, BUF - cmp $0x10, LEN - jge loop_16 - -fold_64: - /* perform the last 64 bit fold, also adds 32 zeroes - * to the input stream */ - PCLMULQDQ $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ - psrldq $0x08, %xmm1 - pxor CONSTANT, %xmm1 - - /* final 32-bit fold */ - movdqa %xmm1, %xmm2 -#ifdef __x86_64__ - movdqa .Lconstant_R5(%rip), CONSTANT - movdqa .Lconstant_mask32(%rip), %xmm3 -#else - movdqa .Lconstant_R5 - delta(%ecx), CONSTANT - movdqa .Lconstant_mask32 - delta(%ecx), %xmm3 -#endif - psrldq $0x04, %xmm2 - pand %xmm3, %xmm1 - PCLMULQDQ $0x00, CONSTANT, %xmm1 - pxor %xmm2, %xmm1 - - /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ -#ifdef __x86_64__ - movdqa .Lconstant_RUpoly(%rip), CONSTANT -#else - movdqa .Lconstant_RUpoly - delta(%ecx), CONSTANT -#endif - movdqa %xmm1, %xmm2 - pand %xmm3, %xmm1 - PCLMULQDQ $0x10, CONSTANT, %xmm1 - pand %xmm3, %xmm1 - PCLMULQDQ $0x00, CONSTANT, %xmm1 - pxor %xmm2, %xmm1 - PEXTRD $0x01, %xmm1, %eax - - ret -ENDPROC(crc32_pclmul_le_16) diff --git a/crc32.c b/crc32.c index 71487873d..d3c7ec3d6 100644 --- a/crc32.c +++ b/crc32.c @@ -21,6 +21,12 @@ DYNAMIC_CRC_TABLE and MAKECRCH can be #defined to write out crc32.h. */ +#ifdef HAS_PCLMUL + #include + #include + #include +#endif + #ifdef __aarch64__ #include @@ -268,8 +274,164 @@ local unsigned long crc32_generic(crc, buf, len) return crc ^ 0xffffffffUL; } + #ifdef HAS_PCLMUL +//https://github.com/webosose/chromium68/blob/master/src/third_party/zlib/crc32_simd.c +/* crc32_simd.c + * + * Copyright 2017 The Chromium Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the Chromium source repository LICENSE file. + */ + // Copyright 2015 The Chromium Authors. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + /* + * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer + * length must be at least 64, and a multiple of 16. Based on: + * + * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + * V. Gopal, E. 
Ozturk, et al., 2009, http://intel.ly/2ySEwL0 + */ + +#ifdef _MSC_VER +#define zalign(x) __declspec(align(x)) +#else +#define zalign(x) __attribute__((aligned((x)))) +#endif + +uint crc32_simd(unsigned char const *buf, size_t len, uInt crc) { + /* + * Definitions of the bit-reflected domain constants k1,k2,k3, etc and + * the CRC32+Barrett polynomials given at the end of the paper. + */ + static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 }; + static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e }; + static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 }; + static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; + __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; + /* + * There's at least one block of 64. + */ + x1 = _mm_loadu_si128((__m128i *)(buf + 0x00)); + x2 = _mm_loadu_si128((__m128i *)(buf + 0x10)); + x3 = _mm_loadu_si128((__m128i *)(buf + 0x20)); + x4 = _mm_loadu_si128((__m128i *)(buf + 0x30)); + x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc)); + x0 = _mm_load_si128((__m128i *)k1k2); + buf += 64; + len -= 64; + /* + * Parallel fold blocks of 64, if any. 
+ */ + while (len >= 64) + { + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x6 = _mm_clmulepi64_si128(x2, x0, 0x00); + x7 = _mm_clmulepi64_si128(x3, x0, 0x00); + x8 = _mm_clmulepi64_si128(x4, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x2 = _mm_clmulepi64_si128(x2, x0, 0x11); + x3 = _mm_clmulepi64_si128(x3, x0, 0x11); + x4 = _mm_clmulepi64_si128(x4, x0, 0x11); + y5 = _mm_loadu_si128((__m128i *)(buf + 0x00)); + y6 = _mm_loadu_si128((__m128i *)(buf + 0x10)); + y7 = _mm_loadu_si128((__m128i *)(buf + 0x20)); + y8 = _mm_loadu_si128((__m128i *)(buf + 0x30)); + x1 = _mm_xor_si128(x1, x5); + x2 = _mm_xor_si128(x2, x6); + x3 = _mm_xor_si128(x3, x7); + x4 = _mm_xor_si128(x4, x8); + x1 = _mm_xor_si128(x1, y5); + x2 = _mm_xor_si128(x2, y6); + x3 = _mm_xor_si128(x3, y7); + x4 = _mm_xor_si128(x4, y8); + buf += 64; + len -= 64; + } + /* + * Fold into 128-bits. + */ + x0 = _mm_load_si128((__m128i *)k3k4); + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x2); + x1 = _mm_xor_si128(x1, x5); + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x3); + x1 = _mm_xor_si128(x1, x5); + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x4); + x1 = _mm_xor_si128(x1, x5); + /* + * Single fold blocks of 16, if any. + */ + while (len >= 16) + { + x2 = _mm_loadu_si128((__m128i *)buf); + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x2); + x1 = _mm_xor_si128(x1, x5); + buf += 16; + len -= 16; + } + /* + * Fold 128-bits to 64-bits. 
+ */ + x2 = _mm_clmulepi64_si128(x1, x0, 0x10); + x3 = _mm_setr_epi32(~0, 0, ~0, 0); + x1 = _mm_srli_si128(x1, 8); + x1 = _mm_xor_si128(x1, x2); + x0 = _mm_loadl_epi64((__m128i*)k5k0); + x2 = _mm_srli_si128(x1, 4); + x1 = _mm_and_si128(x1, x3); + x1 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_xor_si128(x1, x2); + /* + * Barret reduce to 32-bits. + */ + x0 = _mm_load_si128((__m128i*)poly); + x2 = _mm_and_si128(x1, x3); + x2 = _mm_clmulepi64_si128(x2, x0, 0x10); + x2 = _mm_and_si128(x2, x3); + x2 = _mm_clmulepi64_si128(x2, x0, 0x00); + x1 = _mm_xor_si128(x1, x2); + /* + * Return the crc32. + */ + return _mm_extract_epi32(x1, 1); + +} + #define PCLMUL_MIN_LEN 64 #define PCLMUL_ALIGN 16 #define PCLMUL_ALIGN_MASK 15 @@ -299,8 +461,9 @@ uLong crc32(crc, buf, len) } /* Go over 16-byte chunks */ - crc = crc32_pclmul_le_16(buf, (len & ~PCLMUL_ALIGN_MASK), - crc ^ 0xffffffffUL); + //crc = crc32_pclmul_le_16(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); + crc = crc32_simd(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); + crc = crc ^ 0xffffffffUL; /* Handle the trailing partial chunk */ From 1f5a420b07d2dba6dc71a1663d8f47563b46081c Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Fri, 24 Jan 2020 19:15:04 -0500 Subject: [PATCH 02/12] Westmere detection --- CMakeLists.txt | 44 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bfd38c5a..ee5ac0d23 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,13 +72,40 @@ else() endif() endif() +# Macro from on zlib-ng +# Macro to check if source compiles when cross-compiling +# or runs when compiling natively +include(CheckCSourceRuns) +macro(check_c_source_compile_or_run source flag) + if(CMAKE_CROSSCOMPILING) + check_c_source_compiles("${source}" ${flag}) + else() + check_c_source_runs("${source}" ${flag}) + endif() +endmacro() + CHECK_C_COMPILER_FLAG(-mpclmul COMPILER_HAS_M_PCLMUL) if (COMPILER_HAS_M_PCLMUL) - message( 
STATUS "compiler has flag pclmul") - add_definitions(-DHAS_PCLMUL) - # set(CMAKE_C_FLAGS ${CMAKE_C_FLAGS} "-mpclmul") + message( STATUS "compiler supports pclmul") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul") - + # check adapted from zlib-ng: run natively, compile-only when cross-compiling + # note zlib-ng does much more thorough test of architecture + check_c_source_compile_or_run( + "#include <immintrin.h> + int main(void) + { + __m128i a = _mm_setzero_si128(); + __m128i b = _mm_setzero_si128(); + __m128i c = _mm_clmulepi64_si128(a, b, 0x10); + (void)c; + return 0; + }" + HAVE_PCLMULQDQ_INTRIN + ) + if(HAVE_PCLMULQDQ_INTRIN) + add_definitions(-DHAS_PCLMUL) + endif() + # set(CMAKE_C_FLAGS ${CMAKE_C_FLAGS} "-mpclmul") else() message( STATUS "compiler does not have pclmul") endif() @@ -280,3 +307,12 @@ endif() if(CMAKE_SIZEOF_VOID_P EQUAL 8 AND MSVC) set_target_properties(zlibstatic PROPERTIES STATIC_LIBRARY_FLAGS "/machine:x64") endif() + +#optional: show flags specified +MESSAGE(STATUS "Build type: " ${CMAKE_BUILD_TYPE}) +MESSAGE(STATUS "Library Type: " ${LIB_TYPE}) +MESSAGE(STATUS "Compiler flags:" ${CMAKE_C_COMPILE_FLAGS}) +MESSAGE(STATUS "Compiler c debug flags:" ${CMAKE_C_FLAGS_DEBUG}) +MESSAGE(STATUS "Compiler c release flags:" ${CMAKE_C_FLAGS_RELEASE}) +MESSAGE(STATUS "Compiler c min size flags:" ${CMAKE_C_FLAGS_MINSIZEREL}) +MESSAGE(STATUS "Compiler c flags:" ${CMAKE_C_FLAGS}) From 2937f89d47cd617dfa2ef394209af3dc2da3cadc Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Fri, 24 Jan 2020 22:45:43 -0500 Subject: [PATCH 03/12] Update configure for Westmere (https://github.com/InsightSoftwareConsortium/ITK/issues/416) --- configure | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/configure b/configure index aa01f7f33..aa3859efa 100755 --- a/configure +++ b/configure @@ -762,15 +762,19 @@ EOF echo "CRC and SSE4.2 support is required" | tee -a configure.log leave 1 fi - - # Check for PCLMUL support +#Project copied from zlib-ng: +# Check for PCLMUL support cat > $test.c << EOF #include -void foo(void) { -
_mm_clmulepi64_si128(_mm_set1_epi16(1), _mm_set1_epi16(2), 0); +#include +int main(void) { + __m128i a = _mm_setzero_si128(); + __m128i b = _mm_setzero_si128(); + __m128i c = _mm_clmulepi64_si128(a, b, 0x10); + (void)c; + return 0; } EOF - if try $CC -c -mpclmul $CFLAGS $test.c ; then CFLAGS="-DHAS_PCLMUL -mpclmul $CFLAGS" SFLAGS="-DHAS_PCLMUL -mpclmul $SFLAGS" From dd3e92408dba71b6c156a5ab9af505d2dfdfc990 Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Mon, 27 Jan 2020 13:30:17 -0500 Subject: [PATCH 04/12] use cpu_has_pclmul() to autodetect CPU hardware (https://github.com/InsightSoftwareConsortium/ITK/issues/416) --- crc32.c | 62 ++++-- zconf.h.included | 511 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 554 insertions(+), 19 deletions(-) create mode 100644 zconf.h.included diff --git a/crc32.c b/crc32.c index d3c7ec3d6..c08327720 100644 --- a/crc32.c +++ b/crc32.c @@ -20,11 +20,13 @@ DYNAMIC_CRC_TABLE and MAKECRCH can be #defined to write out crc32.h. */ - #ifdef HAS_PCLMUL - #include - #include - #include + +#include +#include +#include +//#include +#include #endif #ifdef __aarch64__ @@ -277,6 +279,29 @@ local unsigned long crc32_generic(crc, buf, len) #ifdef HAS_PCLMUL + +#ifdef HAS_GPL + extern uLong crc32_pclmul_le_16(unsigned char const *buffer, size_t len, uInt crc32); +#else + +int cpu_has_pclmul = -1; //global: will be 0 or 1 after first test + +int has_pclmul(void) { + if (cpu_has_pclmul >= 0) + return cpu_has_pclmul; + cpu_has_pclmul = 0; + int leaf = 1; + uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0; + /* %ecx */ + #define crc_bit_PCLMUL (1 << 1) + if (__get_cpuid(leaf, &eax, &ebx, &ecx, &edx)) { + //printf("leaf=%d, eax=0x%x, ebx=0x%x, ecx=0x%x, edx=0x%x\n", leaf, eax, ebx, ecx, edx); + if ((ecx & crc_bit_PCLMUL) != 0) + cpu_has_pclmul = 1; + } + return cpu_has_pclmul; +} + //https://github.com/webosose/chromium68/blob/master/src/third_party/zlib/crc32_simd.c /* crc32_simd.c * @@ -319,14 +344,14 @@ local unsigned long 
crc32_generic(crc, buf, len) * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 */ - + #ifdef _MSC_VER #define zalign(x) __declspec(align(x)) #else #define zalign(x) __attribute__((aligned((x)))) #endif -uint crc32_simd(unsigned char const *buf, size_t len, uInt crc) { +uLong crc32_simd(unsigned char const *buf, size_t len, uInt crc) { /* * Definitions of the bit-reflected domain constants k1,k2,k3, etc and * the CRC32+Barrett polynomials given at the end of the paper. @@ -432,17 +457,12 @@ uint crc32_simd(unsigned char const *buf, size_t len, uInt crc) { } +#endif //Chromium code + #define PCLMUL_MIN_LEN 64 #define PCLMUL_ALIGN 16 #define PCLMUL_ALIGN_MASK 15 -/* Function stolen from linux kernel 3.14. It computes the CRC over the given - * buffer with initial CRC value . The buffer is byte in length, - * and must be 16-byte aligned. - */ -extern uint crc32_pclmul_le_16(unsigned char const *buffer, - size_t len, uInt crc32); - uLong crc32(crc, buf, len) uLong crc; const Bytef *buf; @@ -450,7 +470,10 @@ uLong crc32(crc, buf, len) { if (len < PCLMUL_MIN_LEN + PCLMUL_ALIGN - 1) return crc32_generic(crc, buf, len); - + #ifndef HAS_GPL //detect whether current CPU supports PCLMUL + if (!has_pclmul()) + return crc32_generic(crc, buf, len); + #endif /* Handle the leading patial chunk */ uInt misalign = PCLMUL_ALIGN_MASK & ((unsigned long)buf); uInt sz = (PCLMUL_ALIGN - misalign) % PCLMUL_ALIGN; @@ -459,11 +482,12 @@ uLong crc32(crc, buf, len) buf += sz; len -= sz; } - /* Go over 16-byte chunks */ - //crc = crc32_pclmul_le_16(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); + #ifdef HAS_GPL + crc = crc32_pclmul_le_16(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); + #else crc = crc32_simd(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); - + #endif crc = crc ^ 0xffffffffUL; /* Handle the trailing partial chunk */ @@ -669,12 +693,12 @@ uLong ZEXPORT crc32_combine(crc1, 
crc2, len2) return crc32_combine_(crc1, crc2, len2); } -uLong ZEXPORT crc32_combine64(crc1, crc2, len2) +/*uLong ZEXPORT crc32_combine64(crc1, crc2, len2) uLong crc1; uLong crc2; z_off64_t len2; { return crc32_combine_(crc1, crc2, len2); -} +}*/ #endif diff --git a/zconf.h.included b/zconf.h.included new file mode 100644 index 000000000..9fa682673 --- /dev/null +++ b/zconf.h.included @@ -0,0 +1,511 @@ +/* zconf.h -- configuration of the zlib compression library + * Copyright (C) 1995-2013 Jean-loup Gailly. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* @(#) $Id$ */ + +#ifndef ZCONF_H +#define ZCONF_H +#include +/* + * If you *really* need a unique prefix for all types and library functions, + * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it. + * Even better than compiling with -DZ_PREFIX would be to use configure to set + * this permanently in zconf.h using "./configure --zprefix". + */ +#ifdef Z_PREFIX /* may be set to #if 1 by ./configure */ +# define Z_PREFIX_SET + +/* all linked symbols */ +# define _dist_code z__dist_code +# define _length_code z__length_code +# define _tr_align z__tr_align +# define _tr_flush_bits z__tr_flush_bits +# define _tr_flush_block z__tr_flush_block +# define _tr_init z__tr_init +# define _tr_stored_block z__tr_stored_block +# define _tr_tally z__tr_tally +# define adler32 z_adler32 +# define adler32_combine z_adler32_combine +# define adler32_combine64 z_adler32_combine64 +# ifndef Z_SOLO +# define compress z_compress +# define compress2 z_compress2 +# define compressBound z_compressBound +# endif +# define crc32 z_crc32 +# define crc32_combine z_crc32_combine +# define crc32_combine64 z_crc32_combine64 +# define deflate z_deflate +# define deflateBound z_deflateBound +# define deflateCopy z_deflateCopy +# define deflateEnd z_deflateEnd +# define deflateInit2_ z_deflateInit2_ +# define deflateInit_ z_deflateInit_ +# define deflateParams z_deflateParams +# define 
deflatePending z_deflatePending +# define deflatePrime z_deflatePrime +# define deflateReset z_deflateReset +# define deflateResetKeep z_deflateResetKeep +# define deflateSetDictionary z_deflateSetDictionary +# define deflateSetHeader z_deflateSetHeader +# define deflateTune z_deflateTune +# define deflate_copyright z_deflate_copyright +# define get_crc_table z_get_crc_table +# ifndef Z_SOLO +# define gz_error z_gz_error +# define gz_intmax z_gz_intmax +# define gz_strwinerror z_gz_strwinerror +# define gzbuffer z_gzbuffer +# define gzclearerr z_gzclearerr +# define gzclose z_gzclose +# define gzclose_r z_gzclose_r +# define gzclose_w z_gzclose_w +# define gzdirect z_gzdirect +# define gzdopen z_gzdopen +# define gzeof z_gzeof +# define gzerror z_gzerror +# define gzflush z_gzflush +# define gzgetc z_gzgetc +# define gzgetc_ z_gzgetc_ +# define gzgets z_gzgets +# define gzoffset z_gzoffset +# define gzoffset64 z_gzoffset64 +# define gzopen z_gzopen +# define gzopen64 z_gzopen64 +# ifdef _WIN32 +# define gzopen_w z_gzopen_w +# endif +# define gzprintf z_gzprintf +# define gzvprintf z_gzvprintf +# define gzputc z_gzputc +# define gzputs z_gzputs +# define gzread z_gzread +# define gzrewind z_gzrewind +# define gzseek z_gzseek +# define gzseek64 z_gzseek64 +# define gzsetparams z_gzsetparams +# define gztell z_gztell +# define gztell64 z_gztell64 +# define gzungetc z_gzungetc +# define gzwrite z_gzwrite +# endif +# define inflate z_inflate +# define inflateBack z_inflateBack +# define inflateBackEnd z_inflateBackEnd +# define inflateBackInit_ z_inflateBackInit_ +# define inflateCopy z_inflateCopy +# define inflateEnd z_inflateEnd +# define inflateGetHeader z_inflateGetHeader +# define inflateInit2_ z_inflateInit2_ +# define inflateInit_ z_inflateInit_ +# define inflateMark z_inflateMark +# define inflatePrime z_inflatePrime +# define inflateReset z_inflateReset +# define inflateReset2 z_inflateReset2 +# define inflateSetDictionary z_inflateSetDictionary +# define 
inflateGetDictionary z_inflateGetDictionary +# define inflateSync z_inflateSync +# define inflateSyncPoint z_inflateSyncPoint +# define inflateUndermine z_inflateUndermine +# define inflateResetKeep z_inflateResetKeep +# define inflate_copyright z_inflate_copyright +# define inflate_fast z_inflate_fast +# define inflate_table z_inflate_table +# ifndef Z_SOLO +# define uncompress z_uncompress +# endif +# define zError z_zError +# ifndef Z_SOLO +# define zcalloc z_zcalloc +# define zcfree z_zcfree +# endif +# define zlibCompileFlags z_zlibCompileFlags +# define zlibVersion z_zlibVersion + +/* all zlib typedefs in zlib.h and zconf.h */ +# define Byte z_Byte +# define Bytef z_Bytef +# define alloc_func z_alloc_func +# define charf z_charf +# define free_func z_free_func +# ifndef Z_SOLO +# define gzFile z_gzFile +# endif +# define gz_header z_gz_header +# define gz_headerp z_gz_headerp +# define in_func z_in_func +# define intf z_intf +# define out_func z_out_func +# define uInt z_uInt +# define uIntf z_uIntf +# define uLong z_uLong +# define uLongf z_uLongf +# define voidp z_voidp +# define voidpc z_voidpc +# define voidpf z_voidpf + +/* all zlib structs in zlib.h and zconf.h */ +# define gz_header_s z_gz_header_s +# define internal_state z_internal_state + +#endif + +#if defined(__MSDOS__) && !defined(MSDOS) +# define MSDOS +#endif +#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2) +# define OS2 +#endif +#if defined(_WINDOWS) && !defined(WINDOWS) +# define WINDOWS +#endif +#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__) +# ifndef WIN32 +# define WIN32 +# endif +#endif +#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32) +# if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__) +# ifndef SYS16BIT +# define SYS16BIT +# endif +# endif +#endif + +/* + * Compile with -DMAXSEG_64K if the alloc function cannot allocate more + * than 64k bytes at a time (needed on systems with 16-bit int). 
+ */ +#ifdef SYS16BIT +# define MAXSEG_64K +#endif +#ifdef MSDOS +# define UNALIGNED_OK +#endif + +#ifdef __STDC_VERSION__ +# ifndef STDC +# define STDC +# endif +# if __STDC_VERSION__ >= 199901L +# ifndef STDC99 +# define STDC99 +# endif +# endif +#endif +#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus)) +# define STDC +#endif +#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__)) +# define STDC +#endif +#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32)) +# define STDC +#endif +#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__)) +# define STDC +#endif + +#if defined(__OS400__) && !defined(STDC) /* iSeries (formerly AS/400). */ +# define STDC +#endif + +#ifndef STDC +# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */ +# define const /* note: need a more gentle solution here */ +# endif +#endif + +#if defined(ZLIB_CONST) && !defined(z_const) +# define z_const const +#else +# define z_const +#endif + +/* Some Mac compilers merge all .h files incorrectly: */ +#if defined(__MWERKS__)||defined(applec)||defined(THINK_C)||defined(__SC__) +# define NO_DUMMY_DECL +#endif + +/* Maximum value for memLevel in deflateInit2 */ +#ifndef MAX_MEM_LEVEL +# ifdef MAXSEG_64K +# define MAX_MEM_LEVEL 8 +# else +# define MAX_MEM_LEVEL 9 +# endif +#endif + +/* Maximum value for windowBits in deflateInit2 and inflateInit2. + * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files + * created by gzip. (Files created by minigzip can still be extracted by + * gzip.) + */ +#ifndef MAX_WBITS +# define MAX_WBITS 15 /* 32K LZ77 window */ +#endif + +/* The memory requirements for deflate are (in bytes): + (1 << (windowBits+2)) + (1 << (memLevel+9)) + that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values) + plus a few kilobytes for small objects. 
For example, if you want to reduce + the default memory requirements from 256K to 128K, compile with + make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7" + Of course this will generally degrade compression (there's no free lunch). + + The memory requirements for inflate are (in bytes) 1 << windowBits + that is, 32K for windowBits=15 (default value) plus a few kilobytes + for small objects. +*/ + + /* Type declarations */ + +#ifndef OF /* function prototypes */ +# ifdef STDC +# define OF(args) args +# else +# define OF(args) () +# endif +#endif + +#ifndef Z_ARG /* function prototypes for stdarg */ +# if defined(STDC) || defined(Z_HAVE_STDARG_H) +# define Z_ARG(args) args +# else +# define Z_ARG(args) () +# endif +#endif + +/* The following definitions for FAR are needed only for MSDOS mixed + * model programming (small or medium model with some far allocations). + * This was tested only with MSC; for other MSDOS compilers you may have + * to define NO_MEMCPY in zutil.h. If you don't need the mixed model, + * just define FAR to be empty. + */ +#ifdef SYS16BIT +# if defined(M_I86SM) || defined(M_I86MM) + /* MSC small or medium model */ +# define SMALL_MEDIUM +# ifdef _MSC_VER +# define FAR _far +# else +# define FAR far +# endif +# endif +# if (defined(__SMALL__) || defined(__MEDIUM__)) + /* Turbo C small or medium model */ +# define SMALL_MEDIUM +# ifdef __BORLANDC__ +# define FAR _far +# else +# define FAR far +# endif +# endif +#endif + +#if defined(WINDOWS) || defined(WIN32) + /* If building or using zlib as a DLL, define ZLIB_DLL. + * This is not mandatory, but it offers a little performance increase. + */ +# ifdef ZLIB_DLL +# if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500)) +# ifdef ZLIB_INTERNAL +# define ZEXTERN extern __declspec(dllexport) +# else +# define ZEXTERN extern __declspec(dllimport) +# endif +# endif +# endif /* ZLIB_DLL */ + /* If building or using zlib with the WINAPI/WINAPIV calling convention, + * define ZLIB_WINAPI. 
+ * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI. + */ +# ifdef ZLIB_WINAPI +# ifdef FAR +# undef FAR +# endif +# include + /* No need for _export, use ZLIB.DEF instead. */ + /* For complete Windows compatibility, use WINAPI, not __stdcall. */ +# define ZEXPORT WINAPI +# ifdef WIN32 +# define ZEXPORTVA WINAPIV +# else +# define ZEXPORTVA FAR CDECL +# endif +# endif +#endif + +#if defined (__BEOS__) +# ifdef ZLIB_DLL +# ifdef ZLIB_INTERNAL +# define ZEXPORT __declspec(dllexport) +# define ZEXPORTVA __declspec(dllexport) +# else +# define ZEXPORT __declspec(dllimport) +# define ZEXPORTVA __declspec(dllimport) +# endif +# endif +#endif + +#ifndef ZEXTERN +# define ZEXTERN extern +#endif +#ifndef ZEXPORT +# define ZEXPORT +#endif +#ifndef ZEXPORTVA +# define ZEXPORTVA +#endif + +#ifndef FAR +# define FAR +#endif + +#if !defined(__MACTYPES__) +typedef uint8_t Byte; /* 8 bits */ +#endif +typedef uint32_t uInt; /* 32 bits */ +typedef uint64_t uLong; /* 64 bits */ + +#ifdef SMALL_MEDIUM + /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */ +# define Bytef Byte FAR +#else + typedef Byte FAR Bytef; +#endif +typedef char FAR charf; +typedef int FAR intf; +typedef uInt FAR uIntf; +typedef uLong FAR uLongf; + +#ifdef STDC + typedef void const *voidpc; + typedef void FAR *voidpf; + typedef void *voidp; +#else + typedef Byte const *voidpc; + typedef Byte FAR *voidpf; + typedef Byte *voidp; +#endif + +#if !defined(Z_U4) && !defined(Z_SOLO) && defined(STDC) +# include +# if (UINT_MAX == 0xffffffffUL) +# define Z_U4 unsigned +# elif (ULONG_MAX == 0xffffffffUL) +# define Z_U4 unsigned long +# elif (USHRT_MAX == 0xffffffffUL) +# define Z_U4 unsigned short +# endif +#endif + +#ifdef Z_U4 + typedef Z_U4 z_crc_t; +#else + typedef unsigned long z_crc_t; +#endif + +#if 1 /* was set to #if 1 by ./configure */ +# define Z_HAVE_UNISTD_H +#endif + +#if 1 /* was set to #if 1 by ./configure */ +# define Z_HAVE_STDARG_H +#endif + +#ifdef STDC +# ifndef 
Z_SOLO +# include /* for off_t */ +# endif +#endif + +#if defined(STDC) || defined(Z_HAVE_STDARG_H) +# ifndef Z_SOLO +# include /* for va_list */ +# endif +#endif + +#ifdef _WIN32 +# ifndef Z_SOLO +# include /* for wchar_t */ +# endif +#endif + +/* a little trick to accommodate both "#define _LARGEFILE64_SOURCE" and + * "#define _LARGEFILE64_SOURCE 1" as requesting 64-bit operations, (even + * though the former does not conform to the LFS document), but considering + * both "#undef _LARGEFILE64_SOURCE" and "#define _LARGEFILE64_SOURCE 0" as + * equivalently requesting no 64-bit operations + */ +#if defined(_LARGEFILE64_SOURCE) && -_LARGEFILE64_SOURCE - -1 == 1 +# undef _LARGEFILE64_SOURCE +#endif + +#if defined(__WATCOMC__) && !defined(Z_HAVE_UNISTD_H) +# define Z_HAVE_UNISTD_H +#endif +#ifndef Z_SOLO +# if defined(Z_HAVE_UNISTD_H) || defined(_LARGEFILE64_SOURCE) +# include /* for SEEK_*, off_t, and _LFS64_LARGEFILE */ +# ifdef VMS +# include /* for off_t */ +# endif +# ifndef z_off_t +# define z_off_t off_t +# endif +# endif +#endif + +#if defined(_LFS64_LARGEFILE) && _LFS64_LARGEFILE-0 +# define Z_LFS64 +#endif + +#if defined(_LARGEFILE64_SOURCE) && defined(Z_LFS64) +# define Z_LARGE64 +#endif + +#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS-0 == 64 && defined(Z_LFS64) +# define Z_WANT64 +#endif + +#if !defined(SEEK_SET) && !defined(Z_SOLO) +# define SEEK_SET 0 /* Seek from beginning of file. */ +# define SEEK_CUR 1 /* Seek from current position. 
*/ +# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */ +#endif + +#ifndef z_off_t +# define z_off_t long +#endif + +#if !defined(_WIN32) && defined(Z_LARGE64) +# define z_off64_t off64_t +#else +# if defined(_WIN32) && !defined(__GNUC__) && !defined(Z_SOLO) +# define z_off64_t __int64 +# else +# define z_off64_t z_off_t +# endif +#endif + +/* MVS linker does not support external names larger than 8 bytes */ +#if defined(__MVS__) + #pragma map(deflateInit_,"DEIN") + #pragma map(deflateInit2_,"DEIN2") + #pragma map(deflateEnd,"DEEND") + #pragma map(deflateBound,"DEBND") + #pragma map(inflateInit_,"ININ") + #pragma map(inflateInit2_,"ININ2") + #pragma map(inflateEnd,"INEND") + #pragma map(inflateSync,"INSY") + #pragma map(inflateSetDictionary,"INSEDI") + #pragma map(compressBound,"CMBND") + #pragma map(inflate_table,"INTABL") + #pragma map(inflate_fast,"INFA") + #pragma map(inflate_copyright,"INCOPY") +#endif + +#endif /* ZCONF_H */ From b3064e0a1059ea5b9a6c78a889bdf55637ccd1d2 Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Mon, 27 Jan 2020 16:10:42 -0500 Subject: [PATCH 05/12] remove gpl code --- crc32.c | 86 +++++++++++++++++++++++++++------------------------------ 1 file changed, 41 insertions(+), 45 deletions(-) diff --git a/crc32.c b/crc32.c index c08327720..0db1fe21e 100644 --- a/crc32.c +++ b/crc32.c @@ -20,13 +20,12 @@ DYNAMIC_CRC_TABLE and MAKECRCH can be #defined to write out crc32.h. 
*/ -#ifdef HAS_PCLMUL -#include -#include -#include -//#include -#include +#ifdef HAS_PCLMUL + #include + #include + #include + #include #endif #ifdef __aarch64__ @@ -279,29 +278,6 @@ local unsigned long crc32_generic(crc, buf, len) #ifdef HAS_PCLMUL - -#ifdef HAS_GPL - extern uLong crc32_pclmul_le_16(unsigned char const *buffer, size_t len, uInt crc32); -#else - -int cpu_has_pclmul = -1; //global: will be 0 or 1 after first test - -int has_pclmul(void) { - if (cpu_has_pclmul >= 0) - return cpu_has_pclmul; - cpu_has_pclmul = 0; - int leaf = 1; - uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0; - /* %ecx */ - #define crc_bit_PCLMUL (1 << 1) - if (__get_cpuid(leaf, &eax, &ebx, &ecx, &edx)) { - //printf("leaf=%d, eax=0x%x, ebx=0x%x, ecx=0x%x, edx=0x%x\n", leaf, eax, ebx, ecx, edx); - if ((ecx & crc_bit_PCLMUL) != 0) - cpu_has_pclmul = 1; - } - return cpu_has_pclmul; -} - //https://github.com/webosose/chromium68/blob/master/src/third_party/zlib/crc32_simd.c /* crc32_simd.c * @@ -344,14 +320,14 @@ int has_pclmul(void) { * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 */ - + #ifdef _MSC_VER #define zalign(x) __declspec(align(x)) #else #define zalign(x) __attribute__((aligned((x)))) #endif -uLong crc32_simd(unsigned char const *buf, size_t len, uInt crc) { +uint crc32_simd(unsigned char const *buf, size_t len, uInt crc) { /* * Definitions of the bit-reflected domain constants k1,k2,k3, etc and * the CRC32+Barrett polynomials given at the end of the paper. 
@@ -457,23 +433,44 @@ uLong crc32_simd(unsigned char const *buf, size_t len, uInt crc) { } -#endif //Chromium code - #define PCLMUL_MIN_LEN 64 #define PCLMUL_ALIGN 16 #define PCLMUL_ALIGN_MASK 15 +int cpu_has_pclmul = -1; //global: will be 0 or 1 after first test + +int has_pclmul(void) { + if (cpu_has_pclmul >= 0) + return cpu_has_pclmul; + cpu_has_pclmul = 0; + int leaf = 1; + uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0; + /* %ecx */ + #define crc_bit_PCLMUL (1 << 1) + if (__get_cpuid(leaf, &eax, &ebx, &ecx, &edx)) { + //printf("leaf=%d, eax=0x%x, ebx=0x%x, ecx=0x%x, edx=0x%x\n", leaf, eax, ebx, ecx, edx); + if ((ecx & crc_bit_PCLMUL) != 0) + cpu_has_pclmul = 1; + } + return cpu_has_pclmul; +} + + +/* Function stolen from linux kernel 3.14. It computes the CRC over the given + * buffer with initial CRC value . The buffer is byte in length, + * and must be 16-byte aligned. + */ +extern uint crc32_pclmul_le_16(unsigned char const *buffer, + size_t len, uInt crc32); + uLong crc32(crc, buf, len) uLong crc; const Bytef *buf; uInt len; { - if (len < PCLMUL_MIN_LEN + PCLMUL_ALIGN - 1) - return crc32_generic(crc, buf, len); - #ifndef HAS_GPL //detect whether current CPU supports PCLMUL - if (!has_pclmul()) + if ((len < PCLMUL_MIN_LEN + PCLMUL_ALIGN - 1) || (!has_pclmul())) return crc32_generic(crc, buf, len); - #endif + /* Handle the leading patial chunk */ uInt misalign = PCLMUL_ALIGN_MASK & ((unsigned long)buf); uInt sz = (PCLMUL_ALIGN - misalign) % PCLMUL_ALIGN; @@ -482,12 +479,11 @@ uLong crc32(crc, buf, len) buf += sz; len -= sz; } + /* Go over 16-byte chunks */ - #ifdef HAS_GPL - crc = crc32_pclmul_le_16(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); - #else + //crc = crc32_pclmul_le_16(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); crc = crc32_simd(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); - #endif + crc = crc ^ 0xffffffffUL; /* Handle the trailing partial chunk */ @@ -693,12 +689,12 @@ uLong ZEXPORT crc32_combine(crc1, crc2, len2) return 
crc32_combine_(crc1, crc2, len2); } -/*uLong ZEXPORT crc32_combine64(crc1, crc2, len2) +uLong ZEXPORT crc32_combine64(crc1, crc2, len2) uLong crc1; uLong crc2; z_off64_t len2; { return crc32_combine_(crc1, crc2, len2); -}*/ +} -#endif +#endif \ No newline at end of file From 4c77bc72e28c1f9d43a400ba97dea95e1d4e805e Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Tue, 28 Jan 2020 09:04:52 -0500 Subject: [PATCH 06/12] Improve support for compiling using Windows (https://github.com/ningfei/zlib) --- CMakeLists.txt | 388 +++++++++++++++++++----------------------------- deflate.c | 4 +- zconf.h.cmakein | 16 +- zconf.h.in | 8 +- zlib.pc.cmakein | 12 +- 5 files changed, 173 insertions(+), 255 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ee5ac0d23..02176ffa6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,142 +1,98 @@ -cmake_minimum_required(VERSION 2.4.4) -set(CMAKE_ALLOW_LOOSE_LOOP_CONSTRUCTS ON) +cmake_minimum_required(VERSION 2.8.12) project(zlib C) -set(VERSION "1.2.8") - -option(ASM686 "Enable building i686 assembly implementation") -option(AMD64 "Enable building amd64 assembly implementation") - -set(INSTALL_BIN_DIR "${CMAKE_INSTALL_PREFIX}/bin" CACHE PATH "Installation directory for executables") -set(INSTALL_LIB_DIR "${CMAKE_INSTALL_PREFIX}/lib" CACHE PATH "Installation directory for libraries") -set(INSTALL_INC_DIR "${CMAKE_INSTALL_PREFIX}/include" CACHE PATH "Installation directory for headers") -set(INSTALL_MAN_DIR "${CMAKE_INSTALL_PREFIX}/share/man" CACHE PATH "Installation directory for manual pages") -set(INSTALL_PKGCONFIG_DIR "${CMAKE_INSTALL_PREFIX}/share/pkgconfig" CACHE PATH "Installation directory for pkgconfig (.pc) files") - +include(CheckIncludeFile) include(CheckTypeSize) include(CheckFunctionExists) -include(CheckIncludeFile) -include(CheckCSourceCompiles) -enable_testing() +# Check include files check_include_file(sys/types.h HAVE_SYS_TYPES_H) check_include_file(stdint.h HAVE_STDINT_H) check_include_file(stddef.h 
HAVE_STDDEF_H) +check_include_file(unistd.h HAVE_UNISTD_H) -# -# Check to see if we have large file support -# -set(CMAKE_REQUIRED_DEFINITIONS -D_LARGEFILE64_SOURCE=1) -# We add these other definitions here because CheckTypeSize.cmake -# in CMake 2.4.x does not automatically do so and we want -# compatibility with CMake 2.4.x. -if(HAVE_SYS_TYPES_H) - list(APPEND CMAKE_REQUIRED_DEFINITIONS -DHAVE_SYS_TYPES_H) -endif() -if(HAVE_STDINT_H) - list(APPEND CMAKE_REQUIRED_DEFINITIONS -DHAVE_STDINT_H) -endif() -if(HAVE_STDDEF_H) - list(APPEND CMAKE_REQUIRED_DEFINITIONS -DHAVE_STDDEF_H) -endif() -check_type_size(off64_t OFF64_T) -if(HAVE_OFF64_T) - add_definitions(-D_LARGEFILE64_SOURCE=1) -endif() -set(CMAKE_REQUIRED_DEFINITIONS) # clear variable +# Build with large file support +add_definitions(-D_LARGEFILE64_SOURCE=1) -# -# Check for fseeko -# -check_function_exists(fseeko HAVE_FSEEKO) -if(NOT HAVE_FSEEKO) - add_definitions(-DNO_FSEEKO) +# Build type setting +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." 
FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug;Release;RelWithDebInfo;MinSizeRel") endif() -# -# Check for unistd.h -# -check_include_file(unistd.h Z_HAVE_UNISTD_H) +# Build options +option(BUILD_SHARED_LIBS "Build shared libraries" OFF) +option(BUILD_EXAMPLES "Build examples" OFF) -include (CheckCCompilerFlag) -if(MSVC) - CHECK_C_COMPILER_FLAG("/arch:AVX" HAS_AVX) - if (HAS_AVX) - set(CMAKE_C_FLAGS ${CMAKE_C_FLAGS} "/arch:AVX") - endif() -else() - CHECK_C_COMPILER_FLAG("-msse4.2" HAS_SSE) - if (HAS_SSE) - set(CMAKE_C_FLAGS ${CMAKE_C_FLAGS} "-msse4.2") - endif() -endif() +# Set -fPIC option +set(CMAKE_POSITION_INDEPENDENT_CODE ON) -# Macro from on zlib-ng -# Macro to check if source compiles when cross-compiling -# or runs when compiling natively -include(CheckCSourceRuns) -macro(check_c_source_compile_or_run source flag) - if(CMAKE_CROSSCOMPILING) - check_c_source_compiles("${source}" ${flag}) - else() - check_c_source_runs("${source}" ${flag}) - endif() -endmacro() +# parse the full version number from zlib.h and include in ZLIB_VERSION +file(READ ${CMAKE_CURRENT_SOURCE_DIR}/zlib.h _zlib_h_contents) +string(REGEX REPLACE ".*#define[ \t]+ZLIB_VERSION[ \t]+\"([-0-9A-Za-z.]+)\".*" + "\\1" ZLIB_VERSION ${_zlib_h_contents}) -CHECK_C_COMPILER_FLAG(-mpclmul COMPILER_HAS_M_PCLMUL) -if (COMPILER_HAS_M_PCLMUL) - message( STATUS "compiler supports pclmul") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul") - # check from on zlib-ng - # note zlib-ng does much more thorough test of architecture - check_c_source_runs( - "#include - int main(void) - { - __m128i a = _mm_setzero_si128(); - __m128i b = _mm_setzero_si128(); - __m128i c = _mm_clmulepi64_si128(a, b, 0x10); - (void)c; - return 0; - }" - HAVE_PCLMULQDQ_INTRIN - ) - if(HAVE_PCLMULQDQ_INTRIN) - add_definitions(-DHAS_PCLMUL) - endif() - # set(CMAKE_C_FLAGS ${CMAKE_C_FLAGS} "-mpclmul") -else() - message( STATUS "compiler does not have pclmul") +# Generate zlib.pc +set(ZLIB_PC 
${CMAKE_CURRENT_BINARY_DIR}/zlib.pc) +configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/zlib.pc.cmakein + ${ZLIB_PC} @ONLY) + +# Generate zcon.h +configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/zconf.h.cmakein + ${CMAKE_CURRENT_BINARY_DIR}/zconf.h @ONLY) +include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_SOURCE_DIR}) + +# Mark OSX settings as advanced +if(APPLE) + mark_as_advanced(CMAKE_OSX_ARCHITECTURES CMAKE_OSX_DEPLOYMENT_TARGET CMAKE_OSX_SYSROOT) + set(CMAKE_MACOSX_RPATH TRUE) endif() +# Option to use static runtime +include(ucm.cmake) +option(USE_STATIC_RUNTIME "Use static runtime" ON) +if(USE_STATIC_RUNTIME) + ucm_set_runtime(STATIC) +else() + ucm_set_runtime(DYNAMIC) +endif() -if(MSVC) +# Compiler dependent flags +include (CheckCCompilerFlag) +if(UNIX) + check_c_compiler_flag(-msse4.2 HAS_SSE42) + if(HAS_SSE42) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2") + add_definitions(-DHAS_SSE42) + endif() +elseif(MSVC) set(CMAKE_DEBUG_POSTFIX "d") add_definitions(-D_CRT_SECURE_NO_DEPRECATE) add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE) - include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -endif() -if(NOT CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR) - # If we're doing an out of source build and the user has a zconf.h - # in their source tree... 
- if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/zconf.h) - message(STATUS "Renaming") - message(STATUS " ${CMAKE_CURRENT_SOURCE_DIR}/zconf.h") - message(STATUS "to 'zconf.h.included' because this file is included with zlib") - message(STATUS "but CMake generates it automatically in the build directory.") - file(RENAME ${CMAKE_CURRENT_SOURCE_DIR}/zconf.h ${CMAKE_CURRENT_SOURCE_DIR}/zconf.h.included) - endif() + check_c_compiler_flag(/arch:AVX HAS_AVX) + if (HAS_AVX) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX") + endif() endif() -set(ZLIB_PC ${CMAKE_CURRENT_BINARY_DIR}/zlib.pc) -configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/zlib.pc.cmakein - ${ZLIB_PC} @ONLY) -configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/zconf.h.cmakein - ${CMAKE_CURRENT_BINARY_DIR}/zconf.h @ONLY) -include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_SOURCE_DIR}) - +# Assembly setting +if(UNIX) + check_c_compiler_flag(-mpclmul HAS_PCLMUL) + if(HAS_PCLMUL) + set(ENABLE_ASSEMBLY "PCLMUL" CACHE STRING "Choose assembly implementation.") + set_property(CACHE ENABLE_ASSEMBLY PROPERTY STRINGS "OFF;PCLMUL") + + if("${ENABLE_ASSEMBLY}" STREQUAL "PCLMUL") + set(ZLIB_ASMS contrib/amd64/crc32-pclmul_asm.S) + set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul") + add_definitions(-DHAS_PCLMUL) + endif() + endif() +endif() #============================================================================ # zlib @@ -175,144 +131,100 @@ set(ZLIB_SRCS zutil.c ) -if(NOT MINGW) - set(ZLIB_DLL_SRCS - win32/zlib1.rc # If present will override custom build rule below. 
- ) -endif() - -if(CMAKE_COMPILER_IS_GNUCC) - if(ASM686) - set(ZLIB_ASMS contrib/asm686/match.S) - elseif (AMD64) - set(ZLIB_ASMS contrib/amd64/amd64-match.S) - endif () - - if(ZLIB_ASMS) - add_definitions(-DASMV) - set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE) - endif() -endif() - -if(MSVC) - if(ASM686) - ENABLE_LANGUAGE(ASM_MASM) - set(ZLIB_ASMS - contrib/masmx86/inffas32.asm - contrib/masmx86/match686.asm - ) - elseif (AMD64) - ENABLE_LANGUAGE(ASM_MASM) - set(ZLIB_ASMS - contrib/masmx64/gvmat64.asm - contrib/masmx64/inffasx64.asm - ) +if(BUILD_SHARED_LIBS) + # Visibility + check_c_compiler_flag(-fvisibility=hidden HAVE_HIDDEN) + if(HAVE_HIDDEN) + add_definitions(-DHAVE_HIDDEN) endif() - if(ZLIB_ASMS) - add_definitions(-DASMV -DASMINF) - endif() -endif() - -# parse the full version number from zlib.h and include in ZLIB_FULL_VERSION -file(READ ${CMAKE_CURRENT_SOURCE_DIR}/zlib.h _zlib_h_contents) -string(REGEX REPLACE ".*#define[ \t]+ZLIB_VERSION[ \t]+\"([-0-9A-Za-z.]+)\".*" - "\\1" ZLIB_FULL_VERSION ${_zlib_h_contents}) - -if(MINGW) - # This gets us DLL resource information when compiling on MinGW. - if(NOT CMAKE_RC_COMPILER) - set(CMAKE_RC_COMPILER windres.exe) + # DLL resource setting + if(NOT MINGW) + set(ZLIB_DLL_SRCS + win32/zlib1.rc # If present will override custom build rule below. + ) + else() + # This gets us DLL resource information when compiling on MinGW. 
+ if(NOT CMAKE_RC_COMPILER) + set(CMAKE_RC_COMPILER windres.exe) + endif() + + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj + COMMAND ${CMAKE_RC_COMPILER} + -D GCC_WINDRES + -I ${CMAKE_CURRENT_SOURCE_DIR} + -I ${CMAKE_CURRENT_BINARY_DIR} + -o ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj + -i ${CMAKE_CURRENT_SOURCE_DIR}/win32/zlib1.rc) + set(ZLIB_DLL_SRCS ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj) endif() - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj - COMMAND ${CMAKE_RC_COMPILER} - -D GCC_WINDRES - -I ${CMAKE_CURRENT_SOURCE_DIR} - -I ${CMAKE_CURRENT_BINARY_DIR} - -o ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj - -i ${CMAKE_CURRENT_SOURCE_DIR}/win32/zlib1.rc) - set(ZLIB_DLL_SRCS ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj) -endif(MINGW) - -add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) -add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) -set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) -set_target_properties(zlib PROPERTIES SOVERSION 1) + add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) + set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) + set_target_properties(zlib PROPERTIES SOVERSION 1) + + if(NOT CYGWIN) + # This property causes shared libraries on Linux to have the full version + # encoded into their final filename. We disable this on Cygwin because + # it causes cygz-${ZLIB_VERSION}.dll to be created when cygz.dll + # seems to be the default. + # + # This has no effect with MSVC, on that platform the version info for + # the DLL comes from the resource file win32/zlib1.rc + set_target_properties(zlib PROPERTIES VERSION ${ZLIB_VERSION}) + endif() -if(NOT CYGWIN) - # This property causes shared libraries on Linux to have the full version - # encoded into their final filename. 
We disable this on Cygwin because - # it causes cygz-${ZLIB_FULL_VERSION}.dll to be created when cygz.dll - # seems to be the default. - # - # This has no effect with MSVC, on that platform the version info for - # the DLL comes from the resource file win32/zlib1.rc - set_target_properties(zlib PROPERTIES VERSION ${ZLIB_FULL_VERSION}) + if(UNIX) + # On unix-like platforms the library is almost always called libz + set_target_properties(zlib PROPERTIES OUTPUT_NAME z) + if(NOT APPLE) + set_target_properties(zlib PROPERTIES LINK_FLAGS "-Wl,--version-script,\"${CMAKE_CURRENT_SOURCE_DIR}/zlib.map\"") + endif() + elseif(WIN32) + # Creates zlib1.dll when building shared library version + set_target_properties(zlib PROPERTIES SUFFIX "1.dll") + endif() +else() + add_library(zlib STATIC ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) + if(UNIX) + set_target_properties(zlib PROPERTIES OUTPUT_NAME z) + endif() + #============================================================================ + # work around to CMake bug which affects 64-bit Windows + # see http://public.kitware.com/Bug/view.php?id=11240 + #============================================================================ + if(CMAKE_SIZEOF_VOID_P EQUAL 8 AND MSVC) + set_target_properties(zlib PROPERTIES STATIC_LIBRARY_FLAGS "/machine:x64") + endif() endif() -if(UNIX) - # On unix-like platforms the library is almost always called libz - set_target_properties(zlib zlibstatic PROPERTIES OUTPUT_NAME z) - if(NOT APPLE) - set_target_properties(zlib PROPERTIES LINK_FLAGS "-Wl,--version-script,\"${CMAKE_CURRENT_SOURCE_DIR}/zlib.map\"") - endif() -elseif(BUILD_SHARED_LIBS AND WIN32) - # Creates zlib1.dll when building shared library version - set_target_properties(zlib PROPERTIES SUFFIX "1.dll") +if(NOT SKIP_INSTALL_LIBRARIES AND NOT SKIP_INSTALL_ALL) + install(TARGETS zlib + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib) endif() -if(NOT SKIP_INSTALL_LIBRARIES AND NOT 
SKIP_INSTALL_ALL ) - install(TARGETS zlib zlibstatic - RUNTIME DESTINATION "${INSTALL_BIN_DIR}" - ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" - LIBRARY DESTINATION "${INSTALL_LIB_DIR}" ) -endif() -if(NOT SKIP_INSTALL_HEADERS AND NOT SKIP_INSTALL_ALL ) - install(FILES ${ZLIB_PUBLIC_HDRS} DESTINATION "${INSTALL_INC_DIR}") +if(NOT SKIP_INSTALL_HEADERS AND NOT SKIP_INSTALL_ALL) + install(FILES ${ZLIB_PUBLIC_HDRS} DESTINATION include) endif() -if(NOT SKIP_INSTALL_FILES AND NOT SKIP_INSTALL_ALL ) - install(FILES zlib.3 DESTINATION "${INSTALL_MAN_DIR}/man3") -endif() -if(NOT SKIP_INSTALL_FILES AND NOT SKIP_INSTALL_ALL ) - install(FILES ${ZLIB_PC} DESTINATION "${INSTALL_PKGCONFIG_DIR}") + +if(NOT SKIP_INSTALL_FILES AND NOT SKIP_INSTALL_ALL) + install(FILES zlib.3 DESTINATION share/man/man3) + install(FILES ${ZLIB_PC} DESTINATION lib/pkgconfig) endif() #============================================================================ # Example binaries #============================================================================ -add_executable(example test/example.c) -target_link_libraries(example zlib) -add_test(example example) - -add_executable(minigzip test/minigzip.c) -target_link_libraries(minigzip zlib) - -if(HAVE_OFF64_T) - add_executable(example64 test/example.c) - target_link_libraries(example64 zlib) - set_target_properties(example64 PROPERTIES COMPILE_FLAGS "-D_FILE_OFFSET_BITS=64") - add_test(example64 example64) - - add_executable(minigzip64 test/minigzip.c) - target_link_libraries(minigzip64 zlib) - set_target_properties(minigzip64 PROPERTIES COMPILE_FLAGS "-D_FILE_OFFSET_BITS=64") -endif() - -#============================================================================ -# work around to CMake bug which affects 64-bit Windows -# see http://public.kitware.com/Bug/view.php?id=11240 -#============================================================================ -if(CMAKE_SIZEOF_VOID_P EQUAL 8 AND MSVC) - set_target_properties(zlibstatic PROPERTIES STATIC_LIBRARY_FLAGS 
"/machine:x64") -endif() +if(BUILD_EXAMPLES) + add_executable(example test/example.c) + target_link_libraries(example zlib) + set_target_properties(example PROPERTIES COMPILE_FLAGS "-D_FILE_OFFSET_BITS=64") + add_test(example example) -#optional: show flags specified -MESSAGE(STATUS "Build type: " ${CMAKE_BUILD_TYPE}) -MESSAGE(STATUS "Library Type: " ${LIB_TYPE}) -MESSAGE(STATUS "Compiler flags:" ${CMAKE_C_COMPILE_FLAGS}) -MESSAGE(STATUS "Compiler c debug flags:" ${CMAKE_C_FLAGS_DEBUG}) -MESSAGE(STATUS "Compiler c release flags:" ${CMAKE_C_FLAGS_RELEASE}) -MESSAGE(STATUS "Compiler c min size flags:" ${CMAKE_C_FLAGS_MINSIZEREL}) -MESSAGE(STATUS "Compiler c flags:" ${CMAKE_C_FLAGS}) + add_executable(minigzip test/minigzip.c) + target_link_libraries(minigzip zlib) + set_target_properties(minigzip PROPERTIES COMPILE_FLAGS "-D_FILE_OFFSET_BITS=64") +endif() \ No newline at end of file diff --git a/deflate.c b/deflate.c index 0e3b2667d..b143fc651 100644 --- a/deflate.c +++ b/deflate.c @@ -142,7 +142,7 @@ static uint32_t hash_func(deflate_state *s, void* str) { return __crc32cw(0, *(uint32_t*)str) & s->hash_mask; } -#elif defined __x86_64__ +#elif defined __x86_64__ || defined _M_AMD64 #include static uint32_t hash_func(deflate_state *s, void* str) { @@ -1360,7 +1360,7 @@ static void fill_window(s) q+=8; } -#elif defined __x86_64__ +#elif defined __x86_64__ || defined _M_AMD64 __m128i W; __m128i *q; diff --git a/zconf.h.cmakein b/zconf.h.cmakein index 47f65f43a..6b2282036 100644 --- a/zconf.h.cmakein +++ b/zconf.h.cmakein @@ -7,8 +7,6 @@ #ifndef ZCONF_H #define ZCONF_H -#cmakedefine Z_PREFIX -#cmakedefine Z_HAVE_UNISTD_H #include /* * If you *really* need a unique prefix for all types and library functions, @@ -367,10 +365,10 @@ #endif #if !defined(__MACTYPES__) -typedef uint8_t Byte; /* 8 bits */ +typedef uint8_t Byte; /* 8 bits */ #endif -typedef uint32_t uInt; -typedef uint64_t uLong; +typedef uint32_t uInt; /* 32 bits */ +typedef uint64_t uLong; /* 64 bits */ #ifdef 
SMALL_MEDIUM /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */ @@ -451,7 +449,11 @@ typedef uLong FAR uLongf; #endif #ifndef Z_SOLO # if defined(Z_HAVE_UNISTD_H) || defined(_LARGEFILE64_SOURCE) -# include /* for SEEK_*, off_t, and _LFS64_LARGEFILE */ +# ifdef _WIN32 /* _MSC_VER doesn't work for some reason when building dll*/ +# include +# else +# include /* for SEEK_*, off_t, and _LFS64_LARGEFILE */ +# endif # ifdef VMS # include /* for off_t */ # endif @@ -510,4 +512,4 @@ typedef uLong FAR uLongf; #pragma map(inflate_copyright,"INCOPY") #endif -#endif /* ZCONF_H */ +#endif /* ZCONF_H */ \ No newline at end of file diff --git a/zconf.h.in b/zconf.h.in index 0878c8316..7faa2580b 100644 --- a/zconf.h.in +++ b/zconf.h.in @@ -449,7 +449,11 @@ typedef uLong FAR uLongf; #endif #ifndef Z_SOLO # if defined(Z_HAVE_UNISTD_H) || defined(_LARGEFILE64_SOURCE) -# include /* for SEEK_*, off_t, and _LFS64_LARGEFILE */ +# ifdef _MSC_VER +# include +# else +# include /* for SEEK_*, off_t, and _LFS64_LARGEFILE */ +# endif # ifdef VMS # include /* for off_t */ # endif @@ -508,4 +512,4 @@ typedef uLong FAR uLongf; #pragma map(inflate_copyright,"INCOPY") #endif -#endif /* ZCONF_H */ +#endif /* ZCONF_H */ \ No newline at end of file diff --git a/zlib.pc.cmakein b/zlib.pc.cmakein index a5e642938..08fa39be8 100644 --- a/zlib.pc.cmakein +++ b/zlib.pc.cmakein @@ -1,13 +1,13 @@ prefix=@CMAKE_INSTALL_PREFIX@ -exec_prefix=@CMAKE_INSTALL_PREFIX@ -libdir=@INSTALL_LIB_DIR@ -sharedlibdir=@INSTALL_LIB_DIR@ -includedir=@INSTALL_INC_DIR@ +exec_prefix=@CMAKE_INSTALL_PREFIX@/bin +libdir=@CMAKE_INSTALL_PREFIX@/lib +sharedlibdir=@CMAKE_INSTALL_PREFIX@/lib +includedir=@CMAKE_INSTALL_PREFIX@/include Name: zlib Description: zlib compression library -Version: @VERSION@ +Version: @ZLIB_VERSION@ Requires: Libs: -L${libdir} -L${sharedlibdir} -lz -Cflags: -I${includedir} +Cflags: -I${includedir} \ No newline at end of file From 4bf55a28f8d5fc494b8e81765b454fe9dc5e2dab Mon Sep 17 00:00:00 
2001 From: neurolabusc Date: Tue, 28 Jan 2020 09:19:27 -0500 Subject: [PATCH 07/12] Import ucm.cmake from https://github.com/ningfei/zlib --- CMakeLists.txt | 2 +- ucm.cmake | 636 ++++++++++++++++++++++++++++++++++++++++++++++++ zconf.h.cmakein | 2 +- zconf.h.in | 2 +- zlib.pc.cmakein | 2 +- 5 files changed, 640 insertions(+), 4 deletions(-) create mode 100644 ucm.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 02176ffa6..0d29c326c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -227,4 +227,4 @@ if(BUILD_EXAMPLES) add_executable(minigzip test/minigzip.c) target_link_libraries(minigzip zlib) set_target_properties(minigzip PROPERTIES COMPILE_FLAGS "-D_FILE_OFFSET_BITS=64") -endif() \ No newline at end of file +endif() diff --git a/ucm.cmake b/ucm.cmake new file mode 100644 index 000000000..0b362cd6d --- /dev/null +++ b/ucm.cmake @@ -0,0 +1,636 @@ +# +# ucm.cmake - useful cmake macros +# +# Copyright (c) 2016 Viktor Kirilov +# +# Distributed under the MIT Software License +# See accompanying file LICENSE.txt or copy at +# https://opensource.org/licenses/MIT +# +# The documentation can be found at the library's page: +# https://github.com/onqtam/ucm + +cmake_minimum_required(VERSION 2.8.12) + +include(CMakeParseArguments) + +# optionally include cotire - the git submodule might not be inited (or the user might have already included it) +if(NOT COMMAND cotire) + include(${CMAKE_CURRENT_LIST_DIR}/../cotire/CMake/cotire.cmake OPTIONAL) +endif() + +if(COMMAND cotire AND "1.7.9" VERSION_LESS "${COTIRE_CMAKE_MODULE_VERSION}") + set(ucm_with_cotire 1) +else() + set(ucm_with_cotire 0) +endif() + +# option(UCM_UNITY_BUILD "Enable unity build for targets registered with the ucm_add_target() macro" OFF) +# option(UCM_NO_COTIRE_FOLDER "Do not use a cotire folder in the solution explorer for all unity and cotire related targets" ON) + +# ucm_add_flags +# Adds compiler flags to CMAKE__FLAGS or to a specific config +macro(ucm_add_flags) + cmake_parse_arguments(ARG 
"C;CXX;CLEAR_OLD" "" "CONFIG" ${ARGN}) + + if(NOT ARG_CONFIG) + set(ARG_CONFIG " ") + endif() + + foreach(CONFIG ${ARG_CONFIG}) + # determine to which flags to add + if(NOT ${CONFIG} STREQUAL " ") + string(TOUPPER ${CONFIG} CONFIG) + set(CXX_FLAGS CMAKE_CXX_FLAGS_${CONFIG}) + set(C_FLAGS CMAKE_C_FLAGS_${CONFIG}) + else() + set(CXX_FLAGS CMAKE_CXX_FLAGS) + set(C_FLAGS CMAKE_C_FLAGS) + endif() + + # clear the old flags + if(${ARG_CLEAR_OLD}) + if("${ARG_CXX}" OR NOT "${ARG_C}") + set(${CXX_FLAGS} "") + endif() + if("${ARG_C}" OR NOT "${ARG_CXX}") + set(${C_FLAGS} "") + endif() + endif() + + # add all the passed flags + foreach(flag ${ARG_UNPARSED_ARGUMENTS}) + if("${ARG_CXX}" OR NOT "${ARG_C}") + set(${CXX_FLAGS} "${${CXX_FLAGS}} ${flag}") + endif() + if("${ARG_C}" OR NOT "${ARG_CXX}") + set(${C_FLAGS} "${${C_FLAGS}} ${flag}") + endif() + endforeach() + endforeach() + +endmacro() + +# ucm_set_flags +# Sets the CMAKE__FLAGS compiler flags or for a specific config +macro(ucm_set_flags) + ucm_add_flags(CLEAR_OLD ${ARGN}) +endmacro() + +# ucm_add_linker_flags +# Adds linker flags to CMAKE__LINKER_FLAGS or to a specific config +macro(ucm_add_linker_flags) + cmake_parse_arguments(ARG "CLEAR_OLD;EXE;MODULE;SHARED;STATIC" "" "CONFIG" ${ARGN}) + + if(NOT ARG_CONFIG) + set(ARG_CONFIG " ") + endif() + + foreach(CONFIG ${ARG_CONFIG}) + string(TOUPPER "${CONFIG}" CONFIG) + + if(NOT ${ARG_EXE} AND NOT ${ARG_MODULE} AND NOT ${ARG_SHARED} AND NOT ${ARG_STATIC}) + set(ARG_EXE 1) + set(ARG_MODULE 1) + set(ARG_SHARED 1) + set(ARG_STATIC 1) + endif() + + set(flags_configs "") + if(${ARG_EXE}) + if(NOT "${CONFIG}" STREQUAL " ") + list(APPEND flags_configs CMAKE_EXE_LINKER_FLAGS_${CONFIG}) + else() + list(APPEND flags_configs CMAKE_EXE_LINKER_FLAGS) + endif() + endif() + if(${ARG_MODULE}) + if(NOT "${CONFIG}" STREQUAL " ") + list(APPEND flags_configs CMAKE_MODULE_LINKER_FLAGS_${CONFIG}) + else() + list(APPEND flags_configs CMAKE_MODULE_LINKER_FLAGS) + endif() + endif() + if(${ARG_SHARED}) 
+ if(NOT "${CONFIG}" STREQUAL " ") + list(APPEND flags_configs CMAKE_SHARED_LINKER_FLAGS_${CONFIG}) + else() + list(APPEND flags_configs CMAKE_SHARED_LINKER_FLAGS) + endif() + endif() + if(${ARG_STATIC}) + if(NOT "${CONFIG}" STREQUAL " ") + list(APPEND flags_configs CMAKE_STATIC_LINKER_FLAGS_${CONFIG}) + else() + list(APPEND flags_configs CMAKE_STATIC_LINKER_FLAGS) + endif() + endif() + + # clear the old flags + if(${ARG_CLEAR_OLD}) + foreach(flags ${flags_configs}) + set(${flags} "") + endforeach() + endif() + + # add all the passed flags + foreach(flag ${ARG_UNPARSED_ARGUMENTS}) + foreach(flags ${flags_configs}) + set(${flags} "${${flags}} ${flag}") + endforeach() + endforeach() + endforeach() +endmacro() + +# ucm_set_linker_flags +# Sets the CMAKE__LINKER_FLAGS linker flags or for a specific config +macro(ucm_set_linker_flags) + ucm_add_linker_flags(CLEAR_OLD ${ARGN}) +endmacro() + +# ucm_gather_flags +# Gathers all lists of flags for printing or manipulation +macro(ucm_gather_flags with_linker result) + set(${result} "") + # add the main flags without a config + list(APPEND ${result} CMAKE_C_FLAGS) + list(APPEND ${result} CMAKE_CXX_FLAGS) + if(${with_linker}) + list(APPEND ${result} CMAKE_EXE_LINKER_FLAGS) + list(APPEND ${result} CMAKE_MODULE_LINKER_FLAGS) + list(APPEND ${result} CMAKE_SHARED_LINKER_FLAGS) + list(APPEND ${result} CMAKE_STATIC_LINKER_FLAGS) + endif() + + if("${CMAKE_CONFIGURATION_TYPES}" STREQUAL "" AND NOT "${CMAKE_BUILD_TYPE}" STREQUAL "") + # handle single config generators - like makefiles/ninja - when CMAKE_BUILD_TYPE is set + string(TOUPPER ${CMAKE_BUILD_TYPE} config) + list(APPEND ${result} CMAKE_C_FLAGS_${config}) + list(APPEND ${result} CMAKE_CXX_FLAGS_${config}) + if(${with_linker}) + list(APPEND ${result} CMAKE_EXE_LINKER_FLAGS_${config}) + list(APPEND ${result} CMAKE_MODULE_LINKER_FLAGS_${config}) + list(APPEND ${result} CMAKE_SHARED_LINKER_FLAGS_${config}) + list(APPEND ${result} CMAKE_STATIC_LINKER_FLAGS_${config}) + endif() + 
else() + # handle multi config generators (like msvc, xcode) + foreach(config ${CMAKE_CONFIGURATION_TYPES}) + string(TOUPPER ${config} config) + list(APPEND ${result} CMAKE_C_FLAGS_${config}) + list(APPEND ${result} CMAKE_CXX_FLAGS_${config}) + if(${with_linker}) + list(APPEND ${result} CMAKE_EXE_LINKER_FLAGS_${config}) + list(APPEND ${result} CMAKE_MODULE_LINKER_FLAGS_${config}) + list(APPEND ${result} CMAKE_SHARED_LINKER_FLAGS_${config}) + list(APPEND ${result} CMAKE_STATIC_LINKER_FLAGS_${config}) + endif() + endforeach() + endif() +endmacro() + +# ucm_set_runtime +# Sets the runtime (static/dynamic) for msvc/gcc +macro(ucm_set_runtime) + cmake_parse_arguments(ARG "STATIC;DYNAMIC" "" "" ${ARGN}) + + if(ARG_UNPARSED_ARGUMENTS) + message(FATAL_ERROR "unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" STREQUAL "") + message(AUTHOR_WARNING "ucm_set_runtime() does not support clang yet!") + endif() + + ucm_gather_flags(0 flags_configs) + + # add/replace the flags + # note that if the user has messed with the flags directly this function might fail + # - for example if with MSVC and the user has removed the flags - here we just switch/replace them + if("${ARG_STATIC}") + foreach(flags ${flags_configs}) + if(CMAKE_CXX_COMPILER_ID MATCHES "GNU") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.4.7) # option "-static-libstdc++" available since GCC 4.5 + if(NOT ${flags} MATCHES "-static-libstdc\\+\\+") + set(${flags} "${${flags}} -static-libstdc++") + endif() + endif() + if(NOT ${flags} MATCHES "-static-libgcc") + set(${flags} "${${flags}} -static-libgcc") + endif() + elseif(MSVC) + if(${flags} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flags} "${${flags}}") + endif() + endif() + endforeach() + elseif("${ARG_DYNAMIC}") + foreach(flags ${flags_configs}) + if(CMAKE_CXX_COMPILER_ID MATCHES "GNU") + if(${flags} MATCHES "-static-libstdc\\+\\+") + string(REGEX REPLACE "-static-libstdc\\+\\+" "" ${flags} 
"${${flags}}") + endif() + if(${flags} MATCHES "-static-libgcc") + string(REGEX REPLACE "-static-libgcc" "" ${flags} "${${flags}}") + endif() + elseif(MSVC) + if(${flags} MATCHES "/MT") + string(REGEX REPLACE "/MT" "/MD" ${flags} "${${flags}}") + endif() + endif() + endforeach() + endif() +endmacro() + +# ucm_print_flags +# Prints all compiler flags for all configurations +macro(ucm_print_flags) + ucm_gather_flags(1 flags_configs) + message("") + foreach(flags ${flags_configs}) + message("${flags}: ${${flags}}") + endforeach() + message("") +endmacro() + +# ucm_count_sources +# Counts the number of source files +macro(ucm_count_sources) + cmake_parse_arguments(ARG "" "RESULT" "" ${ARGN}) + if(${ARG_RESULT} STREQUAL "") + message(FATAL_ERROR "Need to pass RESULT and a variable name to ucm_count_sources()") + endif() + + set(result 0) + foreach(SOURCE_FILE ${ARG_UNPARSED_ARGUMENTS}) + if("${SOURCE_FILE}" MATCHES \\.\(c|C|cc|cp|cpp|CPP|c\\+\\+|cxx|i|ii\)$) + math(EXPR result "${result} + 1") + endif() + endforeach() + set(${ARG_RESULT} ${result}) +endmacro() + +# ucm_include_file_in_sources +# Includes the file to the source with compiler flags +macro(ucm_include_file_in_sources) + cmake_parse_arguments(ARG "" "HEADER" "" ${ARGN}) + if(${ARG_HEADER} STREQUAL "") + message(FATAL_ERROR "Need to pass HEADER and a header file to ucm_include_file_in_sources()") + endif() + + foreach(src ${ARG_UNPARSED_ARGUMENTS}) + if(${src} MATCHES \\.\(c|C|cc|cp|cpp|CPP|c\\+\\+|cxx\)$) + # get old flags + get_source_file_property(old_compile_flags ${src} COMPILE_FLAGS) + if(old_compile_flags STREQUAL "NOTFOUND") + set(old_compile_flags "") + endif() + + # update flags + if(MSVC) + set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS + "${old_compile_flags} /FI\"${CMAKE_CURRENT_SOURCE_DIR}/${ARG_HEADER}\"") + else() + set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS + "${old_compile_flags} -include \"${CMAKE_CURRENT_SOURCE_DIR}/${ARG_HEADER}\"") + endif() + endif() + 
endforeach() +endmacro() + +# ucm_dir_list +# Returns a list of subdirectories for a given directory +macro(ucm_dir_list thedir result) + file(GLOB sub-dir "${thedir}/*") + set(list_of_dirs "") + foreach(dir ${sub-dir}) + if(IS_DIRECTORY ${dir}) + get_filename_component(DIRNAME ${dir} NAME) + LIST(APPEND list_of_dirs ${DIRNAME}) + endif() + endforeach() + set(${result} ${list_of_dirs}) +endmacro() + +# ucm_trim_front_words +# Trims X times the front word from a string separated with "/" and removes +# the front "/" characters after that (used for filters for visual studio) +macro(ucm_trim_front_words source out num_filter_trims) + set(result "${source}") + set(counter 0) + while(${counter} LESS ${num_filter_trims}) + MATH(EXPR counter "${counter} + 1") + # removes everything at the front up to a "/" character + string(REGEX REPLACE "^([^/]+)" "" result "${result}") + # removes all consecutive "/" characters from the front + string(REGEX REPLACE "^(/+)" "" result "${result}") + endwhile() + set(${out} ${result}) +endmacro() + +# ucm_remove_files +# Removes source files from a list of sources (path is the relative path for it to be found) +macro(ucm_remove_files) + cmake_parse_arguments(ARG "" "FROM" "" ${ARGN}) + + if("${ARG_UNPARSED_ARGUMENTS}" STREQUAL "") + message(FATAL_ERROR "Need to pass some relative files to ucm_remove_files()") + endif() + if(${ARG_FROM} STREQUAL "") + message(FATAL_ERROR "Need to pass FROM and a variable name to ucm_remove_files()") + endif() + + foreach(cur_file ${ARG_UNPARSED_ARGUMENTS}) + list(REMOVE_ITEM ${ARG_FROM} ${cur_file}) + endforeach() +endmacro() + +# ucm_remove_directories +# Removes all source files from the given directories from the sources list +macro(ucm_remove_directories) + cmake_parse_arguments(ARG "" "FROM" "MATCHES" ${ARGN}) + + if("${ARG_UNPARSED_ARGUMENTS}" STREQUAL "") + message(FATAL_ERROR "Need to pass some relative directories to ucm_remove_directories()") + endif() + if(${ARG_FROM} STREQUAL "") + 
message(FATAL_ERROR "Need to pass FROM and a variable name to ucm_remove_directories()") + endif() + + foreach(cur_dir ${ARG_UNPARSED_ARGUMENTS}) + foreach(cur_file ${${ARG_FROM}}) + string(REGEX MATCH ${cur_dir} res ${cur_file}) + if(NOT "${res}" STREQUAL "") + if("${ARG_MATCHES}" STREQUAL "") + list(REMOVE_ITEM ${ARG_FROM} ${cur_file}) + else() + foreach(curr_ptrn ${ARG_MATCHES}) + string(REGEX MATCH ${curr_ptrn} res ${cur_file}) + if(NOT "${res}" STREQUAL "") + list(REMOVE_ITEM ${ARG_FROM} ${cur_file}) + break() + endif() + endforeach() + endif() + endif() + endforeach() + endforeach() +endmacro() + +# ucm_add_files_impl +macro(ucm_add_files_impl result trim files) + foreach(cur_file ${files}) + SET(${result} ${${result}} ${cur_file}) + get_filename_component(FILEPATH ${cur_file} PATH) + ucm_trim_front_words("${FILEPATH}" FILEPATH "${trim}") + # replacing forward slashes with back slashes so filters can be generated (back slash used in parsing...) + STRING(REPLACE "/" "\\" FILTERS "${FILEPATH}") + SOURCE_GROUP("${FILTERS}" FILES ${cur_file}) + endforeach() +endmacro() + +# ucm_add_files +# Adds files to a list of sources +macro(ucm_add_files) + cmake_parse_arguments(ARG "" "TO;FILTER_POP" "" ${ARGN}) + + if("${ARG_UNPARSED_ARGUMENTS}" STREQUAL "") + message(FATAL_ERROR "Need to pass some relative files to ucm_add_files()") + endif() + if(${ARG_TO} STREQUAL "") + message(FATAL_ERROR "Need to pass TO and a variable name to ucm_add_files()") + endif() + + if("${ARG_FILTER_POP}" STREQUAL "") + set(ARG_FILTER_POP 0) + endif() + + ucm_add_files_impl(${ARG_TO} ${ARG_FILTER_POP} "${ARG_UNPARSED_ARGUMENTS}") +endmacro() + +# ucm_add_dir_impl +macro(ucm_add_dir_impl result rec trim dirs_in additional_ext) + set(dirs "${dirs_in}") + + # handle the "" and "." cases + if("${dirs}" STREQUAL "" OR "${dirs}" STREQUAL ".") + set(dirs "./") + endif() + + foreach(cur_dir ${dirs}) + # to circumvent some linux/cmake/path issues - barely made it work... 
+ if(cur_dir STREQUAL "./") + set(cur_dir "") + else() + set(cur_dir "${cur_dir}/") + endif() + + # since unix is case sensitive - add these valid extensions too + # we don't use "UNIX" but instead "CMAKE_HOST_UNIX" because we might be cross + # compiling (for example emscripten) under windows and UNIX may be set to 1 + # Also OSX is case insensitive like windows... + set(additional_file_extensions "") + if(CMAKE_HOST_UNIX AND NOT APPLE) + set(additional_file_extensions + "${cur_dir}*.CPP" + "${cur_dir}*.C" + "${cur_dir}*.H" + "${cur_dir}*.HPP" + ) + endif() + + foreach(ext ${additional_ext}) + list(APPEND additional_file_extensions "${cur_dir}*.${ext}") + endforeach() + + # find all sources and set them as result + FILE(GLOB found_sources RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + # https://gcc.gnu.org/onlinedocs/gcc-4.4.1/gcc/Overall-Options.html#index-file-name-suffix-71 + # sources + "${cur_dir}*.cpp" + "${cur_dir}*.cxx" + "${cur_dir}*.c++" + "${cur_dir}*.cc" + "${cur_dir}*.cp" + "${cur_dir}*.c" + "${cur_dir}*.i" + "${cur_dir}*.ii" + # headers + "${cur_dir}*.h" + "${cur_dir}*.h++" + "${cur_dir}*.hpp" + "${cur_dir}*.hxx" + "${cur_dir}*.hh" + "${cur_dir}*.inl" + "${cur_dir}*.inc" + "${cur_dir}*.ipp" + "${cur_dir}*.ixx" + "${cur_dir}*.txx" + "${cur_dir}*.tpp" + "${cur_dir}*.tcc" + "${cur_dir}*.tpl" + ${additional_file_extensions}) + SET(${result} ${${result}} ${found_sources}) + + # set the proper filters + ucm_trim_front_words("${cur_dir}" cur_dir "${trim}") + # replacing forward slashes with back slashes so filters can be generated (back slash used in parsing...) 
+ STRING(REPLACE "/" "\\" FILTERS "${cur_dir}") + SOURCE_GROUP("${FILTERS}" FILES ${found_sources}) + endforeach() + + if(${rec}) + foreach(cur_dir ${dirs}) + ucm_dir_list("${cur_dir}" subdirs) + foreach(subdir ${subdirs}) + ucm_add_dir_impl(${result} ${rec} ${trim} "${cur_dir}/${subdir}" "${additional_ext}") + endforeach() + endforeach() + endif() +endmacro() + +# ucm_add_dirs +# Adds all files from directories traversing them recursively to a list of sources +# and generates filters according to their location (accepts relative paths only). +# Also this macro trims X times the front word from the filter string for visual studio filters. +macro(ucm_add_dirs) + cmake_parse_arguments(ARG "RECURSIVE" "TO;FILTER_POP" "ADDITIONAL_EXT" ${ARGN}) + + if(${ARG_TO} STREQUAL "") + message(FATAL_ERROR "Need to pass TO and a variable name to ucm_add_dirs()") + endif() + + if("${ARG_FILTER_POP}" STREQUAL "") + set(ARG_FILTER_POP 0) + endif() + + ucm_add_dir_impl(${ARG_TO} ${ARG_RECURSIVE} ${ARG_FILTER_POP} "${ARG_UNPARSED_ARGUMENTS}" "${ARG_ADDITIONAL_EXT}") +endmacro() + +# ucm_add_target +# Adds a target eligible for cotiring - unity build and/or precompiled header +macro(ucm_add_target) + cmake_parse_arguments(ARG "UNITY" "NAME;TYPE;PCH_FILE;CPP_PER_UNITY" "UNITY_EXCLUDED;SOURCES" ${ARGN}) + + if(NOT "${ARG_UNPARSED_ARGUMENTS}" STREQUAL "") + message(FATAL_ERROR "Unrecognized options passed to ucm_add_target()") + endif() + if("${ARG_NAME}" STREQUAL "") + message(FATAL_ERROR "Need to pass NAME and a name for the target to ucm_add_target()") + endif() + set(valid_types EXECUTABLE STATIC SHARED MODULE) + list(FIND valid_types "${ARG_TYPE}" is_type_valid) + if(${is_type_valid} STREQUAL "-1") + message(FATAL_ERROR "Need to pass TYPE and the type for the target [EXECUTABLE/STATIC/SHARED/MODULE] to ucm_add_target()") + endif() + if("${ARG_SOURCES}" STREQUAL "") + message(FATAL_ERROR "Need to pass SOURCES and a list of source files to ucm_add_target()") + endif() + + # init with 
the global unity flag + set(do_unity ${UCM_UNITY_BUILD}) + + # check the UNITY argument + if(NOT ARG_UNITY) + set(do_unity FALSE) + endif() + + # if target is excluded through the exclusion list + list(FIND UCM_UNITY_BUILD_EXCLUDE_TARGETS ${ARG_NAME} is_target_excluded) + if(NOT ${is_target_excluded} STREQUAL "-1") + set(do_unity FALSE) + endif() + + # unity build only for targets with > 1 source file (otherwise there will be an additional unnecessary target) + if(do_unity) # optimization + ucm_count_sources(${ARG_SOURCES} RESULT num_sources) + if(${num_sources} LESS 2) + set(do_unity FALSE) + endif() + endif() + + set(wanted_cotire ${do_unity}) + + # if cotire cannot be used + if(do_unity AND NOT ucm_with_cotire) + set(do_unity FALSE) + endif() + + # inform the developer that the current target might benefit from a unity build + if(NOT ARG_UNITY AND ${UCM_UNITY_BUILD}) + ucm_count_sources(${ARG_SOURCES} RESULT num_sources) + if(${num_sources} GREATER 1) + message(AUTHOR_WARNING "Target '${ARG_NAME}' may benefit from a unity build.\nIt has ${num_sources} sources - enable with UNITY flag") + endif() + endif() + + # prepare for the unity build + set(orig_target ${ARG_NAME}) + if(do_unity) + # the original target will be added with a different name than the requested + set(orig_target ${ARG_NAME}_ORIGINAL) + + # exclude requested files from unity build of the current target + foreach(excluded_file "${ARG_UNITY_EXCLUDED}") + set_source_files_properties(${excluded_file} PROPERTIES COTIRE_EXCLUDED TRUE) + endforeach() + endif() + + # add the original target + if(${ARG_TYPE} STREQUAL "EXECUTABLE") + add_executable(${orig_target} ${ARG_SOURCES}) + else() + add_library(${orig_target} ${ARG_TYPE} ${ARG_SOURCES}) + endif() + + if(do_unity) + # set the number of unity cpp files to be used for the unity target + if(NOT "${ARG_CPP_PER_UNITY}" STREQUAL "") + set_property(TARGET ${orig_target} PROPERTY COTIRE_UNITY_SOURCE_MAXIMUM_NUMBER_OF_INCLUDES "${ARG_CPP_PER_UNITY}") + else() 
+ set_property(TARGET ${orig_target} PROPERTY COTIRE_UNITY_SOURCE_MAXIMUM_NUMBER_OF_INCLUDES "100") + endif() + + if(NOT "${ARG_PCH_FILE}" STREQUAL "") + set_target_properties(${orig_target} PROPERTIES COTIRE_CXX_PREFIX_HEADER_INIT "${ARG_PCH_FILE}") + else() + set_target_properties(${orig_target} PROPERTIES COTIRE_ENABLE_PRECOMPILED_HEADER FALSE) + endif() + # add a unity target for the original one with the name intended for the original + set_target_properties(${orig_target} PROPERTIES COTIRE_UNITY_TARGET_NAME ${ARG_NAME}) + + # this is the library call that does the magic + cotire(${orig_target}) + set_target_properties(clean_cotire PROPERTIES FOLDER "CMakePredefinedTargets") + + # disable the original target and enable the unity one + get_target_property(unity_target_name ${orig_target} COTIRE_UNITY_TARGET_NAME) + set_target_properties(${orig_target} PROPERTIES EXCLUDE_FROM_ALL 1 EXCLUDE_FROM_DEFAULT_BUILD 1) + set_target_properties(${unity_target_name} PROPERTIES EXCLUDE_FROM_ALL 0 EXCLUDE_FROM_DEFAULT_BUILD 0) + + # also set the name of the target output as the original one + set_target_properties(${unity_target_name} PROPERTIES OUTPUT_NAME ${ARG_NAME}) + if(UCM_NO_COTIRE_FOLDER) + # reset the folder property so all unity targets dont end up in a single folder in the solution explorer of VS + set_target_properties(${unity_target_name} PROPERTIES FOLDER "") + endif() + set_target_properties(all_unity PROPERTIES FOLDER "CMakePredefinedTargets") + elseif(NOT "${ARG_PCH_FILE}" STREQUAL "") + set(wanted_cotire TRUE) + if(ucm_with_cotire) + set_target_properties(${orig_target} PROPERTIES COTIRE_ADD_UNITY_BUILD FALSE) + set_target_properties(${orig_target} PROPERTIES COTIRE_CXX_PREFIX_HEADER_INIT "${ARG_PCH_FILE}") + cotire(${orig_target}) + set_target_properties(clean_cotire PROPERTIES FOLDER "CMakePredefinedTargets") + endif() + endif() + + # print a message if the target was requested to be cotired but it couldn't + if(wanted_cotire AND NOT ucm_with_cotire) + 
if(NOT COMMAND cotire) + message(AUTHOR_WARNING "Target \"${ARG_NAME}\" not cotired because cotire isn't loaded") + else() + message(AUTHOR_WARNING "Target \"${ARG_NAME}\" not cotired because cotire is older than the required version") + endif() + endif() +endmacro() diff --git a/zconf.h.cmakein b/zconf.h.cmakein index 6b2282036..605ac8b19 100644 --- a/zconf.h.cmakein +++ b/zconf.h.cmakein @@ -512,4 +512,4 @@ typedef uLong FAR uLongf; #pragma map(inflate_copyright,"INCOPY") #endif -#endif /* ZCONF_H */ \ No newline at end of file +#endif /* ZCONF_H */ diff --git a/zconf.h.in b/zconf.h.in index 7faa2580b..7f6921c0e 100644 --- a/zconf.h.in +++ b/zconf.h.in @@ -512,4 +512,4 @@ typedef uLong FAR uLongf; #pragma map(inflate_copyright,"INCOPY") #endif -#endif /* ZCONF_H */ \ No newline at end of file +#endif /* ZCONF_H */ diff --git a/zlib.pc.cmakein b/zlib.pc.cmakein index 08fa39be8..3017c6955 100644 --- a/zlib.pc.cmakein +++ b/zlib.pc.cmakein @@ -10,4 +10,4 @@ Version: @ZLIB_VERSION@ Requires: Libs: -L${libdir} -L${sharedlibdir} -lz -Cflags: -I${includedir} \ No newline at end of file +Cflags: -I${includedir} From 59c0c27261a0c20c9e751b71f0ecce6cb352385b Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Wed, 29 Jan 2020 12:34:21 -0500 Subject: [PATCH 08/12] crc32_simd as separate file (https://github.com/cloudflare/zlib/pull/18) --- CMakeLists.txt | 33 +++---- Makefile.in | 8 +- crc32.c | 163 +-------------------------------- crc32_simd.c | 244 +++++++++++++++++++++++++++++++++++++++++++++++++ crc32_simd.h | 69 ++++++++++++++ 5 files changed, 339 insertions(+), 178 deletions(-) create mode 100755 crc32_simd.c create mode 100755 crc32_simd.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d29c326c..05273e05e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,22 +78,6 @@ elseif(MSVC) endif() endif() -# Assembly setting -if(UNIX) - check_c_compiler_flag(-mpclmul HAS_PCLMUL) - if(HAS_PCLMUL) - set(ENABLE_ASSEMBLY "PCLMUL" CACHE STRING "Choose assembly 
implementation.") - set_property(CACHE ENABLE_ASSEMBLY PROPERTY STRINGS "OFF;PCLMUL") - - if("${ENABLE_ASSEMBLY}" STREQUAL "PCLMUL") - set(ZLIB_ASMS contrib/amd64/crc32-pclmul_asm.S) - set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul") - add_definitions(-DHAS_PCLMUL) - endif() - endif() -endif() - #============================================================================ # zlib #============================================================================ @@ -131,6 +115,23 @@ set(ZLIB_SRCS zutil.c ) +# append "crc_simd.c" and compile with "mpclmul" if supported by compiler +if(UNIX) + check_c_compiler_flag(-mpclmul HAS_PCLMUL) + if(HAS_PCLMUL) + set(ENABLE_ASSEMBLY "PCLMUL" CACHE STRING "Choose assembly implementation.") + set_property(CACHE ENABLE_ASSEMBLY PROPERTY STRINGS "OFF;PCLMUL") + + if("${ENABLE_ASSEMBLY}" STREQUAL "PCLMUL") + #set(ZLIB_ASMS contrib/amd64/crc32-pclmul_asm.S) + #set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul") + list(APPEND ZLIB_SRCS crc32_simd.c) + add_definitions(-DHAS_PCLMUL) + endif() + endif() +endif() + if(BUILD_SHARED_LIBS) # Visibility check_c_compiler_flag(-fvisibility=hidden HAVE_HIDDEN) diff --git a/Makefile.in b/Makefile.in index 85e2153d6..5cbc87c09 100644 --- a/Makefile.in +++ b/Makefile.in @@ -54,10 +54,16 @@ pkgconfigdir = ${libdir}/pkgconfig OBJZ = adler32.o crc32.o deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o OBJG = compress.o uncompr.o gzclose.o gzlib.o gzread.o gzwrite.o -OBJC = $(OBJZ) $(OBJG) PIC_OBJZ = adler32.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo PIC_OBJG = compress.lo uncompr.lo gzclose.lo gzlib.lo gzread.lo gzwrite.lo + +ifneq ($(findstring -DHAS_PCLMUL, $(CFLAGS)),) + OBJZ += crc32_simd.o + PIC_OBJZ += crc32_simd.lo +endif + +OBJC = $(OBJZ) $(OBJG) PIC_OBJC = 
$(PIC_OBJZ) $(PIC_OBJG) # to use the asm code: make OBJA=match.o, PIC_OBJA=match.lo diff --git a/crc32.c b/crc32.c index 0db1fe21e..1ede402cc 100644 --- a/crc32.c +++ b/crc32.c @@ -22,9 +22,7 @@ */ #ifdef HAS_PCLMUL - #include - #include - #include + #include "crc32_simd.h" #include #endif @@ -278,161 +276,6 @@ local unsigned long crc32_generic(crc, buf, len) #ifdef HAS_PCLMUL -//https://github.com/webosose/chromium68/blob/master/src/third_party/zlib/crc32_simd.c -/* crc32_simd.c - * - * Copyright 2017 The Chromium Authors. All rights reserved. - * Use of this source code is governed by a BSD-style license that can be - * found in the Chromium source repository LICENSE file. - */ - // Copyright 2015 The Chromium Authors. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - /* - * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer - * length must be at least 64, and a multiple of 16. Based on: - * - * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" - * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 - */ - -#ifdef _MSC_VER -#define zalign(x) __declspec(align(x)) -#else -#define zalign(x) __attribute__((aligned((x)))) -#endif - -uint crc32_simd(unsigned char const *buf, size_t len, uInt crc) { - /* - * Definitions of the bit-reflected domain constants k1,k2,k3, etc and - * the CRC32+Barrett polynomials given at the end of the paper. - */ - static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 }; - static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e }; - static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 }; - static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; - __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; - /* - * There's at least one block of 64. - */ - x1 = _mm_loadu_si128((__m128i *)(buf + 0x00)); - x2 = _mm_loadu_si128((__m128i *)(buf + 0x10)); - x3 = _mm_loadu_si128((__m128i *)(buf + 0x20)); - x4 = _mm_loadu_si128((__m128i *)(buf + 0x30)); - x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc)); - x0 = _mm_load_si128((__m128i *)k1k2); - buf += 64; - len -= 64; - /* - * Parallel fold blocks of 64, if any. 
- */ - while (len >= 64) - { - x5 = _mm_clmulepi64_si128(x1, x0, 0x00); - x6 = _mm_clmulepi64_si128(x2, x0, 0x00); - x7 = _mm_clmulepi64_si128(x3, x0, 0x00); - x8 = _mm_clmulepi64_si128(x4, x0, 0x00); - x1 = _mm_clmulepi64_si128(x1, x0, 0x11); - x2 = _mm_clmulepi64_si128(x2, x0, 0x11); - x3 = _mm_clmulepi64_si128(x3, x0, 0x11); - x4 = _mm_clmulepi64_si128(x4, x0, 0x11); - y5 = _mm_loadu_si128((__m128i *)(buf + 0x00)); - y6 = _mm_loadu_si128((__m128i *)(buf + 0x10)); - y7 = _mm_loadu_si128((__m128i *)(buf + 0x20)); - y8 = _mm_loadu_si128((__m128i *)(buf + 0x30)); - x1 = _mm_xor_si128(x1, x5); - x2 = _mm_xor_si128(x2, x6); - x3 = _mm_xor_si128(x3, x7); - x4 = _mm_xor_si128(x4, x8); - x1 = _mm_xor_si128(x1, y5); - x2 = _mm_xor_si128(x2, y6); - x3 = _mm_xor_si128(x3, y7); - x4 = _mm_xor_si128(x4, y8); - buf += 64; - len -= 64; - } - /* - * Fold into 128-bits. - */ - x0 = _mm_load_si128((__m128i *)k3k4); - x5 = _mm_clmulepi64_si128(x1, x0, 0x00); - x1 = _mm_clmulepi64_si128(x1, x0, 0x11); - x1 = _mm_xor_si128(x1, x2); - x1 = _mm_xor_si128(x1, x5); - x5 = _mm_clmulepi64_si128(x1, x0, 0x00); - x1 = _mm_clmulepi64_si128(x1, x0, 0x11); - x1 = _mm_xor_si128(x1, x3); - x1 = _mm_xor_si128(x1, x5); - x5 = _mm_clmulepi64_si128(x1, x0, 0x00); - x1 = _mm_clmulepi64_si128(x1, x0, 0x11); - x1 = _mm_xor_si128(x1, x4); - x1 = _mm_xor_si128(x1, x5); - /* - * Single fold blocks of 16, if any. - */ - while (len >= 16) - { - x2 = _mm_loadu_si128((__m128i *)buf); - x5 = _mm_clmulepi64_si128(x1, x0, 0x00); - x1 = _mm_clmulepi64_si128(x1, x0, 0x11); - x1 = _mm_xor_si128(x1, x2); - x1 = _mm_xor_si128(x1, x5); - buf += 16; - len -= 16; - } - /* - * Fold 128-bits to 64-bits. 
- */ - x2 = _mm_clmulepi64_si128(x1, x0, 0x10); - x3 = _mm_setr_epi32(~0, 0, ~0, 0); - x1 = _mm_srli_si128(x1, 8); - x1 = _mm_xor_si128(x1, x2); - x0 = _mm_loadl_epi64((__m128i*)k5k0); - x2 = _mm_srli_si128(x1, 4); - x1 = _mm_and_si128(x1, x3); - x1 = _mm_clmulepi64_si128(x1, x0, 0x00); - x1 = _mm_xor_si128(x1, x2); - /* - * Barret reduce to 32-bits. - */ - x0 = _mm_load_si128((__m128i*)poly); - x2 = _mm_and_si128(x1, x3); - x2 = _mm_clmulepi64_si128(x2, x0, 0x10); - x2 = _mm_and_si128(x2, x3); - x2 = _mm_clmulepi64_si128(x2, x0, 0x00); - x1 = _mm_xor_si128(x1, x2); - /* - * Return the crc32. - */ - return _mm_extract_epi32(x1, 1); - -} - #define PCLMUL_MIN_LEN 64 #define PCLMUL_ALIGN 16 #define PCLMUL_ALIGN_MASK 15 @@ -455,7 +298,6 @@ int has_pclmul(void) { return cpu_has_pclmul; } - /* Function stolen from linux kernel 3.14. It computes the CRC over the given * buffer with initial CRC value . The buffer is byte in length, * and must be 16-byte aligned. @@ -482,8 +324,7 @@ uLong crc32(crc, buf, len) /* Go over 16-byte chunks */ //crc = crc32_pclmul_le_16(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); - crc = crc32_simd(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); - + crc = crc32_sse42_simd_(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); crc = crc ^ 0xffffffffUL; /* Handle the trailing partial chunk */ diff --git a/crc32_simd.c b/crc32_simd.c new file mode 100755 index 000000000..cc68d4fc3 --- /dev/null +++ b/crc32_simd.c @@ -0,0 +1,244 @@ +/* crc32_simd.c + * + * Copyright 2017 The Chromium Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the Chromium source repository LICENSE file. + */ + + +#include "crc32_simd.h" + +#if defined(CRC32_SIMD_SSE42_PCLMUL) + +/* + * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer + * length must be at least 64, and a multiple of 16. 
Based on: + * + * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 + */ + +#include +#include +#include + +uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */ + const unsigned char *buf, + z_size_t len, + uint32_t crc) +{ + /* + * Definitions of the bit-reflected domain constants k1,k2,k3, etc and + * the CRC32+Barrett polynomials given at the end of the paper. + */ + static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 }; + static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e }; + static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 }; + static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; + + __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; + + /* + * There's at least one block of 64. + */ + x1 = _mm_loadu_si128((__m128i *)(buf + 0x00)); + x2 = _mm_loadu_si128((__m128i *)(buf + 0x10)); + x3 = _mm_loadu_si128((__m128i *)(buf + 0x20)); + x4 = _mm_loadu_si128((__m128i *)(buf + 0x30)); + + x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc)); + + x0 = _mm_load_si128((__m128i *)k1k2); + + buf += 64; + len -= 64; + + /* + * Parallel fold blocks of 64, if any. 
+ */ + while (len >= 64) + { + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x6 = _mm_clmulepi64_si128(x2, x0, 0x00); + x7 = _mm_clmulepi64_si128(x3, x0, 0x00); + x8 = _mm_clmulepi64_si128(x4, x0, 0x00); + + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x2 = _mm_clmulepi64_si128(x2, x0, 0x11); + x3 = _mm_clmulepi64_si128(x3, x0, 0x11); + x4 = _mm_clmulepi64_si128(x4, x0, 0x11); + + y5 = _mm_loadu_si128((__m128i *)(buf + 0x00)); + y6 = _mm_loadu_si128((__m128i *)(buf + 0x10)); + y7 = _mm_loadu_si128((__m128i *)(buf + 0x20)); + y8 = _mm_loadu_si128((__m128i *)(buf + 0x30)); + + x1 = _mm_xor_si128(x1, x5); + x2 = _mm_xor_si128(x2, x6); + x3 = _mm_xor_si128(x3, x7); + x4 = _mm_xor_si128(x4, x8); + + x1 = _mm_xor_si128(x1, y5); + x2 = _mm_xor_si128(x2, y6); + x3 = _mm_xor_si128(x3, y7); + x4 = _mm_xor_si128(x4, y8); + + buf += 64; + len -= 64; + } + + /* + * Fold into 128-bits. + */ + x0 = _mm_load_si128((__m128i *)k3k4); + + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x2); + x1 = _mm_xor_si128(x1, x5); + + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x3); + x1 = _mm_xor_si128(x1, x5); + + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x4); + x1 = _mm_xor_si128(x1, x5); + + /* + * Single fold blocks of 16, if any. + */ + while (len >= 16) + { + x2 = _mm_loadu_si128((__m128i *)buf); + + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x2); + x1 = _mm_xor_si128(x1, x5); + + buf += 16; + len -= 16; + } + + /* + * Fold 128-bits to 64-bits. 
+ */ + x2 = _mm_clmulepi64_si128(x1, x0, 0x10); + x3 = _mm_setr_epi32(~0, 0, ~0, 0); + x1 = _mm_srli_si128(x1, 8); + x1 = _mm_xor_si128(x1, x2); + + x0 = _mm_loadl_epi64((__m128i*)k5k0); + + x2 = _mm_srli_si128(x1, 4); + x1 = _mm_and_si128(x1, x3); + x1 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_xor_si128(x1, x2); + + /* + * Barret reduce to 32-bits. + */ + x0 = _mm_load_si128((__m128i*)poly); + + x2 = _mm_and_si128(x1, x3); + x2 = _mm_clmulepi64_si128(x2, x0, 0x10); + x2 = _mm_and_si128(x2, x3); + x2 = _mm_clmulepi64_si128(x2, x0, 0x00); + x1 = _mm_xor_si128(x1, x2); + + /* + * Return the crc32. + */ + return _mm_extract_epi32(x1, 1); +} + +#elif defined(CRC32_ARMV8_CRC32) + +/* CRC32 checksums using ARMv8-a crypto instructions. + * + * TODO: implement a version using the PMULL instruction. + */ + +#if defined(__clang__) +/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an + * armv8 target, which is incompatible with ThinLTO optimizations on Android. + * (Namely, mixing and matching different module-level targets makes ThinLTO + * warn, and Android defaults to armv7-a. This restriction does not apply to + * function-level `target`s, however.) + * + * Since we only need four crc intrinsics, and since clang's implementation of + * those are just wrappers around compiler builtins, it's simplest to #define + * those builtins directly. If this #define list grows too much (or we depend on + * an intrinsic that isn't a trivial wrapper), we may have to find a better way + * to go about this. + * + * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized + * feature for this target (ignoring feature)." This appears to be a harmless + * bug in clang. 
+ */ +#define __crc32b __builtin_arm_crc32b +#define __crc32d __builtin_arm_crc32d +#define __crc32w __builtin_arm_crc32w +#define __crc32cw __builtin_arm_crc32cw + +#if defined(__aarch64__) +#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc"))) +#else // !defined(__aarch64__) +#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc"))) +#endif // defined(__aarch64__) + +#elif defined(__GNUC__) +/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not + * allowed. We can just include arm_acle.h. + */ +#include +#define TARGET_ARMV8_WITH_CRC +#else // !defined(__GNUC__) && !defined(_aarch64__) +#error ARM CRC32 SIMD extensions only supported for Clang and GCC +#endif + +TARGET_ARMV8_WITH_CRC +uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc, + const unsigned char *buf, + z_size_t len) +{ + uint32_t c = (uint32_t) ~crc; + + while (len && ((uintptr_t)buf & 7)) { + c = __crc32b(c, *buf++); + --len; + } + + const uint64_t *buf8 = (const uint64_t *)buf; + + while (len >= 64) { + c = __crc32d(c, *buf8++); + c = __crc32d(c, *buf8++); + c = __crc32d(c, *buf8++); + c = __crc32d(c, *buf8++); + + c = __crc32d(c, *buf8++); + c = __crc32d(c, *buf8++); + c = __crc32d(c, *buf8++); + c = __crc32d(c, *buf8++); + len -= 64; + } + + while (len >= 8) { + c = __crc32d(c, *buf8++); + len -= 8; + } + + buf = (const unsigned char *)buf8; + + while (len--) { + c = __crc32b(c, *buf++); + } + + return ~c; +} + +#endif \ No newline at end of file diff --git a/crc32_simd.h b/crc32_simd.h new file mode 100755 index 000000000..c8b350151 --- /dev/null +++ b/crc32_simd.h @@ -0,0 +1,69 @@ +//https://cs.chromium.org/chromium/src/third_party/zlib/crc32_simd.c +/* crc32_simd.h + * + * Copyright 2017 The Chromium Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the Chromium source repository LICENSE file. 
+ */ +#ifndef CRC32_SIMD_H +#define CRC32_SIMD_H + + +#include + +//#include "zconf.h" +//#include "zutil.h" +#include "deflate.h" + +//#ifndef local +// #define local static +//#endif + +//#ifndef z_crc_t +// #ifdef Z_U4 +// typedef Z_U4 z_crc_t; +// #else +// typedef unsigned long z_crc_t; +// #endif +//#endif +#ifdef HAS_PCLMUL + #define CRC32_SIMD_SSE42_PCLMUL +#endif + +#ifndef z_size_t + #define z_size_t size_t +#endif +#ifndef zalign + #ifdef _MSC_VER + #define zalign(x) __declspec(align(x)) + #else + #define zalign(x) __attribute__((aligned((x)))) + #endif +#endif + + + +/* + * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer + * length must be at least 64, and a multiple of 16. + */ +uint32_t ZLIB_INTERNAL crc32_sse42_simd_( + const unsigned char *buf, + z_size_t len, + uint32_t crc); + +/* + * crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c + * for computing the crc32 of an arbitrary length buffer. + */ +#define Z_CRC32_SSE42_MINIMUM_LENGTH 64 +#define Z_CRC32_SSE42_CHUNKSIZE_MASK 15 + +/* + * CRC32 checksums using ARMv8-a crypto instructions. 
+ */ +uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc, + const unsigned char* buf, + z_size_t len); + +#endif /* CRC32_SIMD_H */ \ No newline at end of file From dc28f91427b498ba3946b3c8a7bfe319edd5fb04 Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Fri, 31 Jan 2020 12:53:38 -0500 Subject: [PATCH 09/12] atomic and SKIP_CPUID_CHECK (https://github.com/cloudflare/zlib/pull/18) --- crc32.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/crc32.c b/crc32.c index 1ede402cc..2ed625b97 100644 --- a/crc32.c +++ b/crc32.c @@ -280,7 +280,7 @@ local unsigned long crc32_generic(crc, buf, len) #define PCLMUL_ALIGN 16 #define PCLMUL_ALIGN_MASK 15 -int cpu_has_pclmul = -1; //global: will be 0 or 1 after first test +_Atomic int cpu_has_pclmul = -1; //global: will be 0 or 1 after first test int has_pclmul(void) { if (cpu_has_pclmul >= 0) @@ -310,9 +310,12 @@ uLong crc32(crc, buf, len) const Bytef *buf; uInt len; { - if ((len < PCLMUL_MIN_LEN + PCLMUL_ALIGN - 1) || (!has_pclmul())) + if (len < PCLMUL_MIN_LEN + PCLMUL_ALIGN - 1) return crc32_generic(crc, buf, len); - + #ifndef SKIP_CPUID_CHECK + if (!has_pclmul()) + return crc32_generic(crc, buf, len); + #endif /* Handle the leading patial chunk */ uInt misalign = PCLMUL_ALIGN_MASK & ((unsigned long)buf); uInt sz = (PCLMUL_ALIGN - misalign) % PCLMUL_ALIGN; From b73befcc858ea4c5af1854af19be393de608b98e Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Sat, 1 Feb 2020 21:46:08 -0500 Subject: [PATCH 10/12] Remove unused code --- crc32.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/crc32.c b/crc32.c index 2ed625b97..28843762d 100644 --- a/crc32.c +++ b/crc32.c @@ -298,13 +298,6 @@ int has_pclmul(void) { return cpu_has_pclmul; } -/* Function stolen from linux kernel 3.14. It computes the CRC over the given - * buffer with initial CRC value . The buffer is byte in length, - * and must be 16-byte aligned. 
- */ -extern uint crc32_pclmul_le_16(unsigned char const *buffer, - size_t len, uInt crc32); - uLong crc32(crc, buf, len) uLong crc; const Bytef *buf; @@ -326,7 +319,6 @@ uLong crc32(crc, buf, len) } /* Go over 16-byte chunks */ - //crc = crc32_pclmul_le_16(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); crc = crc32_sse42_simd_(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL); crc = crc ^ 0xffffffffUL; From df5c4fe4b66b09833c64d64a9e1de6d42fab6b25 Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Fri, 7 Feb 2020 08:14:00 -0500 Subject: [PATCH 11/12] Only allow compiler to use clmul instructions to crc_simd unit (https://github.com/jtkukunas/zlib/pull/25) --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 05273e05e..de7e04f5d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,7 +125,8 @@ if(UNIX) if("${ENABLE_ASSEMBLY}" STREQUAL "PCLMUL") #set(ZLIB_ASMS contrib/amd64/crc32-pclmul_asm.S) #set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul") + set_source_files_properties(crc32_simd.c PROPERTIES COMPILE_FLAGS -mpclmul) list(APPEND ZLIB_SRCS crc32_simd.c) add_definitions(-DHAS_PCLMUL) endif() From c99aa7542dafff0c803d7858e195710032259de4 Mon Sep 17 00:00:00 2001 From: neurolabusc Date: Wed, 12 Feb 2020 12:15:02 -0500 Subject: [PATCH 12/12] Atomic does not compile on Ubuntu 14.04 --- crc32.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/crc32.c b/crc32.c index 28843762d..c3f16b9f1 100644 --- a/crc32.c +++ b/crc32.c @@ -280,7 +280,15 @@ local unsigned long crc32_generic(crc, buf, len) #define PCLMUL_ALIGN 16 #define PCLMUL_ALIGN_MASK 15 -_Atomic int cpu_has_pclmul = -1; //global: will be 0 or 1 after first test +#if defined(__GNUC__) + #if __GNUC__ < 5 + int cpu_has_pclmul = -1; //e.g. 
gcc 4.8.4 https://stackoverflow.com/questions/20326604/stdatomic-h-in-gcc-4-8 + #else + _Atomic int cpu_has_pclmul = -1; //global: will be 0 or 1 after first test + #endif +#else + _Atomic int cpu_has_pclmul = -1; //global: will be 0 or 1 after first test +#endif int has_pclmul(void) { if (cpu_has_pclmul >= 0)