Permalink
Cannot retrieve contributors at this time
197 lines (174 sloc)
7.8 KB
| // Copyright 2021 The Wuffs Authors. | |
| // | |
| // Licensed under the Apache License, Version 2.0 (the "License"); | |
| // you may not use this file except in compliance with the License. | |
| // You may obtain a copy of the License at | |
| // | |
| // https://www.apache.org/licenses/LICENSE-2.0 | |
| // | |
| // Unless required by applicable law or agreed to in writing, software | |
| // distributed under the License is distributed on an "AS IS" BASIS, | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| // See the License for the specific language governing permissions and | |
| // limitations under the License. | |
| // -------- | |
| // See "SIMD Implementations" in README.md for a link to Gopal et al. "Fast CRC | |
| // Computation for Generic Polynomials Using PCLMULQDQ Instruction". | |
| pri func ieee_hasher.up_x86_sse42!(x: slice base.u8), | |
| choose cpu_arch >= x86_sse42, | |
| { | |
| var s : base.u32 | |
| var p : slice base.u8 | |
| var util : base.x86_sse42_utility | |
| var k : base.x86_m128i | |
| var x0 : base.x86_m128i | |
| var x1 : base.x86_m128i | |
| var x2 : base.x86_m128i | |
| var x3 : base.x86_m128i | |
| var y0 : base.x86_m128i | |
| var y1 : base.x86_m128i | |
| var y2 : base.x86_m128i | |
| var y3 : base.x86_m128i | |
| var tail_index : base.u64 | |
| s = 0xFFFF_FFFF ^ this.state | |
| // Align to a 16-byte boundary. | |
| while (args.x.length() > 0) and ((15 & args.x.uintptr_low_12_bits()) <> 0) { | |
| s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8) | |
| args.x = args.x[1 ..] | |
| } endwhile | |
| // For short inputs, just do a simple loop. | |
| if args.x.length() < 64 { | |
| iterate (p = args.x)(length: 1, advance: 1, unroll: 1) { | |
| s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8) | |
| } | |
| this.state = 0xFFFF_FFFF ^ s | |
| return nothing | |
| } | |
| // Load 128×4 = 512 bits from the first 64-byte chunk. | |
| x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10]) | |
| x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20]) | |
| x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30]) | |
| x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40]) | |
| // Combine with the initial state. | |
| x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s)) | |
| // Process the remaining 64-byte chunks. | |
| k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K1K2[.. 16]) | |
| iterate (p = args.x[64 ..])(length: 64, advance: 64, unroll: 1) { | |
| y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00) | |
| y1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00) | |
| y2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x00) | |
| y3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x00) | |
| x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11) | |
| x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x11) | |
| x2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x11) | |
| x3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x11) | |
| x0 = x0._mm_xor_si128(b: y0)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x00 .. 0x10])) | |
| x1 = x1._mm_xor_si128(b: y1)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x10 .. 0x20])) | |
| x2 = x2._mm_xor_si128(b: y2)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x20 .. 0x30])) | |
| x3 = x3._mm_xor_si128(b: y3)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x30 .. 0x40])) | |
| } | |
| // Reduce 128×4 = 512 bits to 128 bits. | |
| k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K3K4[.. 16]) | |
| y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00) | |
| x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11) | |
| x0 = x0._mm_xor_si128(b: x1) | |
| x0 = x0._mm_xor_si128(b: y0) | |
| y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00) | |
| x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11) | |
| x0 = x0._mm_xor_si128(b: x2) | |
| x0 = x0._mm_xor_si128(b: y0) | |
| y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00) | |
| x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11) | |
| x0 = x0._mm_xor_si128(b: x3) | |
| x0 = x0._mm_xor_si128(b: y0) | |
| // Reduce 128 bits to 64 bits. | |
| x1 = x0._mm_clmulepi64_si128(b: k, imm8: 0x10) | |
| x2 = util.make_m128i_multiple_u32( | |
| a00: 0xFFFF_FFFF, | |
| a01: 0x0000_0000, | |
| a02: 0xFFFF_FFFF, | |
| a03: 0x0000_0000) | |
| x0 = x0._mm_srli_si128(imm8: 8) | |
| x0 = x0._mm_xor_si128(b: x1) | |
| k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K5ZZ[.. 16]) | |
| x1 = x0._mm_srli_si128(imm8: 4) | |
| x0 = x0._mm_and_si128(b: x2) | |
| x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00) | |
| x0 = x0._mm_xor_si128(b: x1) | |
| // Reduce 64 bits to 32 bits and extract. | |
| k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K6MU[.. 16]) | |
| x1 = x0._mm_and_si128(b: x2) | |
| x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x10) | |
| x1 = x1._mm_and_si128(b: x2) | |
| x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00) | |
| x0 = x0._mm_xor_si128(b: x1) | |
| s = x0._mm_extract_epi32(imm8: 1) | |
| // Handle the tail of args.x that wasn't a complete 64-byte chunk. | |
| tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFC0 // And-not 64. | |
| if tail_index < args.x.length() { | |
| iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) { | |
| s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8) | |
| } | |
| } | |
| this.state = 0xFFFF_FFFF ^ s | |
| } | |
| // These constants come from page 22 of Gopal et al. They are also reproduced | |
| // by script/print-crc32-x86-sse42-magic-numbers.go which is runnable online at | |
| // https://play.golang.org/p/wH1q6GfhKOE | |
| // | |
| // (†) There is a discrepancy in the k6' constant. The Gopal paper gives the | |
| // least significant byte as 0x40 and this is confirmed by the reproduction | |
| // script. Indeed, the formula on page 22 ends with a left-shift by 1, so the | |
| // final hex digit must be even. | |
| // | |
| // Nonetheless, we use 0x41 here, the same as the Linux kernel | |
| // (https://github.com/torvalds/linux/blob/v5.11/arch/x86/crypto/crc32-pclmul_asm.S#L72). | |
| // It's there in 2021 unchanged since that code was introduced in 2013 | |
| // (https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=78c37d191dd6899d8c219fee597a17d6e3c5d288). | |
| // | |
| // The Linux kernel's 0x41 (instead of 0x40) also turns up in e.g. Chromium | |
| // (https://source.chromium.org/chromium/chromium/src/+/master:third_party/zlib/crc32_simd.c;l=36;drc=9d4ec9349a1bf609eedb917c44c69eb0df9ff6bb) | |
| // and Go | |
| // (https://github.com/golang/go/blob/414fa8c35e7c2f65e2c767d6db2f25791e53b5c1/src/hash/crc32/crc32_amd64.s#L143). | |
| // The 0x41 has surely been copy/pasted to other software too. | |
| // | |
| // Code (0x41) and documentation (0x40) disagree. We went with code here, for | |
| // interoperability. Even though it's mathematical documentation, it's very | |
| // widely used code. | |
| // | |
| // For academic curiosity, there are at least two possible sources of the k6' | |
| // constant discrepancy. | |
| // | |
| // One is a transcription error. The Gopal paper has two consecutive lines: | |
| // k6' = ... = 0x1DB710640 | |
| // μ' = ... = 0x1F7011641 | |
| // The final hex digits are 640 and 641. | |
| // | |
| // Two is that there's also this line a little earlier on the same page: | |
| // P(x)' = 0x1DB710641 | |
| // This P(x)' number differs from k6' only in the final bit: 0x41 vs 0x40. This | |
| // isn't coincidental. Calculating k6' (in the Galois Field GF(2) space) means | |
| // dividing (i.e. bitwise XOR-ing) a 33-bit polynomial (x³², with only one 'on' | |
| // bit) by the 33-bit polynomial P(x)'. | |
| // | |
| // The discrepancy might not matter in practice because Barrett reduction is | |
| // already an integer approximation to a fraction. 0x40 vs 0x41 is essentially | |
| // rounding down vs up. The approximation diverges for large inputs but might | |
| // not matter for 'small' inputs that fit into e.g. a uint32 or uint64. The | |
| // tests seem to pass either way (and fail with 0x3F or 0x42). | |
| pri const IEEE_X86_SSE42_K1K2 : array[16] base.u8 = [ | |
| 0xD4, 0x2B, 0x44, 0x54, 0x01, 0x00, 0x00, 0x00, // k1' = 0x1_5444_2BD4 | |
| 0x96, 0x15, 0xE4, 0xC6, 0x01, 0x00, 0x00, 0x00, // k2' = 0x1_C6E4_1596 | |
| ] | |
| pri const IEEE_X86_SSE42_K3K4 : array[16] base.u8 = [ | |
| 0xD0, 0x97, 0x19, 0x75, 0x01, 0x00, 0x00, 0x00, // k3' = 0x1_7519_97D0 | |
| 0x9E, 0x00, 0xAA, 0xCC, 0x00, 0x00, 0x00, 0x00, // k4' = 0x0_CCAA_009E | |
| ] | |
| pri const IEEE_X86_SSE42_K5ZZ : array[16] base.u8 = [ | |
| 0x24, 0x61, 0xCD, 0x63, 0x01, 0x00, 0x00, 0x00, // k5' = 0x1_63CD_6124 | |
| 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Unused | |
| ] | |
| pri const IEEE_X86_SSE42_K6MU : array[16] base.u8 = [ | |
| 0x41, 0x06, 0x71, 0xDB, 0x01, 0x00, 0x00, 0x00, // k6' = 0x1_DB71_0640 (†) | |
| 0x41, 0x16, 0x01, 0xF7, 0x01, 0x00, 0x00, 0x00, // μ' = 0x1_F701_1641 | |
| ] |