From 5d0d49f8d06b80afa7ac957a58b3d7a810de53ad Mon Sep 17 00:00:00 2001 From: Swapnil Gaikwad Date: Thu, 25 Apr 2024 14:14:17 +0100 Subject: [PATCH 1/3] Refactor AdvSimd version of DecodeFromUTF8 --- .../src/System/Buffers/Text/Base64Decoder.cs | 147 +++++++++++------- 1 file changed, 90 insertions(+), 57 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs index 16ff227379d1a..9c9f83219264d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs @@ -864,73 +864,106 @@ private static unsafe void AdvSimdDecode(ref byte* srcBytes, ref byte* destBytes // but make sure that we quit before seeing any == markers at the end of the // string. 64 + 2 = 66 bytes. - Vector128 decLutOne1 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte(); - Vector128 decLutOne2 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte(); - Vector128 decLutOne3 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x3EFFFFFF, 0x3FFFFFFF).AsByte(); - Vector128 decLutOne4 = Vector128.Create(0x37363534, 0x3B3A3938, 0xFFFF3D3C, 0xFFFFFFFF).AsByte(); - Vector128 decLutTwo1 = Vector128.Create(0x0100FF00, 0x05040302, 0x09080706, 0x0D0C0B0A).AsByte(); - Vector128 decLutTwo2 = Vector128.Create(0x11100F0E, 0x15141312, 0x19181716, 0xFFFFFFFF).AsByte(); - Vector128 decLutTwo3 = Vector128.Create(0x1B1AFFFF, 0x1F1E1D1C, 0x23222120, 0x27262524).AsByte(); - Vector128 decLutTwo4 = Vector128.Create(0x2B2A2928, 0x2F2E2D2C, 0x33323130, 0xFFFFFFFF).AsByte(); - - Vector128 decOne1; - Vector128 decOne2; - Vector128 decOne3; - Vector128 decOne4; - Vector128 decTwo1; - Vector128 decTwo2; - Vector128 decTwo3; - Vector128 decTwo4; - Vector128 str1; - Vector128 str2; - Vector128 str3; - Vector128 str4; - Vector128 res1; - Vector128 res2; - 
Vector128 res3; + // In the decoding process, we want to map each byte, representing a Base64 value, to its 6-bit (0-63) representation. + // It uses the following mapping. Values outside the following groups are invalid, and we abort decoding when we encounter one. + // + // # From To Char + // 1 [43] [62] + + // 2 [47] [63] / + // 3 [48..57] [52..61] 0..9 + // 4 [65..90] [0..25] A..Z + // 5 [97..122] [26..51] a..z + // + // To map an input value to its Base64 representation, we use look-up tables 'decLutOne' and 'decLutTwo'. + // 'decLutOne' helps to map groups 1, 2 and 3 while 'decLutTwo' maps groups 4 and 5 in the above list. + // After mapping, each value falls between 0-63. Consequently, the last six bits of each byte now hold a valid value. + // We then compress four such bytes (with valid 4 * 6 = 24 bits) to three UTF8 bytes (3 * 8 = 24 bits). + // For faster decoding, we use SIMD operations that allow the processing of multiple bytes together. + // However, the compress operation on adjacent values of a vector could be slower. Thus, we de-interleave while reading the input bytes that store adjacent + // bytes in separate vectors. This later simplifies the compress step with the help of logical operations. + // This requires interleaving while storing the decoded result. 
+ + // Values in 'decLutOne' + // 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 + // 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 + // 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 63 + // 0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 + var decLutOne = (Vector128.AllBitsSet, Vector128.AllBitsSet, + Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x3EFFFFFF, 0x3FFFFFFF).AsByte(), + Vector128.Create(0x37363534, 0x3B3A3938, 0xFFFF3D3C, 0xFFFFFFFF).AsByte()); + + // Values in 'decLutTwo' + // 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255 + // 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39 + // 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255 + // 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255 + var decLutTwo = (Vector128.Create(0x0100FF00, 0x05040302, 0x09080706, 0x0D0C0B0A).AsByte(), + Vector128.Create(0x11100F0E, 0x15141312, 0x19181716, 0xFFFFFFFF).AsByte(), + Vector128.Create(0x1B1AFFFF, 0x1F1E1D1C, 0x23222120, 0x27262524).AsByte(), + Vector128.Create(0x2B2A2928, 0x2F2E2D2C, 0x33323130, 0xFFFFFFFF).AsByte()); byte* src = srcBytes; byte* dest = destBytes; - Vector128 offset = AdvSimd.DuplicateToVector128((byte)0x3F); - var decLutOne = (decLutOne1, decLutOne2, decLutOne3, decLutOne4); - var decLutTwo = (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4); + Vector128 offset = Vector128.Create(63); do { - // Load 64 bytes and de-interleave. + // Step 1: Load 64 bytes and de-interleave. AssertRead>(src, srcStart, sourceLength); - (str1, str2, str3, str4) = AdvSimd.Arm64.LoadVector128x4AndUnzip(src); - - // Get indices for second LUT: - decTwo1 = AdvSimd.SubtractSaturate(str1, offset); - decTwo2 = AdvSimd.SubtractSaturate(str2, offset); - decTwo3 = AdvSimd.SubtractSaturate(str3, offset); - decTwo4 = AdvSimd.SubtractSaturate(str4, offset); - - // Get values from first LUT. 
Out-of-range indices are set to 0. - decOne1 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str1); - decOne2 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str2); - decOne3 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str3); - decOne4 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str4); - - // Get values from second LUT. Out-of-range indices are unchanged. + var (str1, str2, str3, str4) = AdvSimd.Arm64.LoadVector128x4AndUnzip(src); + + // Step 2: Map each valid input to its Base64 value. + // We use two look-ups to compute partial results and combine them later. + + // Step 2.1: Detect valid Base64 values from the first three groups. Maps input as, + // 0 to 63 (Invalid) => 255 + // 0 to 63 (Valid) => Their Base64 equivalent + // 64 to 255 => 0 + + // Each input value acts as an index in the look-up table 'decLutOne'. + // e.g., for group 1: index 43 maps to 62 (Base64 '+'). + // Group 4 and 5 values are out-of-range (>64), so they are mapped to zero. + // Other valid indices but invalid values are mapped to 255. + Vector128 decOne1 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str1); + Vector128 decOne2 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str2); + Vector128 decOne3 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str3); + Vector128 decOne4 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str4); + + // Step 2.2: Detect valid Base64 values from groups 4 and 5. Maps input as, + // 0 to 63 => 0 + // 64 to 122 (Valid) => Their Base64 equivalent + // 64 to 126 (Invalid) => 255 + // 127 to 255 => Remains unchanged (index is outside the 64-entry table) + + // Subtract/offset each input value by 63 so that it can be used as a valid index into 'decLutTwo'. + // Subtract saturate makes values from the first three groups set to zero that are + // then mapped to zero in the subsequent look-up. 
+ Vector128 decTwo1 = AdvSimd.SubtractSaturate(str1, offset); + Vector128 decTwo2 = AdvSimd.SubtractSaturate(str2, offset); + Vector128 decTwo3 = AdvSimd.SubtractSaturate(str3, offset); + Vector128 decTwo4 = AdvSimd.SubtractSaturate(str4, offset); + + // We use VTBX to map values where out-of-range indices are unchanged. decTwo1 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo1, decLutTwo, decTwo1); decTwo2 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo2, decLutTwo, decTwo2); decTwo3 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo3, decLutTwo, decTwo3); decTwo4 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo4, decLutTwo, decTwo4); - // Invalid values are set to 255 during above look-ups using 'decLutTwo' and 'decLutTwo'. + // Step 3: Combine the partial results. + // Each look-up above maps valid values to their Base64 equivalent or zero. // Thus the intermediate results 'decOne' and 'decTwo' could be OR-ed to get final values. - str1 = decOne1 | decTwo1; - str2 = decOne2 | decTwo2; - str3 = decOne3 | decTwo3; - str4 = decOne4 | decTwo4; + str1 = (decOne1 | decTwo1); + str2 = (decOne2 | decTwo2); + str3 = (decOne3 | decTwo3); + str4 = (decOne4 | decTwo4); + // Step 4: Detect an invalid input value. + // Invalid values up to 126 are set to 255, while values of 127 and above keep their offset value (input - 63), which is still larger than 63. // Check for invalid input, any value larger than 63. - Vector128 classified = AdvSimd.CompareGreaterThan(str1, offset) - | AdvSimd.CompareGreaterThan(str2, offset) - | AdvSimd.CompareGreaterThan(str3, offset) - | AdvSimd.CompareGreaterThan(str4, offset); + Vector128 classified = (Vector128.GreaterThan(str1, offset) + | Vector128.GreaterThan(str2, offset) + | Vector128.GreaterThan(str3, offset) + | Vector128.GreaterThan(str4, offset)); // Check that all bits are zero. if (classified != Vector128.Zero) @@ -938,12 +971,12 @@ private static unsafe void AdvSimdDecode(ref byte* srcBytes, ref byte* destBytes break; } - // Compress four bytes into three. 
- res1 = AdvSimd.ShiftLeftLogical(str1, 2) | AdvSimd.ShiftRightLogical(str2, 4); - res2 = AdvSimd.ShiftLeftLogical(str2, 4) | AdvSimd.ShiftRightLogical(str3, 2); - res3 = AdvSimd.ShiftLeftLogical(str3, 6) | str4; + // Step 5: Compress four bytes into three. + Vector128 res1 = ((str1 << 2) | (str2 >> 4)); + Vector128 res2 = ((str2 << 4) | (str3 >> 2)); + Vector128 res3 = ((str3 << 6) | str4); - // Interleave and store decoded result. + // Step 6: Interleave and store decoded results. AssertWrite>(dest, destStart, destLength); AdvSimd.Arm64.StoreVector128x3AndZip(dest, (res1, res2, res3)); From 89e95d2d576159a694a85c804b167decdff0463d Mon Sep 17 00:00:00 2001 From: Swapnil Gaikwad Date: Tue, 7 May 2024 11:10:00 +0100 Subject: [PATCH 2/3] Refactor look-up table for readability --- .../src/System/Buffers/Text/Base64Decoder.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs index 9c9f83219264d..d81d978a2b10d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs @@ -888,7 +888,8 @@ private static unsafe void AdvSimdDecode(ref byte* srcBytes, ref byte* destBytes // 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 // 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 63 // 0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 - var decLutOne = (Vector128.AllBitsSet, Vector128.AllBitsSet, + var decLutOne = (Vector128.AllBitsSet, + Vector128.AllBitsSet, Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x3EFFFFFF, 0x3FFFFFFF).AsByte(), Vector128.Create(0x37363534, 0x3B3A3938, 0xFFFF3D3C, 0xFFFFFFFF).AsByte()); From 488b1d49d49d8d5515fc61796e347b525b54e34f Mon Sep 17 00:00:00 2001 From: Swapnil Gaikwad Date: Thu, 9 May 2024 11:44:25 +0100 
Subject: [PATCH 3/3] Fix the comments --- .../src/System/Buffers/Text/Base64Decoder.cs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs index d81d978a2b10d..777736e4a4deb 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs @@ -879,25 +879,25 @@ private static unsafe void AdvSimdDecode(ref byte* srcBytes, ref byte* destBytes // After mapping, each value falls between 0-63. Consequently, the last six bits of each byte now hold a valid value. // We then compress four such bytes (with valid 4 * 6 = 24 bits) to three UTF8 bytes (3 * 8 = 24 bits). // For faster decoding, we use SIMD operations that allow the processing of multiple bytes together. - // However, the compress operation on adjacent values of a vector could be slower. Thus, we de-interleave while reading the input bytes that store adjacent - // bytes in separate vectors. This later simplifies the compress step with the help of logical operations. - // This requires interleaving while storing the decoded result. + // However, the compress operation on adjacent values of a vector could be slower. Thus, we de-interleave the input bytes + // while reading them, so that adjacent bytes are stored in separate vectors. This later simplifies the compress step with + // the help of logical operations. In turn, the decoded result must be interleaved again while it is stored. - // Values in 'decLutOne' + // Values in 'decLutOne' map input values from 0 to 63. 
// 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 // 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 - // 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 63 - // 0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 + // 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63 + // 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255 var decLutOne = (Vector128.AllBitsSet, Vector128.AllBitsSet, Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x3EFFFFFF, 0x3FFFFFFF).AsByte(), Vector128.Create(0x37363534, 0x3B3A3938, 0xFFFF3D3C, 0xFFFFFFFF).AsByte()); - // Values in 'decLutTwo' + // Values in 'decLutTwo' map input values from 63 to 126 (table index = input - 63). + // 0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 // 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255 // 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39 // 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255 - // 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255 var decLutTwo = (Vector128.Create(0x0100FF00, 0x05040302, 0x09080706, 0x0D0C0B0A).AsByte(), Vector128.Create(0x11100F0E, 0x15141312, 0x19181716, 0xFFFFFFFF).AsByte(), Vector128.Create(0x1B1AFFFF, 0x1F1E1D1C, 0x23222120, 0x27262524).AsByte(),