Vectorize Convert.ToBase64String using SSSE3 #21833
Changes from 20 commits
d218652
8217470
d0d89ca
1bf78f5
1c187ea
3fcdabf
6a5e3af
ed74c5b
02547ba
9b8c9d1
2ab1c5e
8acc598
df53ee9
d9204b4
36eb502
f29eab7
cd42c3c
72ea550
aef8747
a281f2a
1f80164
3239269
d772632
77207a2
55c7dac
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
|
@@ -12,6 +12,8 @@ | |
using System.Security; | ||
using System.Diagnostics; | ||
using System.Diagnostics.CodeAnalysis; | ||
using System.Runtime.Intrinsics.X86; | ||
using System.Runtime.Intrinsics; | ||
|
||
namespace System | ||
{ | ||
|
@@ -2531,21 +2533,148 @@ public static unsafe bool TryToBase64Chars(ReadOnlySpan<byte> bytes, Span<char> | |
charsWritten = ConvertToBase64Array(outChars, inData, 0, bytes.Length, insertLineBreaks); | ||
return true; | ||
} | ||
} | ||
|
||
// Byte-shuffle (pshufb) control used by ConvertToBase64ArraySsse3 to regroup the input: | ||
// for each of the four 3-byte groups held in source bytes 0..11 it emits that group's | ||
// bytes in the order 1, 0, 2, 1 into one 32-bit lane. Source bytes 12..15 are never selected. | ||
internal static readonly Vector128<byte> s_base64ShuffleMask = Vector128.Create((byte) | ||
1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10); | ||
|
||
// Lookup table of offsets that ConvertToBase64ArraySsse3 adds to a 6-bit index to obtain | ||
// its Base64 character code. The slot is chosen by the lookup code in that method: | ||
//   slot 0      : 'a' - 26  (indices 26..51 -> 'a'..'z') | ||
//   slots 1..10 : '0' - 52  (indices 52..61 -> '0'..'9') | ||
//   slot 11     : '+' - 62  (index 62) | ||
//   slot 12     : '/' - 63  (index 63) | ||
//   slot 13     : 'A'       (indices 0..25 -> 'A'..'Z') | ||
//   slots 14-15 : 0         (not produced by the lookup code) | ||
internal static readonly Vector128<byte> s_base64ShiftLut = Vector128.Create( | ||
(sbyte)'a' - 26, (sbyte)'0' - 52, | ||
(sbyte)'0' - 52, (sbyte)'0' - 52, | ||
(sbyte)'0' - 52, (sbyte)'0' - 52, | ||
(sbyte)'0' - 52, (sbyte)'0' - 52, | ||
(sbyte)'0' - 52, (sbyte)'0' - 52, | ||
(sbyte)'0' - 52, (sbyte)'+' - 62, | ||
(sbyte)'/' - 63, (sbyte)'A', 0, 0).AsByte(); | ||
|
||
// Byte-shuffle (pshufb) control that widens the low 8 bytes of a vector to 8 two-byte chars: | ||
// source byte k is placed at output byte 2k and every odd output byte uses control 0x80, | ||
// whose set high bit makes pshufb write zero there. | ||
internal static readonly Vector128<byte> s_base64TwoBytesStringMaskLo = Vector128.Create( | ||
0, 0x80, 1, 0x80, | ||
2, 0x80, 3, 0x80, | ||
4, 0x80, 5, 0x80, | ||
6, 0x80, 7, 0x80); | ||
|
||
// Based on "Base64 encoding with SIMD instructions" article by Wojciech Muła http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html (see THIRD-PARTY-NOTICES.txt) | ||
// The original code can be found here: https://github.com/WojciechMula/base64simd/blob/master/encode/encode.sse.cpp (and lookup_pshufb_improved as a lookup function) | ||
// Vectorized part of the Base64 encoder: consumes the input 12 bytes (stride) at a time and | ||
// writes 16 two-byte chars per iteration to outChars, optionally inserting "\r\n" once | ||
// charcount reaches base64LineBreakPosition. | ||
// Returns (i, j, charcount) so that the scalar loop in the caller can continue: | ||
//   i         - index of the first input byte that was not consumed | ||
//   j         - number of chars written to outChars | ||
//   charcount - chars written on the current line (only updated when insertLineBreaks is true) | ||
// Note the parameter order (length, offset) is the reverse of ConvertToBase64Array's (offset, length). | ||
// NOTE(review): i starts at offset but the loop is bounded by length itself, not offset + length, | ||
// whereas the caller treats length as a count from offset; the two agree only when offset == 0 - confirm. | ||
private static unsafe (int i, int j, int charcount) ConvertToBase64ArraySsse3(char* outChars, byte* inData, int length, int offset, bool insertLineBreaks) | ||
EgorBo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
int i = offset, j = 0, charcount = 0; | ||
const int stride = 4 * 3; | ||
|
||
// The output is addressed in bytes inside the loop; j is converted back to chars before returning. | ||
byte* outputBytes = (byte*)outChars; | ||
|
||
// tt0 / tt2: bit masks that isolate the (a, c) and (b, d) 6-bit fields of each shuffled 32-bit lane. | ||
// tt1: per-16-bit multipliers for MultiplyHigh (x * 0x40 >> 16 == x >> 10, x * 0x400 >> 16 == x >> 6). | ||
// tt3: per-16-bit multipliers for MultiplyLow (x * 0x10 == x << 4, x * 0x100 == x << 8). | ||
// tt5 / tt7 / tt8: the constants 51, 26 and 13 used by the lookup step to pick a slot of localShiftLut. | ||
Vector128<byte> tt0 = Vector128.Create(0x0fc0fc00).AsByte(); | ||
Vector128<ushort> tt1 = Vector128.Create(0x04000040).AsUInt16(); | ||
Vector128<byte> tt2 = Vector128.Create(0x003f03f0).AsByte(); | ||
Vector128<ushort> tt3 = Vector128.Create(0x01000010).AsUInt16(); | ||
Vector128<byte> tt5 = Vector128.Create((byte)51); | ||
Vector128<sbyte> tt7 = Vector128.Create((sbyte)26); | ||
Vector128<byte> tt8 = Vector128.Create((byte)13); | ||
|
||
// static readonly Vector128 field + assigning its value to a local variable is a C# pattern for `const __mX` | ||
Vector128<byte> localShiftLut = s_base64ShiftLut; | ||
Vector128<byte> localShuffleMask = s_base64ShuffleMask; | ||
Vector128<byte> localTwoBytesStringMaskLo = s_base64TwoBytesStringMaskLo; | ||
|
||
for (; i <= length - stride; i += stride) | ||
{ | ||
// input = [xxxx|DDDC|CCBB|BAAA] | ||
// NOTE(review): this loads 16 bytes although only 12 are consumed (the shuffle mask selects | ||
// bytes 0..11). When i == length - stride the load covers 4 bytes past inData + length - | ||
// confirm that memory is always readable, or tighten the loop bound. | ||
Vector128<byte> inputVector = Sse2.LoadVector128(inData + i); | ||
EgorBo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// bytes from groups A, B and C are needed in separate 32-bit lanes | ||
// in = [DDDD|CCCC|BBBB|AAAA] | ||
// | ||
// an input triplet has layout | ||
// [????????|ccdddddd|bbbbcccc|aaaaaabb] | ||
// byte 3 byte 2 byte 1 byte 0 -- byte 3 comes from the next triplet | ||
// | ||
// shuffling changes the order of bytes: 1, 0, 2, 1 | ||
// [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] | ||
// ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^ | ||
// processed bits | ||
inputVector = Ssse3.Shuffle(inputVector, localShuffleMask); | ||
|
||
// unpacking | ||
|
||
// t0 = [0000cccc|cc000000|aaaaaa00|00000000] | ||
EgorBo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Vector128<byte> t0 = Sse2.And(inputVector, tt0); | ||
// t1 = [00000000|00cccccc|00000000|00aaaaaa] | ||
Vector128<byte> t1 = Sse2.MultiplyHigh(t0.AsUInt16(), tt1).AsByte(); | ||
// t2 = [00000000|00dddddd|000000bb|bbbb0000] | ||
Vector128<byte> t2 = Sse2.And(inputVector, tt2); | ||
// t3 = [00dddddd|00000000|00bbbbbb|00000000] | ||
Vector128<byte> t3 = Sse2.MultiplyLow(t2.AsUInt16(), tt3).AsByte(); | ||
// indices = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: the paper looks to differentiate between There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure I follow, the comments are 100% copied from https://github.com/WojciechMula/base64simd/blob/master/encode/encode.sse.cpp#L20-L59 🙂 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Aren't we then missing the appropriate 3rd-party notice information, copying the relevant licensing information into this file, etc.? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated the THIRD-PARTY-NOTICES.txt There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I was commenting that the paper, in some places, differentiates between uppercase For example, the paper calls out:
This looks to be particularly important for tracking which bits flow where There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would still be nice to see this addressed. |
||
Vector128<byte> indices = Sse2.Or(t1, t3); | ||
|
||
// lookup function "Single pshufb method" (lookup_pshufb_improved) | ||
// result becomes a slot number of localShiftLut: 13 for indices < 26, 0 for 26..51, index - 51 otherwise | ||
Vector128<byte> result = Sse2.SubtractSaturate(indices, tt5); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Any reason this isn't a static local function (since it was a separate function in the original algorithm)? Inlining? |
||
Vector128<sbyte> compareResult = Sse2.CompareGreaterThan(tt7, indices.AsSByte()); | ||
result = Sse2.Or(result, Sse2.And(compareResult.AsByte(), tt8)); | ||
result = Ssse3.Shuffle(localShiftLut, result); | ||
result = Sse2.Add(result, indices); | ||
// end of lookup function | ||
|
||
// save as two-bytes string, e.g.: | ||
// 1,2,3,4,5..16 => 1,0,2,0,3,0..16,0 | ||
// (this first store covers result bytes 0..7; bytes 8..15 are handled below) | ||
Sse2.Store(outputBytes + j, Ssse3.Shuffle(result, localTwoBytesStringMaskLo)); | ||
j += Vector128<byte>.Count; | ||
|
||
// Do the same for the upper half of the vector (swap the 64-bit halves first so that localTwoBytesStringMaskLo can be re-used) | ||
result = Sse2.Shuffle(result.AsUInt32(), 0x4E /*_MM_SHUFFLE(1,0,3,2)*/).AsByte(); | ||
result = Ssse3.Shuffle(result, localTwoBytesStringMaskLo); | ||
|
||
// charcount is only advanced when insertLineBreaks is true (short-circuit evaluation) | ||
if (insertLineBreaks && (charcount += 16) >= base64LineBreakPosition) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would move the case with This may also prevent some spills in the simd-registers (if there are any). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't notice any noticeable performance regressions after I added this block for any values when There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Having the side effect only hit if It would be nice to move the |
||
{ | ||
// Normally we save 32 bytes per iteration | ||
// but `insertLineBreaks` needs `\r\n` (4 bytes) between each 76*2=152 bytes. 152/32 = 4.75 (i.e. not a multiple of 32) | ||
// we need to insert `\r\n` in the middle of Vector128<byte> somehow | ||
// but the following code just saves a half of the vector, then appends `\r\n` manually | ||
// and the second part of the vector is ignored (this is why 'i' is decremented) | ||
charcount = 0; | ||
var shuffleResult = result.AsUInt64(); | ||
// only the low 8 bytes (4 chars) of this vector are written | ||
Sse2.StoreLow((ulong*)(outputBytes + j), shuffleResult); | ||
EgorBo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
j += Vector128<byte>.Count / 2; | ||
outputBytes[j++] = (byte)'\r'; | ||
outputBytes[j++] = 0; | ||
outputBytes[j++] = (byte)'\n'; | ||
outputBytes[j++] = 0; | ||
// the 4 chars that were not written correspond to 3 input bytes, so step back by stride / 4 == 3 | ||
i -= stride / 4; | ||
} | ||
else | ||
{ | ||
Sse2.Store(outputBytes + j, result); | ||
j += Vector128<byte>.Count; | ||
} | ||
} | ||
// The SIMD loop uses `j` to count bytes; the software fallback uses it to count chars | ||
j /= 2; | ||
|
||
return (i, j, charcount); | ||
} | ||
|
||
private static unsafe int ConvertToBase64Array(char* outChars, byte* inData, int offset, int length, bool insertLineBreaks) | ||
{ | ||
int charcount = 0; | ||
int i = offset; | ||
int j = 0; | ||
|
||
if (Ssse3.IsSupported && length - offset >= 36) | ||
{ | ||
// Returning a tuple is faster than passing i, j and charcount by ref. | ||
// The SSSE3 implementation is moved to a separate method in order to avoid a regression for smaller inputs | ||
(i, j, charcount) = ConvertToBase64ArraySsse3(outChars, inData, length, offset, insertLineBreaks); | ||
if (i == length) | ||
return j; | ||
} | ||
|
||
int lengthmod3 = length % 3; | ||
int calcLength = offset + (length - lengthmod3); | ||
int j = 0; | ||
int charcount = 0; | ||
//Convert three bytes at a time to base64 notation. This will consume 4 chars. | ||
int i; | ||
|
||
// get a pointer to the base64Table to avoid unnecessary range checking | ||
fixed (char* base64 = &base64Table[0]) | ||
{ | ||
for (i = offset; i < calcLength; i += 3) | ||
for (; i < calcLength; i += 3) | ||
{ | ||
if (insertLineBreaks) | ||
{ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A short comment describing each constant would be useful.
It's also not clear why these are static readonly, but several of the others (such as `tt0`-`tt8`) are not.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given https://github.com/dotnet/coreclr/issues/17225 and https://github.com/dotnet/coreclr/issues/26976, it would be more efficient, both processing- and space-wise, to use the `ROS<byte>` (`ReadOnlySpan<byte>`) read-only property trick on these, especially since they're only used by code behind a `Ssse3.IsSupported` check.