From d218652502212c7c0403c28ce9ad99f07c182b53 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sun, 6 Jan 2019 04:44:04 +0300 Subject: [PATCH 01/19] Vectorize Convert.ToBase64String --- .../System.Private.CoreLib.Shared.projitems | 1 + .../shared/System/Convert.Base64.Avx2.cs | 75 +++++++++++++++++++ .../shared/System/Convert.cs | 12 ++- 3 files changed, 86 insertions(+), 2 deletions(-) create mode 100644 src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs diff --git a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems index 1308424cfb1f..0e33e93928e9 100644 --- a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems +++ b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems @@ -171,6 +171,7 @@ + diff --git a/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs b/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs new file mode 100644 index 000000000000..bc7e11676074 --- /dev/null +++ b/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs @@ -0,0 +1,75 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace System +{ + public static partial class Convert + { + private static unsafe void EncodeBase64Avx(byte* input, int inputLength, byte* output, int outputLength) + { + // Based on "Base64 encoding with SIMD instructions" article by Wojciech Muła + // http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html + // Encode - https://github.com/WojciechMula/base64simd/blob/master/encode/encode.avx2.cpp + // Lookup - https://github.com/WojciechMula/base64simd/blob/master/encode/lookup.avx2.cpp (lookup_pshufb_improved) + + byte* outputCurrent = output; + for (int i = 0; i < inputLength; i += 2 * 4 * 3) + { + Vector128 lo = Sse2.LoadVector128(input + i); + Vector128 hi = Sse2.LoadVector128(input + i + 4 * 3); + + Vector256 @in = Avx2.Shuffle(Avx2.InsertVector128(lo.ToVector256(), hi, 1), s_shuffleMask); + + Vector256 t0 = Avx2.And(@in, Vector256.Create(0x0fc0fc00).AsByte()); + Vector256 t1 = Avx2.MultiplyHigh(t0.AsUInt16(), Vector256.Create(0x04000040).AsUInt16()); + Vector256 t2 = Avx2.And(@in, Vector256.Create(0x003f03f0).AsByte()); + Vector256 t3 = Avx2.MultiplyLow(t2.AsUInt16(), Vector256.Create(0x01000010).AsUInt16()); + + Vector256 indices = Avx2.Or(t1, t3); + + Vector256 result = LookupPshufb(indices.AsByte()); + Avx.Store(outputCurrent, result); + outputCurrent += Vector256.Count; + } + + // Add padding ('=') + int lengthmod3 = (inputLength) % 3; + if (lengthmod3 == 1) + { + output[outputLength - 2] = 61; + output[outputLength - 1] = 61; + } + else if (lengthmod3 == 2) + { + output[outputLength - 1] = 61; + } + } + + private static Vector256 s_shuffleMask = Vector256.Create((byte) + 1, 0, 2, 1, 4, 3, 5, 4, + 7, 6, 8, 7, 10, 9, 11, 10, + 1, 0, 2, 1, 4, 3, 5, 4, + 7, 6, 8, 7, 10, 9, 11, 10); + + private static Vector256 s_shiftLut = Vector256.Create( + 18229723555195321415UL, + 72503040736508UL, + 18229723555195321415UL, + 72503040736508UL); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 LookupPshufb(Vector256 input) + { + Vector256 result = Avx2.SubtractSaturate(input, Vector256.Create((byte)51)); + Vector256 less = Avx2.CompareGreaterThan(Vector256.Create((sbyte)26), input.AsSByte()); + result = Avx2.Or(result, Avx2.And(less.AsByte(), Vector256.Create((byte)13))); + result = Avx2.Shuffle(s_shiftLut.AsByte(), result.AsByte()); + return Avx2.Add(result, input); + } + } +} diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs index 077dda9d4ff0..a5ed7a4a105f 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.cs @@ -11,6 +11,7 @@ using System.Runtime.Versioning; using System.Security; using System.Diagnostics; +using System.Runtime.Intrinsics.X86; namespace System { @@ -2428,8 +2429,15 @@ public static string ToBase64String(ReadOnlySpan bytes, Base64FormattingOp fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes)) fixed (char* charsPtr = result) { - int charsWritten = ConvertToBase64Array(charsPtr, bytesPtr, 0, bytes.Length, insertLineBreaks); - Debug.Assert(result.Length == charsWritten, $"Expected {result.Length} == {charsWritten}"); + if (bytes.Length >= 32 && Avx2.IsSupported) + { + EncodeBase64Avx(bytesPtr, bytes.Length, (byte*)charsPtr, result.Length); + } + else + { + int charsWritten = ConvertToBase64Array(charsPtr, bytesPtr, 0, bytes.Length, insertLineBreaks); + Debug.Assert(result.Length == charsWritten, $"Expected {result.Length} == {charsWritten}"); + } } } From 8217470179ebb3bccc3765f3aebd92df752f4db1 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sun, 6 Jan 2019 19:45:15 +0300 Subject: [PATCH 02/19] Fallback to ConvertToBase64Array for corner cases --- .../shared/System/Convert.Base64.Avx2.cs | 73 +++++++++++++------ .../shared/System/Convert.cs | 4 +- 2 files changed, 51 insertions(+), 26 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs b/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs index bc7e11676074..0e2634ee5893 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs @@ -18,12 +18,29 @@ private static unsafe void EncodeBase64Avx(byte* input, int inputLength, byte* o // Lookup - https://github.com/WojciechMula/base64simd/blob/master/encode/lookup.avx2.cpp (lookup_pshufb_improved) byte* outputCurrent = output; - for (int i = 0; i < inputLength; i += 2 * 4 * 3) + int stride = 2 * 4 * 3; + + Vector256 shiftLut = Vector256.Create( + 18229723555195321415UL, 72503040736508UL, + 18229723555195321415UL, 72503040736508UL); + + Vector256 asciiToStringMask = Vector256.Create((byte) + 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, + 4, 0x80, 5, 0x80, 6, 0x80, 7, 0x80, + 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, + 4, 0x80, 5, 0x80, 6, 0x80, 7, 0x80); + + Vector256 shuffleMask = Vector256.Create((byte) + 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10, + 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10); + + int i = 0; + for (; i < inputLength - stride - 1; i += stride) { Vector128 lo = Sse2.LoadVector128(input + i); - Vector128 hi = Sse2.LoadVector128(input + i + 4 * 3); + Vector128 hi = Sse2.LoadVector128(input + i + stride / 2); - Vector256 @in = Avx2.Shuffle(Avx2.InsertVector128(lo.ToVector256(), hi, 1), s_shuffleMask); + Vector256 @in = Avx2.Shuffle(Avx2.InsertVector128(lo.ToVector256(), hi, 1), shuffleMask); Vector256 t0 = Avx2.And(@in, Vector256.Create(0x0fc0fc00).AsByte()); Vector256 t1 = Avx2.MultiplyHigh(t0.AsUInt16(), Vector256.Create(0x04000040).AsUInt16()); @@ -31,44 +48,52 @@ private static unsafe void EncodeBase64Avx(byte* input, int inputLength, byte* o Vector256 t3 = Avx2.MultiplyLow(t2.AsUInt16(), Vector256.Create(0x01000010).AsUInt16()); Vector256 indices = Avx2.Or(t1, t3); + Vector256 result = LookupPshufb(indices.AsByte(), shiftLut.AsByte()); - Vector256 result = LookupPshufb(indices.AsByte()); - Avx.Store(outputCurrent, result); - outputCurrent += Vector256.Count; + // Convert ASCII to 2-byte string + // E.g. "1,2,3,4,5..." to "1,0,2,0,3,0,4,0,5,0..." + // I am not sure it's the most efficient way to do it: + + var resultLeft = Avx2.Shuffle(Avx2.Permute4x64(result.AsUInt64(), 0x10 /* _MM_SHUFFLE(0, 1, 0, 0) */).AsByte(), asciiToStringMask); + var resultRight = Avx2.Shuffle(Avx2.Permute4x64(result.AsUInt64(), 0x32 /* _MM_SHUFFLE(0, 3, 0, 2) */).AsByte(), asciiToStringMask); + + Avx.Store(outputCurrent, resultLeft); + Avx.Store(outputCurrent + Vector256.Count, resultRight); + outputCurrent += Vector256.Count * 2; } - // Add padding ('=') + // Handle cases when inputLength is not a multiple of 24 + if (i != inputLength) + { + ConvertToBase64Array((char*)outputCurrent, input, i, inputLength - i, false); + return; + } + + // or the result needs '='-paddings (when inputLength % 3 != 0) + // we can make it ConvertToBase64Array's responsibility too int lengthmod3 = (inputLength) % 3; if (lengthmod3 == 1) { + output[outputLength - 4] = 61; + output[outputLength - 3] = 0; output[outputLength - 2] = 61; - output[outputLength - 1] = 61; + output[outputLength - 1] = 0; } else if (lengthmod3 == 2) { - output[outputLength - 1] = 61; + output[outputLength - 2] = 61; + output[outputLength - 1] = 0; } } - private static Vector256 s_shuffleMask = Vector256.Create((byte) - 1, 0, 2, 1, 4, 3, 5, 4, - 7, 6, 8, 7, 10, 9, 11, 10, - 1, 0, 2, 1, 4, 3, 5, 4, - 7, 6, 8, 7, 10, 9, 11, 10); - - private static Vector256 s_shiftLut = Vector256.Create( - 18229723555195321415UL, - 72503040736508UL, - 18229723555195321415UL, - 72503040736508UL); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector256 LookupPshufb(Vector256 input) + private static Vector256 LookupPshufb(Vector256 input, Vector256 shiftLut) { Vector256 result = Avx2.SubtractSaturate(input, Vector256.Create((byte)51)); - Vector256 less = Avx2.CompareGreaterThan(Vector256.Create((sbyte)26), input.AsSByte()); + Vector256 less = Avx2.CompareGreaterThan(Vector256.Create((sbyte)26), input.AsSByte()); result = Avx2.Or(result, Avx2.And(less.AsByte(), Vector256.Create((byte)13))); - result = Avx2.Shuffle(s_shiftLut.AsByte(), result.AsByte()); + + result = Avx2.Shuffle(shiftLut, result.AsByte()); return Avx2.Add(result, input); } } diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs index a5ed7a4a105f..9424856dba26 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.cs @@ -2429,9 +2429,9 @@ public static string ToBase64String(ReadOnlySpan bytes, Base64FormattingOp fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes)) fixed (char* charsPtr = result) { - if (bytes.Length >= 32 && Avx2.IsSupported) + if (bytes.Length >= 48 && Avx2.IsSupported) { - EncodeBase64Avx(bytesPtr, bytes.Length, (byte*)charsPtr, result.Length); + EncodeBase64Avx(bytesPtr, bytes.Length, (byte*)charsPtr, result.Length * 2); } else { From d0d89caa6e24546c998c40cf05df231ae4030d11 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sun, 6 Jan 2019 20:58:20 +0300 Subject: [PATCH 03/19] Only Base64FormattingOptions.None is supported so far --- .../shared/System/Convert.Base64.Avx2.cs | 30 ++++++++++++------- .../shared/System/Convert.cs | 2 +- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs b/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs index 0e2634ee5893..44f00452649a 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs @@ -8,8 +8,25 @@ namespace System { - public static partial class Convert + public static partial class Convert2 { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void StoreAsTwoByteString(Vector256 input, Vector256 asciiToStringMask, byte* output) + { + // Convert 1-byte string (input) to a 2-byte string + // E.g. "1,2,3,4,5..." to "1,0,2,0,3,0,4,0,5,0..." + // I am not sure it's the most efficient way to do it: + + Vector256 permuteLeft = Avx2.Permute4x64(input.AsUInt64(), 0x10 /* _MM_SHUFFLE(0, 1, 0, 0) */).AsByte(); + Vector256 resultLeft = Avx2.Shuffle(permuteLeft, asciiToStringMask); + + Vector256 perfmuteRight = Avx2.Permute4x64(input.AsUInt64(), 0x32 /* _MM_SHUFFLE(0, 3, 0, 2) */).AsByte(); + Vector256 resultRight = Avx2.Shuffle(perfmuteRight, asciiToStringMask); + + Avx.Store(output, resultLeft); + Avx.Store(output + Vector256.Count, resultRight); + } + private static unsafe void EncodeBase64Avx(byte* input, int inputLength, byte* output, int outputLength) { // Based on "Base64 encoding with SIMD instructions" article by Wojciech Muła @@ -50,15 +67,7 @@ private static unsafe void EncodeBase64Avx(byte* input, int inputLength, byte* o Vector256 indices = Avx2.Or(t1, t3); Vector256 result = LookupPshufb(indices.AsByte(), shiftLut.AsByte()); - // Convert ASCII to 2-byte string - // E.g. "1,2,3,4,5..." to "1,0,2,0,3,0,4,0,5,0..." - // I am not sure it's the most efficient way to do it: - - var resultLeft = Avx2.Shuffle(Avx2.Permute4x64(result.AsUInt64(), 0x10 /* _MM_SHUFFLE(0, 1, 0, 0) */).AsByte(), asciiToStringMask); - var resultRight = Avx2.Shuffle(Avx2.Permute4x64(result.AsUInt64(), 0x32 /* _MM_SHUFFLE(0, 3, 0, 2) */).AsByte(), asciiToStringMask); - - Avx.Store(outputCurrent, resultLeft); - Avx.Store(outputCurrent + Vector256.Count, resultRight); + StoreAsTwoByteString(result, asciiToStringMask, outputCurrent); outputCurrent += Vector256.Count * 2; } @@ -92,7 +101,6 @@ private static Vector256 LookupPshufb(Vector256 input, Vector256 result = Avx2.SubtractSaturate(input, Vector256.Create((byte)51)); Vector256 less = Avx2.CompareGreaterThan(Vector256.Create((sbyte)26), input.AsSByte()); result = Avx2.Or(result, Avx2.And(less.AsByte(), Vector256.Create((byte)13))); - result = Avx2.Shuffle(shiftLut, result.AsByte()); return Avx2.Add(result, input); } diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs index 9424856dba26..4569d41f79a5 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.cs @@ -2429,7 +2429,7 @@ public static string ToBase64String(ReadOnlySpan bytes, Base64FormattingOp fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes)) fixed (char* charsPtr = result) { - if (bytes.Length >= 48 && Avx2.IsSupported) + if (bytes.Length >= 48 && options == Base64FormattingOptions.None && Avx2.IsSupported) { EncodeBase64Avx(bytesPtr, bytes.Length, (byte*)charsPtr, result.Length * 2); } From 1bf78f5cad6456ae4e3454492032a6a5d6517aec Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sun, 6 Jan 2019 21:33:18 +0300 Subject: [PATCH 04/19] fix typo --- src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs b/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs index 44f00452649a..2386432b1ec5 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs @@ -8,7 +8,7 @@ namespace System { - public static partial class Convert2 + public static partial class Convert { [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void StoreAsTwoByteString(Vector256 input, Vector256 asciiToStringMask, byte* output) From 1c187eaf8854e831c4bfb190812c9917c1138a83 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Mon, 7 Jan 2019 02:09:00 +0300 Subject: [PATCH 05/19] Clean up --- .../shared/System/Convert.Base64.Avx2.cs | 67 ++++++++----------- .../shared/System/Convert.cs | 4 +- 2 files changed, 30 insertions(+), 41 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs b/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs index 2386432b1ec5..700401dd4c2b 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs @@ -10,23 +10,6 @@ namespace System { public static partial class Convert { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe void StoreAsTwoByteString(Vector256 input, Vector256 asciiToStringMask, byte* output) - { - // Convert 1-byte string (input) to a 2-byte string - // E.g. "1,2,3,4,5..." to "1,0,2,0,3,0,4,0,5,0..." - // I am not sure it's the most efficient way to do it: - - Vector256 permuteLeft = Avx2.Permute4x64(input.AsUInt64(), 0x10 /* _MM_SHUFFLE(0, 1, 0, 0) */).AsByte(); - Vector256 resultLeft = Avx2.Shuffle(permuteLeft, asciiToStringMask); - - Vector256 perfmuteRight = Avx2.Permute4x64(input.AsUInt64(), 0x32 /* _MM_SHUFFLE(0, 3, 0, 2) */).AsByte(); - Vector256 resultRight = Avx2.Shuffle(perfmuteRight, asciiToStringMask); - - Avx.Store(output, resultLeft); - Avx.Store(output + Vector256.Count, resultRight); - } - private static unsafe void EncodeBase64Avx(byte* input, int inputLength, byte* output, int outputLength) { // Based on "Base64 encoding with SIMD instructions" article by Wojciech Muła @@ -37,11 +20,13 @@ private static unsafe void EncodeBase64Avx(byte* input, int inputLength, byte* o byte* outputCurrent = output; int stride = 2 * 4 * 3; - Vector256 shiftLut = Vector256.Create( - 18229723555195321415UL, 72503040736508UL, - 18229723555195321415UL, 72503040736508UL); + Vector256 shiftLut = Vector256.Create( + 0x47, 0xFC, 0xFC, 0xFC, 0xFC, 0xFC, 0xFC, 0xFC, + 0xFC, 0xFC, 0xFC, 0xED, 0xF0, 0x41, 0x00, 0x00, + 0x47, 0xFC, 0xFC, 0xFC, 0xFC, 0xFC, 0xFC, 0xFC, + 0xFC, 0xFC, 0xFC, 0xED, 0xF0, 0x41, 0x00, 0x00); - Vector256 asciiToStringMask = Vector256.Create((byte) + Vector256 asciiToStringMask = Vector256.Create( 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, 6, 0x80, 7, 0x80, 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, @@ -52,6 +37,7 @@ private static unsafe void EncodeBase64Avx(byte* input, int inputLength, byte* o 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10); int i = 0; + for (; i < inputLength - stride - 1; i += stride) { Vector128 lo = Sse2.LoadVector128(input + i); @@ -65,33 +51,17 @@ private static unsafe void EncodeBase64Avx(byte* input, int inputLength, byte* o Vector256 t3 = Avx2.MultiplyLow(t2.AsUInt16(), Vector256.Create(0x01000010).AsUInt16()); Vector256 indices = Avx2.Or(t1, t3); - Vector256 result = LookupPshufb(indices.AsByte(), shiftLut.AsByte()); + Vector256 result = LookupPshufb(indices.AsByte(), shiftLut); StoreAsTwoByteString(result, asciiToStringMask, outputCurrent); outputCurrent += Vector256.Count * 2; } // Handle cases when inputLength is not a multiple of 24 - if (i != inputLength) + // or it needs '='-paddings (when inputLength % 3 != 0) + if (i - (inputLength % 3) != inputLength) { ConvertToBase64Array((char*)outputCurrent, input, i, inputLength - i, false); - return; - } - - // or the result needs '='-paddings (when inputLength % 3 != 0) - // we can make it ConvertToBase64Array's responsibility too - int lengthmod3 = (inputLength) % 3; - if (lengthmod3 == 1) - { - output[outputLength - 4] = 61; - output[outputLength - 3] = 0; - output[outputLength - 2] = 61; - output[outputLength - 1] = 0; - } - else if (lengthmod3 == 2) - { - output[outputLength - 2] = 61; - output[outputLength - 1] = 0; } } @@ -104,5 +74,22 @@ private static Vector256 LookupPshufb(Vector256 input, Vector256 input, Vector256 asciiToStringMask, byte* output) + { + // Convert 1-byte string (input) to a 2-byte string + // E.g. "1,2,3,4,5..." to "1,0,2,0,3,0,4,0,5,0..." + // I am not sure it's the most efficient way to do it: + + Vector256 permuteLeft = Avx2.Permute4x64(input.AsUInt64(), 0x10 /*_MM_SHUFFLE(0,1,0,0)*/).AsByte(); + Vector256 resultLeft = Avx2.Shuffle(permuteLeft, asciiToStringMask); + + Vector256 permuteRight = Avx2.Permute4x64(input.AsUInt64(), 0x32 /*_MM_SHUFFLE(0,3,0,2)*/).AsByte(); + Vector256 resultRight = Avx2.Shuffle(permuteRight, asciiToStringMask); + + Avx.Store(output, resultLeft); + Avx.Store(output + Vector256.Count, resultRight); + } } } diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs index 4569d41f79a5..1de4a5303234 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.cs @@ -2429,7 +2429,9 @@ public static string ToBase64String(ReadOnlySpan bytes, Base64FormattingOp fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes)) fixed (char* charsPtr = result) { - if (bytes.Length >= 48 && options == Base64FormattingOptions.None && Avx2.IsSupported) + if (Avx2.IsSupported && + bytes.Length >= 50 && // see https://github.com/dotnet/coreclr/pull/21833 + options == Base64FormattingOptions.None) //InsertLineBreaks is not supported yet { EncodeBase64Avx(bytesPtr, bytes.Length, (byte*)charsPtr, result.Length * 2); } From 3fcdabf8eccb82722a92f329182d2b1a3f563538 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Mon, 7 Jan 2019 18:05:13 +0300 Subject: [PATCH 06/19] Add initial SSSE3-based impl --- .../shared/System/Convert.Base64.Avx2.cs | 74 +++++++++++++++++-- 1 file changed, 69 insertions(+), 5 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs b/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs index 700401dd4c2b..8821e7a213ca 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs @@ -10,13 +10,75 @@ namespace System { public static partial class Convert { - private static unsafe void EncodeBase64Avx(byte* input, int inputLength, byte* output, int outputLength) + // Based on "Base64 encoding with SIMD instructions" article by Wojciech Muła + // http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html + + + // SSSE3-based implementation + + private static unsafe void EncodeBase64Sse(byte* input, int inputLength, byte* output, int outputLength) { - // Based on "Base64 encoding with SIMD instructions" article by Wojciech Muła - // http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html - // Encode - https://github.com/WojciechMula/base64simd/blob/master/encode/encode.avx2.cpp - // Lookup - https://github.com/WojciechMula/base64simd/blob/master/encode/lookup.avx2.cpp (lookup_pshufb_improved) + byte* outputCurrent = output; + int stride = 4 * 3; + + Vector128 shuffleMask = Vector128.Create((byte)1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10); + Vector128 shiftLut = Vector128.Create(18229723555195321415UL, 72503040736508UL).AsByte(); + Vector128 asciiToStringMaskLo = Vector128.Create(0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, 6, 0x80, 7, 0x80); + Vector128 asciiToStringMaskHi = Vector128.Create(8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80, 12, 0x80, 13, 0x80, 14, 0x80, 15, 0x80); + + int i = 0; + for (; i < inputLength - stride - 1; i += stride) + { + Vector128 inputVector = Sse2.LoadVector128(input + i); + inputVector = Ssse3.Shuffle(inputVector, shuffleMask); + + Vector128 t0 = Sse2.And(inputVector, Vector128.Create(0x0fc0fc00).AsByte()); + Vector128 t1 = Sse2.MultiplyHigh(t0.AsUInt16(), Vector128.Create(0x04000040).AsUInt16()).AsByte(); + Vector128 t2 = Sse2.And(inputVector, Vector128.Create(0x003f03f0).AsByte()); + Vector128 t3 = Sse2.MultiplyLow(t2.AsUInt16(), Vector128.Create(0x01000010).AsUInt16()).AsByte(); + Vector128 indices = Sse2.Or(t1, t3); + + Vector128 result = Sse2.SubtractSaturate(indices, Vector128.Create((byte)51)); + Vector128 compareResult = Sse2.CompareGreaterThan(Vector128.Create((sbyte)26), indices.AsSByte()); + result = Sse2.Or(result, Sse2.And(compareResult.AsByte(), Vector128.Create((byte)13))); + result = Ssse3.Shuffle(shiftLut, result); + result = Sse2.Add(result, indices); + + // save as two-bytes string + Sse2.Store(outputCurrent, Ssse3.Shuffle(result, asciiToStringMaskLo)); + outputCurrent += Vector128.Count; + Sse2.Store(outputCurrent, Ssse3.Shuffle(result, asciiToStringMaskHi)); + outputCurrent += Vector128.Count; + } + + // Handle cases when inputLength is not a multiple of 24 + if (i != inputLength) + { + ConvertToBase64Array((char*)outputCurrent, input, i, inputLength - i, false); + return; + } + + // or the result needs '='-paddings (when inputLength % 3 != 0) + int lengthmod3 = (inputLength) % 3; + if (lengthmod3 == 1) + { + output[outputLength - 4] = 61; + output[outputLength - 3] = 0; + output[outputLength - 2] = 61; + output[outputLength - 1] = 0; + } + else if (lengthmod3 == 2) + { + output[outputLength - 2] = 61; + output[outputLength - 1] = 0; + } + } + + // AVX2-based implementation + + private static unsafe void EncodeBase64Avx(byte* input, int inputLength, byte* output, int outputLength) + { byte* outputCurrent = output; int stride = 2 * 4 * 3; @@ -91,5 +153,7 @@ private static unsafe void StoreAsTwoByteString(Vector256 input, Vector256 Avx.Store(output, resultLeft); Avx.Store(output + Vector256.Count, resultRight); } + + } } From ed74c5b8b2da0b05f8fba09f6849f94592b42642 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sat, 9 Feb 2019 17:36:03 +0300 Subject: [PATCH 07/19] Merge SSSE3-based impl with ConvertToBase64Array --- .../System.Private.CoreLib.Shared.projitems | 1 - .../shared/System/Convert.Base64.Avx2.cs | 159 ------------------ .../shared/System/Convert.cs | 103 +++++++++++- 3 files changed, 98 insertions(+), 165 deletions(-) delete mode 100644 src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs diff --git a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems index e26387da8c2e..b113504b3d9f 100644 --- a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems +++ b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems @@ -174,7 +174,6 @@ - diff --git a/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs b/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs deleted file mode 100644 index 8821e7a213ca..000000000000 --- a/src/System.Private.CoreLib/shared/System/Convert.Base64.Avx2.cs +++ /dev/null @@ -1,159 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.Runtime.CompilerServices; -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; - -namespace System -{ - public static partial class Convert - { - // Based on "Base64 encoding with SIMD instructions" article by Wojciech Muła - // http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html - - - // SSSE3-based implementation - - private static unsafe void EncodeBase64Sse(byte* input, int inputLength, byte* output, int outputLength) - { - byte* outputCurrent = output; - int stride = 4 * 3; - - Vector128 shuffleMask = Vector128.Create((byte)1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10); - Vector128 shiftLut = Vector128.Create(18229723555195321415UL, 72503040736508UL).AsByte(); - Vector128 asciiToStringMaskLo = Vector128.Create(0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, 6, 0x80, 7, 0x80); - Vector128 asciiToStringMaskHi = Vector128.Create(8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80, 12, 0x80, 13, 0x80, 14, 0x80, 15, 0x80); - - int i = 0; - for (; i < inputLength - stride - 1; i += stride) - { - Vector128 inputVector = Sse2.LoadVector128(input + i); - inputVector = Ssse3.Shuffle(inputVector, shuffleMask); - - Vector128 t0 = Sse2.And(inputVector, Vector128.Create(0x0fc0fc00).AsByte()); - Vector128 t1 = Sse2.MultiplyHigh(t0.AsUInt16(), Vector128.Create(0x04000040).AsUInt16()).AsByte(); - Vector128 t2 = Sse2.And(inputVector, Vector128.Create(0x003f03f0).AsByte()); - Vector128 t3 = Sse2.MultiplyLow(t2.AsUInt16(), Vector128.Create(0x01000010).AsUInt16()).AsByte(); - Vector128 indices = Sse2.Or(t1, t3); - - Vector128 result = Sse2.SubtractSaturate(indices, Vector128.Create((byte)51)); - Vector128 compareResult = Sse2.CompareGreaterThan(Vector128.Create((sbyte)26), indices.AsSByte()); - result = Sse2.Or(result, Sse2.And(compareResult.AsByte(), Vector128.Create((byte)13))); - result = Ssse3.Shuffle(shiftLut, result); - result = Sse2.Add(result, indices); - - // save as two-bytes string - Sse2.Store(outputCurrent, Ssse3.Shuffle(result, asciiToStringMaskLo)); - outputCurrent += Vector128.Count; - Sse2.Store(outputCurrent, Ssse3.Shuffle(result, asciiToStringMaskHi)); - outputCurrent += Vector128.Count; - } - - // Handle cases when inputLength is not a multiple of 24 - if (i != inputLength) - { - ConvertToBase64Array((char*)outputCurrent, input, i, inputLength - i, false); - return; - } - - // or the result needs '='-paddings (when inputLength % 3 != 0) - int lengthmod3 = (inputLength) % 3; - if (lengthmod3 == 1) - { - output[outputLength - 4] = 61; - output[outputLength - 3] = 0; - output[outputLength - 2] = 61; - output[outputLength - 1] = 0; - } - else if (lengthmod3 == 2) - { - output[outputLength - 2] = 61; - output[outputLength - 1] = 0; - } - } - - - // AVX2-based implementation - - private static unsafe void EncodeBase64Avx(byte* input, int inputLength, byte* output, int outputLength) - { - byte* outputCurrent = output; - int stride = 2 * 4 * 3; - - Vector256 shiftLut = Vector256.Create( - 0x47, 0xFC, 0xFC, 0xFC, 0xFC, 0xFC, 0xFC, 0xFC, - 0xFC, 0xFC, 0xFC, 0xED, 0xF0, 0x41, 0x00, 0x00, - 0x47, 0xFC, 0xFC, 0xFC, 0xFC, 0xFC, 0xFC, 0xFC, - 0xFC, 0xFC, 0xFC, 0xED, 0xF0, 0x41, 0x00, 0x00); - - Vector256 asciiToStringMask = Vector256.Create( - 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, - 4, 0x80, 5, 0x80, 6, 0x80, 7, 0x80, - 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, - 4, 0x80, 5, 0x80, 6, 0x80, 7, 0x80); - - Vector256 shuffleMask = Vector256.Create((byte) - 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10, - 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10); - - int i = 0; - - for (; i < inputLength - stride - 1; i += stride) - { - Vector128 lo = Sse2.LoadVector128(input + i); - Vector128 hi = Sse2.LoadVector128(input + i + stride / 2); - - Vector256 @in = Avx2.Shuffle(Avx2.InsertVector128(lo.ToVector256(), hi, 1), shuffleMask); - - Vector256 t0 = Avx2.And(@in, Vector256.Create(0x0fc0fc00).AsByte()); - Vector256 t1 = Avx2.MultiplyHigh(t0.AsUInt16(), Vector256.Create(0x04000040).AsUInt16()); - Vector256 t2 = Avx2.And(@in, Vector256.Create(0x003f03f0).AsByte()); - Vector256 t3 = Avx2.MultiplyLow(t2.AsUInt16(), Vector256.Create(0x01000010).AsUInt16()); - - Vector256 indices = Avx2.Or(t1, t3); - Vector256 result = LookupPshufb(indices.AsByte(), shiftLut); - - StoreAsTwoByteString(result, asciiToStringMask, outputCurrent); - outputCurrent += Vector256.Count * 2; - } - - // Handle cases when inputLength is not a multiple of 24 - // or it needs '='-paddings (when inputLength % 3 != 0) - if (i - (inputLength % 3) != inputLength) - { - ConvertToBase64Array((char*)outputCurrent, input, i, inputLength - i, false); - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector256 LookupPshufb(Vector256 input, Vector256 shiftLut) - { - Vector256 result = Avx2.SubtractSaturate(input, Vector256.Create((byte)51)); - Vector256 less = Avx2.CompareGreaterThan(Vector256.Create((sbyte)26), input.AsSByte()); - result = Avx2.Or(result, Avx2.And(less.AsByte(), Vector256.Create((byte)13))); - result = Avx2.Shuffle(shiftLut, result.AsByte()); - return Avx2.Add(result, input); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe void StoreAsTwoByteString(Vector256 input, Vector256 asciiToStringMask, byte* output) - { - // Convert 1-byte string (input) to a 2-byte string - // E.g. "1,2,3,4,5..." to "1,0,2,0,3,0,4,0,5,0..." - // I am not sure it's the most efficient way to do it: - - Vector256 permuteLeft = Avx2.Permute4x64(input.AsUInt64(), 0x10 /*_MM_SHUFFLE(0,1,0,0)*/).AsByte(); - Vector256 resultLeft = Avx2.Shuffle(permuteLeft, asciiToStringMask); - - Vector256 permuteRight = Avx2.Permute4x64(input.AsUInt64(), 0x32 /*_MM_SHUFFLE(0,3,0,2)*/).AsByte(); - Vector256 resultRight = Avx2.Shuffle(permuteRight, asciiToStringMask); - - Avx.Store(output, resultLeft); - Avx.Store(output + Vector256.Count, resultRight); - } - - - } -} diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs index a5b4ac449f05..b50d491fb621 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.cs @@ -1,4 +1,4 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. @@ -12,6 +12,7 @@ using System.Security; using System.Diagnostics; using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; namespace System { @@ -2532,17 +2533,109 @@ public static unsafe bool TryToBase64Chars(ReadOnlySpan bytes, Span private static unsafe int ConvertToBase64Array(char* outChars, byte* inData, int offset, int length, bool insertLineBreaks) { + int charcount = 0; + int i = offset; + int j = 0; + const int stride = 4 * 3; + + if (Ssse3.IsSupported && length > stride) + { + byte* outputBytes = (byte*)outChars; + + Vector128 shuffleMask = Vector128.Create((byte) + 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10); + Vector128 shiftLut = Vector128.Create( + (sbyte)'a' - 26, (sbyte)'0' - 52, + (sbyte)'0' - 52, (sbyte)'0' - 52, + (sbyte)'0' - 52, (sbyte)'0' - 52, + (sbyte)'0' - 52, (sbyte)'0' - 52, + (sbyte)'0' - 52, (sbyte)'0' - 52, + (sbyte)'0' - 52, (sbyte)'+' - 62, + (sbyte)'/' - 63, (sbyte)'A', 0, 0).AsByte(); + + Vector128 twoBytesStringMaskLo = Vector128.Create( + 0, 0x80, 1, 0x80, + 2, 0x80, 3, 0x80, + 4, 0x80, 5, 0x80, + 6, 0x80, 7, 0x80); + + Vector128 tt0 = Vector128.Create(0x0fc0fc00).AsByte(); + Vector128 tt1 = Vector128.Create(0x04000040).AsUInt16(); + Vector128 tt2 = Vector128.Create(0x003f03f0).AsByte(); + Vector128 tt3 = Vector128.Create(0x01000010).AsUInt16(); + Vector128 tt5 = Vector128.Create((byte)51); + Vector128 tt7 = Vector128.Create((sbyte)26); + Vector128 tt8 = Vector128.Create((byte)13); + + for (; i <= length - stride; i += stride) + { + Vector128 inputVector = Sse2.LoadVector128(inData + i); + inputVector = Ssse3.Shuffle(inputVector, shuffleMask); + + // t0 = [0000cccc|cc000000|aaaaaa00|00000000] + Vector128 t0 = Sse2.And(inputVector, tt0); + // t1 = [00000000|00cccccc|00000000|00aaaaaa] + Vector128 t1 = Sse2.MultiplyHigh(t0.AsUInt16(), tt1).AsByte(); + // t2 = [00000000|00dddddd|000000bb|bbbb0000] + Vector128 t2 = Sse2.And(inputVector, tt2); + // t3 = [00dddddd|00000000|00bbbbbb|00000000] + Vector128 t3 = Sse2.MultiplyLow(t2.AsUInt16(), tt3).AsByte(); + // indices = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3 + Vector128 indices = Sse2.Or(t1, t3); + + // lookup function + Vector128 result = Sse2.SubtractSaturate(indices, tt5); + Vector128 compareResult = Sse2.CompareGreaterThan(tt7, indices.AsSByte()); + result = Sse2.Or(result, Sse2.And(compareResult.AsByte(), tt8)); + result = Ssse3.Shuffle(shiftLut, result); + result = Sse2.Add(result, indices); + + // save as two-bytes string, e.g.: + // 1,2,3,4,5..16 => 1,0,2,0,3,0..16,0 + Sse2.Store(outputBytes + j, Ssse3.Shuffle(result, twoBytesStringMaskLo)); + j += Vector128.Count; + + // Do it for the second part of the vector (rotate it first in order to re-use asciiToStringMaskLo) + result = Sse2.Shuffle(result.AsUInt32(), 0x4E /*_MM_SHUFFLE(1,0,3,2)*/).AsByte(); + result = Ssse3.Shuffle(result, twoBytesStringMaskLo); + + if (insertLineBreaks && (charcount += 16) >= base64LineBreakPosition) + { + // Normally we save 32 bytes per iteration + // but `insertLineBreaks` needs `\r\n` (4 bytes) between each 76*2=152 bytes. 152/32 = 4.75 (means not a multiply of 32) + // we need to insert `\r\n` in the middle of Vector128 somehow + // but the following code just saves a half of the vector, then appends `\r\n` manually + // and the second part of the vector is ignored (this is why 'i' is decremented) + charcount = 0; + var shuffleResult = result.AsUInt64(); + Sse2.StoreLow((ulong*)(outputBytes + j), shuffleResult); + j += Vector128.Count / 2; + outputBytes[j++] = (byte)'\r'; + outputBytes[j++] = 0; + outputBytes[j++] = (byte)'\n'; + outputBytes[j++] = 0; + i -= stride / 4; + } + else + { + Sse2.Store(outputBytes + j, result); + j += Vector128.Count; + } + } + // SIMD-based algorithm used `j` to count bytes, the software fallback uses it count chars + j /= 2; + + if (i == length) + return j; + } int lengthmod3 = length % 3; int calcLength = offset + (length - lengthmod3); - int j = 0; - int charcount = 0; //Convert three bytes at a time to base64 notation. This will consume 4 chars. - int i; // get a pointer to the base64Table to avoid unnecessary range checking fixed (char* base64 = &base64Table[0]) { - for (i = offset; i < calcLength; i += 3) + for (; i < calcLength; i += 3) { if (insertLineBreaks) { From 02547ba64c04437c522c58d513839d9eacbb4498 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sat, 9 Feb 2019 17:41:09 +0300 Subject: [PATCH 08/19] remove avx --- src/System.Private.CoreLib/shared/System/Convert.cs | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs index b50d491fb621..e9c4ee0752e6 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.cs @@ -2424,17 +2424,8 @@ public static string ToBase64String(ReadOnlySpan bytes, Base64FormattingOp fixed (byte* bytesPtr = &MemoryMarshal.GetReference(bytes)) fixed (char* charsPtr = result) { - if (Avx2.IsSupported && - bytes.Length >= 50 && // see https://github.com/dotnet/coreclr/pull/21833 - options == Base64FormattingOptions.None) //InsertLineBreaks is not supported yet - { - EncodeBase64Avx(bytesPtr, bytes.Length, (byte*)charsPtr, result.Length * 2); - } - else - { - int charsWritten = ConvertToBase64Array(charsPtr, bytesPtr, 0, bytes.Length, insertLineBreaks); - Debug.Assert(result.Length == charsWritten, $"Expected {result.Length} == {charsWritten}"); - } + int charsWritten = ConvertToBase64Array(charsPtr, bytesPtr, 0, bytes.Length, insertLineBreaks); + Debug.Assert(result.Length == charsWritten, $"Expected {result.Length} == {charsWritten}"); } } From 9b8c9d10434b5f4983661e7ab795035bc5f5e4d1 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sat, 9 Feb 2019 19:23:19 +0300 Subject: [PATCH 09/19] Add copy-right and use SSSE3 when inputLength >= 36 --- src/System.Private.CoreLib/shared/System/Convert.cs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs index e9c4ee0752e6..c6007137219a 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.cs @@ -2529,12 +2529,15 @@ private static unsafe int ConvertToBase64Array(char* outChars, byte* inData, int int j = 0; const int stride = 4 * 3; - if (Ssse3.IsSupported && length > stride) + if (Ssse3.IsSupported && length > stride * 3) { + // Based on "Base64 encoding with SIMD instructions" article by Wojciech Muła + // http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html byte* outputBytes = (byte*)outChars; Vector128 shuffleMask = Vector128.Create((byte) 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10); + Vector128 shiftLut = Vector128.Create( (sbyte)'a' - 26, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, From 2ab1c5e58d62595bf08f3b50ce80e073c3d64b28 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sat, 9 Feb 2019 23:45:06 +0300 Subject: [PATCH 10/19] move to a separate method, also move constant vectors --- .../shared/System/Convert.cs | 191 +++++++++--------- 1 file changed, 101 insertions(+), 90 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs index c6007137219a..9738cec759a7 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.cs @@ -2520,108 +2520,119 @@ public static unsafe bool TryToBase64Chars(ReadOnlySpan bytes, Span charsWritten = ConvertToBase64Array(outChars, inData, 0, bytes.Length, insertLineBreaks); return true; } - } + } - private static unsafe int ConvertToBase64Array(char* outChars, byte* inData, int offset, int length, bool insertLineBreaks) + internal static readonly Vector128 base64ShuffleMask = Vector128.Create((byte) + 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10); + + internal static readonly Vector128 base64ShiftLut = Vector128.Create( + (sbyte)'a' - 26, (sbyte)'0' - 52, + (sbyte)'0' - 52, (sbyte)'0' - 52, + (sbyte)'0' - 52, (sbyte)'0' - 52, + (sbyte)'0' - 52, (sbyte)'0' - 52, + (sbyte)'0' - 52, (sbyte)'0' - 52, + (sbyte)'0' - 52, (sbyte)'+' - 62, + (sbyte)'/' - 63, (sbyte)'A', 0, 0).AsByte(); + + internal static readonly Vector128 base64TwoBytesStringMaskLo = Vector128.Create( + 0, 0x80, 1, 0x80, + 2, 0x80, 3, 0x80, + 4, 0x80, 5, 0x80, + 6, 0x80, 7, 0x80); + + private static unsafe (int i, int j, int charcount) ConvertToBase64ArraySsse3(char* outChars, byte* inData, int length, int offset, bool insertLineBreaks) { - int charcount = 0; - int i = offset; - int j = 0; + int i = offset, j = 0, charcount = 0; const int stride = 4 * 3; - if (Ssse3.IsSupported && length > stride * 3) + // Based on "Base64 encoding with SIMD instructions" article by Wojciech Muła + // http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html + byte* outputBytes = (byte*)outChars; + + Vector128 tt0 = Vector128.Create(0x0fc0fc00).AsByte(); + Vector128 tt1 = Vector128.Create(0x04000040).AsUInt16(); + Vector128 tt2 = Vector128.Create(0x003f03f0).AsByte(); + Vector128 tt3 = Vector128.Create(0x01000010).AsUInt16(); + Vector128 tt5 = Vector128.Create((byte)51); + Vector128 tt7 = Vector128.Create((sbyte)26); + Vector128 tt8 = Vector128.Create((byte)13); + + for (; i <= length - stride; i += stride) { - // Based on "Base64 encoding with SIMD instructions" article by Wojciech Muła - // http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html - byte* outputBytes = (byte*)outChars; - - Vector128 shuffleMask = Vector128.Create((byte) - 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10); - - Vector128 shiftLut = Vector128.Create( - (sbyte)'a' - 26, (sbyte)'0' - 52, - (sbyte)'0' - 52, (sbyte)'0' - 52, - (sbyte)'0' - 52, (sbyte)'0' - 52, - (sbyte)'0' - 52, (sbyte)'0' - 52, - (sbyte)'0' - 52, (sbyte)'0' - 52, - (sbyte)'0' - 52, (sbyte)'+' - 62, - (sbyte)'/' - 63, (sbyte)'A', 0, 0).AsByte(); - - Vector128 twoBytesStringMaskLo = Vector128.Create( - 0, 0x80, 1, 0x80, - 2, 0x80, 3, 0x80, - 4, 0x80, 5, 0x80, - 6, 0x80, 7, 0x80); - - Vector128 tt0 = Vector128.Create(0x0fc0fc00).AsByte(); - Vector128 tt1 = Vector128.Create(0x04000040).AsUInt16(); - Vector128 tt2 = Vector128.Create(0x003f03f0).AsByte(); - Vector128 tt3 = Vector128.Create(0x01000010).AsUInt16(); - Vector128 tt5 = Vector128.Create((byte)51); - Vector128 tt7 = Vector128.Create((sbyte)26); - Vector128 tt8 = Vector128.Create((byte)13); - - for (; i <= length - stride; i += stride) + Vector128 inputVector = Sse2.LoadVector128(inData + i); + inputVector = Ssse3.Shuffle(inputVector, base64ShuffleMask); + + // t0 = [0000cccc|cc000000|aaaaaa00|00000000] + Vector128 t0 = Sse2.And(inputVector, tt0); + // t1 = [00000000|00cccccc|00000000|00aaaaaa] + Vector128 t1 = Sse2.MultiplyHigh(t0.AsUInt16(), tt1).AsByte(); + // t2 = [00000000|00dddddd|000000bb|bbbb0000] + Vector128 t2 = Sse2.And(inputVector, tt2); + // t3 = [00dddddd|00000000|00bbbbbb|00000000] + Vector128 t3 = Sse2.MultiplyLow(t2.AsUInt16(), tt3).AsByte(); + // indices = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3 + Vector128 indices = Sse2.Or(t1, t3); + + // lookup function + Vector128 result = Sse2.SubtractSaturate(indices, tt5); + Vector128 compareResult = Sse2.CompareGreaterThan(tt7, indices.AsSByte()); + result = Sse2.Or(result, Sse2.And(compareResult.AsByte(), tt8)); + result = Ssse3.Shuffle(base64ShiftLut, result); + result = Sse2.Add(result, indices); + + // save as two-bytes string, e.g.: + // 1,2,3,4,5..16 => 1,0,2,0,3,0..16,0 + Sse2.Store(outputBytes + j, Ssse3.Shuffle(result, base64TwoBytesStringMaskLo)); + j += Vector128.Count; + + // Do it for the second part of the vector (rotate it first in order to re-use asciiToStringMaskLo) + result = Sse2.Shuffle(result.AsUInt32(), 0x4E /*_MM_SHUFFLE(1,0,3,2)*/).AsByte(); + result = Ssse3.Shuffle(result, base64TwoBytesStringMaskLo); + + if (insertLineBreaks && (charcount += 16) >= base64LineBreakPosition) + { + // Normally we save 32 bytes per iteration + // but `insertLineBreaks` needs `\r\n` (4 bytes) between each 76*2=152 bytes. 152/32 = 4.75 (means not a multiply of 32) + // we need to insert `\r\n` in the middle of Vector128 somehow + // but the following code just saves a half of the vector, then appends `\r\n` manually + // and the second part of the vector is ignored (this is why 'i' is decremented) + charcount = 0; + var shuffleResult = result.AsUInt64(); + Sse2.StoreLow((ulong*)(outputBytes + j), shuffleResult); + j += Vector128.Count / 2; + outputBytes[j++] = (byte)'\r'; + outputBytes[j++] = 0; + outputBytes[j++] = (byte)'\n'; + outputBytes[j++] = 0; + i -= stride / 4; + } + else { - Vector128 inputVector = Sse2.LoadVector128(inData + i); - inputVector = Ssse3.Shuffle(inputVector, shuffleMask); - - // t0 = [0000cccc|cc000000|aaaaaa00|00000000] - Vector128 t0 = Sse2.And(inputVector, tt0); - // t1 = [00000000|00cccccc|00000000|00aaaaaa] - Vector128 t1 = Sse2.MultiplyHigh(t0.AsUInt16(), tt1).AsByte(); - // t2 = [00000000|00dddddd|000000bb|bbbb0000] - Vector128 t2 = Sse2.And(inputVector, tt2); - // t3 = [00dddddd|00000000|00bbbbbb|00000000] - Vector128 t3 = Sse2.MultiplyLow(t2.AsUInt16(), tt3).AsByte(); - // indices = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3 - Vector128 indices = Sse2.Or(t1, t3); - - // lookup function - Vector128 result = Sse2.SubtractSaturate(indices, tt5); - Vector128 compareResult = Sse2.CompareGreaterThan(tt7, indices.AsSByte()); - result = Sse2.Or(result, Sse2.And(compareResult.AsByte(), tt8)); - result = Ssse3.Shuffle(shiftLut, result); - result = Sse2.Add(result, indices); - - // save as two-bytes string, e.g.: - // 1,2,3,4,5..16 => 1,0,2,0,3,0..16,0 - Sse2.Store(outputBytes + j, Ssse3.Shuffle(result, twoBytesStringMaskLo)); + Sse2.Store(outputBytes + j, result); j += Vector128.Count; - - // Do it for the second part of the vector (rotate it first in order to re-use asciiToStringMaskLo) - result = Sse2.Shuffle(result.AsUInt32(), 0x4E /*_MM_SHUFFLE(1,0,3,2)*/).AsByte(); - result = Ssse3.Shuffle(result, twoBytesStringMaskLo); - - if (insertLineBreaks && (charcount += 16) >= base64LineBreakPosition) - { - // Normally we save 32 bytes per iteration - // but `insertLineBreaks` needs `\r\n` (4 bytes) between each 76*2=152 bytes. 152/32 = 4.75 (means not a multiply of 32) - // we need to insert `\r\n` in the middle of Vector128 somehow - // but the following code just saves a half of the vector, then appends `\r\n` manually - // and the second part of the vector is ignored (this is why 'i' is decremented) - charcount = 0; - var shuffleResult = result.AsUInt64(); - Sse2.StoreLow((ulong*)(outputBytes + j), shuffleResult); - j += Vector128.Count / 2; - outputBytes[j++] = (byte)'\r'; - outputBytes[j++] = 0; - outputBytes[j++] = (byte)'\n'; - outputBytes[j++] = 0; - i -= stride / 4; - } - else - { - Sse2.Store(outputBytes + j, result); - j += Vector128.Count; - } } - // SIMD-based algorithm used `j` to count bytes, the software fallback uses it count chars - j /= 2; + } + // SIMD-based algorithm used `j` to count bytes, the software fallback uses it count chars + j /= 2; + + return (i, j, charcount); + } + private static unsafe int ConvertToBase64Array(char* outChars, byte* inData, int offset, int length, bool insertLineBreaks) + { + int charcount = 0; + int i = offset; + int j = 0; + + if (Ssse3.IsSupported && length - offset >= 36) + { + // Tuple is faster then passing i,j,charcount by ref. + // SSSE impl is moved to a separate method in order to avoid regression for smaller inputs + (i, j, charcount) = ConvertToBase64ArraySsse3(outChars, inData, length, offset, insertLineBreaks); if (i == length) return j; } + int lengthmod3 = length % 3; int calcLength = offset + (length - lengthmod3); //Convert three bytes at a time to base64 notation. This will consume 4 chars. From 8acc5984dbf91f2111cd25672b107e18abc50c93 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sun, 10 Feb 2019 13:24:21 +0300 Subject: [PATCH 11/19] rename static readonly fields (add s_ prefix) --- .../shared/System/Convert.cs | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs index 9738cec759a7..14bc5837e168 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.cs @@ -2522,23 +2522,23 @@ public static unsafe bool TryToBase64Chars(ReadOnlySpan bytes, Span } } - internal static readonly Vector128 base64ShuffleMask = Vector128.Create((byte) - 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10); + internal static readonly Vector128 s_base64ShuffleMask = Ssse3.IsSupported ? Vector128.Create((byte) + 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10) : default; - internal static readonly Vector128 base64ShiftLut = Vector128.Create( + internal static readonly Vector128 s_base64ShiftLut = Ssse3.IsSupported ? Vector128.Create( (sbyte)'a' - 26, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'+' - 62, - (sbyte)'/' - 63, (sbyte)'A', 0, 0).AsByte(); + (sbyte)'/' - 63, (sbyte)'A', 0, 0).AsByte() : default; - internal static readonly Vector128 base64TwoBytesStringMaskLo = Vector128.Create( + internal static readonly Vector128 s_base64TwoBytesStringMaskLo = Ssse3.IsSupported ? Vector128.Create( 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, - 6, 0x80, 7, 0x80); + 6, 0x80, 7, 0x80) : default; private static unsafe (int i, int j, int charcount) ConvertToBase64ArraySsse3(char* outChars, byte* inData, int length, int offset, bool insertLineBreaks) { @@ -2560,7 +2560,7 @@ private static unsafe (int i, int j, int charcount) ConvertToBase64ArraySsse3(ch for (; i <= length - stride; i += stride) { Vector128 inputVector = Sse2.LoadVector128(inData + i); - inputVector = Ssse3.Shuffle(inputVector, base64ShuffleMask); + inputVector = Ssse3.Shuffle(inputVector, s_base64ShuffleMask); // t0 = [0000cccc|cc000000|aaaaaa00|00000000] Vector128 t0 = Sse2.And(inputVector, tt0); @@ -2577,17 +2577,17 @@ private static unsafe (int i, int j, int charcount) ConvertToBase64ArraySsse3(ch Vector128 result = Sse2.SubtractSaturate(indices, tt5); Vector128 compareResult = Sse2.CompareGreaterThan(tt7, indices.AsSByte()); result = Sse2.Or(result, Sse2.And(compareResult.AsByte(), tt8)); - result = Ssse3.Shuffle(base64ShiftLut, result); + result = Ssse3.Shuffle(s_base64ShiftLut, result); result = Sse2.Add(result, indices); // save as two-bytes string, e.g.: // 1,2,3,4,5..16 => 1,0,2,0,3,0..16,0 - Sse2.Store(outputBytes + j, Ssse3.Shuffle(result, base64TwoBytesStringMaskLo)); + Sse2.Store(outputBytes + j, Ssse3.Shuffle(result, s_base64TwoBytesStringMaskLo)); j += Vector128.Count; // Do it for the second part of the vector (rotate it first in order to re-use asciiToStringMaskLo) result = Sse2.Shuffle(result.AsUInt32(), 0x4E /*_MM_SHUFFLE(1,0,3,2)*/).AsByte(); - result = Ssse3.Shuffle(result, base64TwoBytesStringMaskLo); + result = Ssse3.Shuffle(result, s_base64TwoBytesStringMaskLo); if (insertLineBreaks && (charcount += 16) >= base64LineBreakPosition) { From df53ee94fb4dbb7343f4a6fc96d10946d70721ed Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sun, 10 Feb 2019 14:03:17 +0300 Subject: [PATCH 12/19] remove Ssse3.IsSupported from static readonly vectors --- src/System.Private.CoreLib/shared/System/Convert.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs index 14bc5837e168..7ef8e38949ce 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.cs @@ -2522,23 +2522,23 @@ public static unsafe bool TryToBase64Chars(ReadOnlySpan bytes, Span } } - internal static readonly Vector128 s_base64ShuffleMask = Ssse3.IsSupported ? Vector128.Create((byte) - 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10) : default; + internal static readonly Vector128 s_base64ShuffleMask = Vector128.Create((byte) + 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10); - internal static readonly Vector128 s_base64ShiftLut = Ssse3.IsSupported ? Vector128.Create( + internal static readonly Vector128 s_base64ShiftLut = Vector128.Create( (sbyte)'a' - 26, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'0' - 52, (sbyte)'+' - 62, - (sbyte)'/' - 63, (sbyte)'A', 0, 0).AsByte() : default; + (sbyte)'/' - 63, (sbyte)'A', 0, 0).AsByte(); - internal static readonly Vector128 s_base64TwoBytesStringMaskLo = Ssse3.IsSupported ? Vector128.Create( + internal static readonly Vector128 s_base64TwoBytesStringMaskLo = Vector128.Create( 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, - 6, 0x80, 7, 0x80) : default; + 6, 0x80, 7, 0x80); private static unsafe (int i, int j, int charcount) ConvertToBase64ArraySsse3(char* outChars, byte* inData, int length, int offset, bool insertLineBreaks) { From 36eb502973772549bce4031252bd7d907a2fd8b8 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Thu, 21 Feb 2019 16:56:12 +0300 Subject: [PATCH 13/19] copy static readonly vectors to local variables to keep them in registers --- src/System.Private.CoreLib/shared/System/Convert.cs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs index 7ef8e38949ce..fd9c8ee806af 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.cs @@ -2557,10 +2557,14 @@ private static unsafe (int i, int j, int charcount) ConvertToBase64ArraySsse3(ch Vector128 tt7 = Vector128.Create((sbyte)26); Vector128 tt8 = Vector128.Create((byte)13); + Vector128 localShiftLut = s_base64ShiftLut; + Vector128 localShuffleMask = s_base64ShuffleMask; + Vector128 localTwoBytesStringMaskLo = s_base64TwoBytesStringMaskLo; + for (; i <= length - stride; i += stride) { Vector128 inputVector = Sse2.LoadVector128(inData + i); - inputVector = Ssse3.Shuffle(inputVector, s_base64ShuffleMask); + inputVector = Ssse3.Shuffle(inputVector, localShuffleMask); // t0 = [0000cccc|cc000000|aaaaaa00|00000000] Vector128 t0 = Sse2.And(inputVector, tt0); @@ -2577,17 +2581,17 @@ private static unsafe (int i, int j, int charcount) ConvertToBase64ArraySsse3(ch Vector128 result = Sse2.SubtractSaturate(indices, tt5); Vector128 compareResult = Sse2.CompareGreaterThan(tt7, indices.AsSByte()); result = Sse2.Or(result, Sse2.And(compareResult.AsByte(), tt8)); - result = Ssse3.Shuffle(s_base64ShiftLut, result); + result = Ssse3.Shuffle(localShiftLut, result); result = Sse2.Add(result, indices); // save as two-bytes string, e.g.: // 1,2,3,4,5..16 => 1,0,2,0,3,0..16,0 - Sse2.Store(outputBytes + j, Ssse3.Shuffle(result, s_base64TwoBytesStringMaskLo)); + Sse2.Store(outputBytes + j, Ssse3.Shuffle(result, localTwoBytesStringMaskLo)); j += Vector128.Count; // Do it for the second part of the vector (rotate it first in order to re-use asciiToStringMaskLo) result = Sse2.Shuffle(result.AsUInt32(), 0x4E /*_MM_SHUFFLE(1,0,3,2)*/).AsByte(); - result = Ssse3.Shuffle(result, s_base64TwoBytesStringMaskLo); + result = Ssse3.Shuffle(result, localTwoBytesStringMaskLo); if (insertLineBreaks && (charcount += 16) >= base64LineBreakPosition) { From cd42c3c12c3544e4352cc54302ee9d5db6d75e24 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Wed, 24 Apr 2019 16:12:02 +0300 Subject: [PATCH 14/19] Add more comments --- .../shared/System/Convert.cs | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs index 203eddc987b4..cd15b89cfdc8 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.cs @@ -2558,15 +2558,31 @@ private static unsafe (int i, int j, int charcount) ConvertToBase64ArraySsse3(ch Vector128 tt7 = Vector128.Create((sbyte)26); Vector128 tt8 = Vector128.Create((byte)13); + // static readonly Vector128 field + assigning its value to a local variable is a C# pattern for `const __mX` Vector128 localShiftLut = s_base64ShiftLut; Vector128 localShuffleMask = s_base64ShuffleMask; Vector128 localTwoBytesStringMaskLo = s_base64TwoBytesStringMaskLo; for (; i <= length - stride; i += stride) { + // input = [xxxx|DDDC|CCBB|BAAA] Vector128 inputVector = Sse2.LoadVector128(inData + i); + + // bytes from groups A, B and C are needed in separate 32-bit lanes + // in = [DDDD|CCCC|BBBB|AAAA] + // + // an input triplet has layout + // [????????|ccdddddd|bbbbcccc|aaaaaabb] + // byte 3 byte 2 byte 1 byte 0 -- byte 3 comes from the next triplet + // + // shuffling changes the order of bytes: 1, 0, 2, 1 + // [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] + // ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^ + // processed bits inputVector = Ssse3.Shuffle(inputVector, localShuffleMask); + // unpacking + // t0 = [0000cccc|cc000000|aaaaaa00|00000000] Vector128 t0 = Sse2.And(inputVector, tt0); // t1 = [00000000|00cccccc|00000000|00aaaaaa] @@ -2578,12 +2594,13 @@ private static unsafe (int i, int j, int charcount) ConvertToBase64ArraySsse3(ch // indices = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3 Vector128 indices = Sse2.Or(t1, t3); - // lookup function + // lookup function "Single pshufb method" (lookup_pshufb_improved) Vector128 result = Sse2.SubtractSaturate(indices, tt5); Vector128 compareResult = Sse2.CompareGreaterThan(tt7, indices.AsSByte()); result = Sse2.Or(result, Sse2.And(compareResult.AsByte(), tt8)); result = Ssse3.Shuffle(localShiftLut, result); result = Sse2.Add(result, indices); + // end of lookup function // save as two-bytes string, e.g.: // 1,2,3,4,5..16 => 1,0,2,0,3,0..16,0 From 72ea5503a68acdeac0260b69f0e381e1b245b5c8 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Wed, 24 Apr 2019 16:31:32 +0300 Subject: [PATCH 15/19] update THIRD-PARTY-NOTICES.TXT --- THIRD-PARTY-NOTICES.TXT | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/THIRD-PARTY-NOTICES.TXT b/THIRD-PARTY-NOTICES.TXT index 831105f181f3..8be51024d244 100644 --- a/THIRD-PARTY-NOTICES.TXT +++ b/THIRD-PARTY-NOTICES.TXT @@ -281,3 +281,35 @@ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +License notice for vectorized base64 encoding +-------------------------------------------------------- + +Copyright (c) 2005-2007, Nick Galbreath +Copyright (c) 2013-2017, Alfred Klomp +Copyright (c) 2015-2017, Wojciech Mula +Copyright (c) 2016-2017, Matthieu Darbois +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file From aef8747ca89c8a2569c9bee767d796ec41d6296a Mon Sep 17 00:00:00 2001 From: EgorBo Date: Wed, 24 Apr 2019 16:40:36 +0300 Subject: [PATCH 16/19] update comments --- src/System.Private.CoreLib/shared/System/Convert.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs index cd15b89cfdc8..03ce47748db7 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.cs @@ -2541,13 +2541,13 @@ public static unsafe bool TryToBase64Chars(ReadOnlySpan bytes, Span 4, 0x80, 5, 0x80, 6, 0x80, 7, 0x80); + // Based on "Base64 encoding with SIMD instructions" article by Wojciech Muła http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html (see THIRD-PARTY-NOTICES.txt) + // The original code can be found here: https://github.com/WojciechMula/base64simd/blob/master/encode/encode.sse.cpp (and lookup_pshufb_improved as a lookup function) private static unsafe (int i, int j, int charcount) ConvertToBase64ArraySsse3(char* outChars, byte* inData, int length, int offset, bool insertLineBreaks) { int i = offset, j = 0, charcount = 0; const int stride = 4 * 3; - // Based on "Base64 encoding with SIMD instructions" article by Wojciech Muła - // http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html byte* outputBytes = (byte*)outChars; Vector128 tt0 = Vector128.Create(0x0fc0fc00).AsByte(); From 3239269af5ce2ea0f5027601059c88e6705c390e Mon Sep 17 00:00:00 2001 From: EgorBo Date: Mon, 4 Nov 2019 19:48:58 +0300 Subject: [PATCH 17/19] Fix build error (StoreScalar) --- src/System.Private.CoreLib/shared/System/Convert.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs index 624b700dbfcc..bf5e3d1ed7e2 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.cs @@ -2590,8 +2590,8 @@ private static unsafe (int i, int j, int charcount) ConvertToBase64ArraySsse3(ch // but the following code just saves a half of the vector, then appends `\r\n` manually // and the second part of the vector is ignored (this is why 'i' is decremented) charcount = 0; - var shuffleResult = result.AsUInt64(); - Sse2.StoreLow((ulong*)(outputBytes + j), shuffleResult); + Vector128 shuffleResult = result.AsUInt64(); + Sse2.StoreScalar((ulong*)(outputBytes + j), shuffleResult); j += Vector128.Count / 2; outputBytes[j++] = (byte)'\r'; outputBytes[j++] = 0; From 77207a2b37e53f94903d0547f59a3de709b04867 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Mon, 4 Nov 2019 19:51:50 +0300 Subject: [PATCH 18/19] Update THIRD-PARTY-NOTICES.TXT --- THIRD-PARTY-NOTICES.TXT | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/THIRD-PARTY-NOTICES.TXT b/THIRD-PARTY-NOTICES.TXT index 01511175f6f9..59d1274050f6 100644 --- a/THIRD-PARTY-NOTICES.TXT +++ b/THIRD-PARTY-NOTICES.TXT @@ -358,3 +358,35 @@ License for fastmod (https://github.com/lemire/fastmod) WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +License notice for vectorized base64 encoding +-------------------------------------------------------- + +Copyright (c) 2005-2007, Nick Galbreath +Copyright (c) 2013-2017, Alfred Klomp +Copyright (c) 2015-2017, Wojciech Mula +Copyright (c) 2016-2017, Matthieu Darbois +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. From 55c7dacae0fd72bf8be606772f19b31db87cf0c3 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Mon, 4 Nov 2019 20:34:16 +0300 Subject: [PATCH 19/19] formatting --- src/System.Private.CoreLib/shared/System/Convert.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Convert.cs b/src/System.Private.CoreLib/shared/System/Convert.cs index bf5e3d1ed7e2..7dd7e27fd6fe 100644 --- a/src/System.Private.CoreLib/shared/System/Convert.cs +++ b/src/System.Private.CoreLib/shared/System/Convert.cs @@ -2492,7 +2492,7 @@ public static unsafe bool TryToBase64Chars(ReadOnlySpan bytes, Span charsWritten = ConvertToBase64Array(outChars, inData, 0, bytes.Length, insertLineBreaks); return true; } - } + } internal static readonly Vector128 s_base64ShuffleMask = Vector128.Create((byte) 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10); @@ -2581,12 +2581,12 @@ private static unsafe (int i, int j, int charcount) ConvertToBase64ArraySsse3(ch // Do it for the second part of the vector (rotate it first in order to re-use asciiToStringMaskLo) result = Sse2.Shuffle(result.AsUInt32(), 0x4E /*_MM_SHUFFLE(1,0,3,2)*/).AsByte(); result = Ssse3.Shuffle(result, localTwoBytesStringMaskLo); - + if (insertLineBreaks && (charcount += 16) >= base64LineBreakPosition) { - // Normally we save 32 bytes per iteration + // Normally we save 32 bytes per iteration // but `insertLineBreaks` needs `\r\n` (4 bytes) between each 76*2=152 bytes. 152/32 = 4.75 (means not a multiply of 32) - // we need to insert `\r\n` in the middle of Vector128 somehow + // we need to insert `\r\n` in the middle of Vector128 somehow // but the following code just saves a half of the vector, then appends `\r\n` manually // and the second part of the vector is ignored (this is why 'i' is decremented) charcount = 0;