Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve XmlDictionaryWriter UTF8 encoding performance #73336

Merged
merged 27 commits into from Apr 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
5d09005
Speed up text encoding
Daniel-Svensson Jul 7, 2022
63c760c
Update implementation
Daniel-Svensson Jul 18, 2022
196ce48
Add tests for binary xml strings
Daniel-Svensson Jul 26, 2022
65e7029
Merge branch 'binary_xml_text' of https://github.com/Daniel-Svensson/…
Daniel-Svensson Jul 26, 2022
4d8078a
limit counting code to 256 bit vectors
Daniel-Svensson Jul 26, 2022
6e5aabb
reword comment
Daniel-Svensson Aug 3, 2022
70fa189
rename test
Daniel-Svensson Aug 3, 2022
b34d259
move bytesmax
Daniel-Svensson Aug 3, 2022
5df5ae0
Fix bytesMax after moving variable initialization
Daniel-Svensson Aug 4, 2022
a790fbb
use unicode escape value in test
Daniel-Svensson Aug 4, 2022
2b82ac8
fix test typo "*" -> "+"
Daniel-Svensson Aug 4, 2022
301e531
Update src/libraries/System.Private.DataContractSerialization/src/Sys…
Daniel-Svensson Aug 12, 2022
5a21306
Remvoe vectorized code from UnsafeGetUTF8Length
Daniel-Svensson Aug 12, 2022
8a3de26
Merge remote-tracking branch 'upstream/main' into binary_xml_text
Daniel-Svensson Aug 13, 2022
048cade
Fix overfload
Daniel-Svensson Sep 8, 2022
8297311
Merge commit '080f708e7018f6c0529b6c875a44d84fc4d74419' into binary_x…
Daniel-Svensson Oct 24, 2022
287e737
use for loop which seems faster
Daniel-Svensson Oct 24, 2022
0d2a9bb
merge up to net8 preview1
Daniel-Svensson Mar 3, 2023
ab29682
remove vector loop
Daniel-Svensson Mar 6, 2023
251391f
make sealed encoding to allow devirtualisation
Daniel-Svensson Mar 11, 2023
a590739
back some changes
Daniel-Svensson Mar 20, 2023
46b6314
use uint for UnsafeGetUTF8Chars comparison
Daniel-Svensson Mar 25, 2023
82f8880
revert more changes
Daniel-Svensson Mar 26, 2023
d78aade
Fix cutoff based on new measurements
Daniel-Svensson Mar 26, 2023
3b20be8
use BinaryPrimitives.ReverseEndianness as suggested
Daniel-Svensson Mar 26, 2023
9c86b05
Update cutoff from 24 to 32 chars before calling, due to regression f…
Daniel-Svensson Mar 27, 2023
ccfb008
Remove sealed encoding since it only improves XmlConvert
Daniel-Svensson Apr 2, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
@@ -1,8 +1,10 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Buffers.Binary;
using System.IO;
using System.Text;
using System.Runtime.InteropServices;
using System.Runtime.Serialization;
using System.Threading.Tasks;

Expand Down Expand Up @@ -334,34 +336,26 @@ protected unsafe void UnsafeWriteUnicodeChars(char* chars, int charCount)
}
}

protected unsafe int UnsafeGetUnicodeChars(char* chars, int charCount, byte[] buffer, int offset)
protected static unsafe int UnsafeGetUnicodeChars(char* chars, int charCount, byte[] buffer, int offset)
{
char* charsMax = chars + charCount;
while (chars < charsMax)
if (BitConverter.IsLittleEndian)
{
char value = *chars++;
buffer[offset++] = (byte)value;
value >>= 8;
buffer[offset++] = (byte)value;
new ReadOnlySpan<char>(chars, charCount)
.CopyTo(MemoryMarshal.Cast<byte, char>(buffer.AsSpan(offset)));
}
else
{
BinaryPrimitives.ReverseEndianness(new ReadOnlySpan<short>(chars, charCount),
MemoryMarshal.Cast<byte, short>(buffer.AsSpan(offset)));
}

return charCount * 2;
}

protected unsafe int UnsafeGetUTF8Length(char* chars, int charCount)
{
char* charsMax = chars + charCount;
while (chars < charsMax)
{
if (*chars >= 0x80)
break;

chars++;
}

if (chars == charsMax)
return charCount;

return (int)(chars - (charsMax - charCount)) + (_encoding ?? DataContractSerializer.ValidatingUTF8).GetByteCount(chars, (int)(charsMax - chars));
// Length will always be at least ( 128 / maxBytesPerChar) = 42
return (_encoding ?? DataContractSerializer.ValidatingUTF8).GetByteCount(chars, charCount);
Daniel-Svensson marked this conversation as resolved.
Show resolved Hide resolved
}

protected unsafe int UnsafeGetUTF8Chars(char* chars, int charCount, byte[] buffer, int offset)
Expand All @@ -370,39 +364,32 @@ protected unsafe int UnsafeGetUTF8Chars(char* chars, int charCount, byte[] buffe
{
fixed (byte* _bytes = &buffer[offset])
{
byte* bytes = _bytes;
byte* bytesMax = &bytes[buffer.Length - offset];
char* charsMax = &chars[charCount];

while (true)
// Fast path for small strings, use Encoding.GetBytes for larger strings since it is faster when vectorization is possible
if ((uint)charCount < 32)
{
byte* bytes = _bytes;
char* charsMax = &chars[charCount];

while (chars < charsMax)
{
char t = *chars;
if (t >= 0x80)
break;
goto NonAscii;

*bytes = (byte)t;
bytes++;
chars++;
}
return charCount;

if (chars >= charsMax)
break;

char* charsStart = chars;
while (chars < charsMax && *chars >= 0x80)
{
chars++;
}

bytes += (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(charsStart, (int)(chars - charsStart), bytes, (int)(bytesMax - bytes));

if (chars >= charsMax)
break;
NonAscii:
byte* bytesMax = _bytes + buffer.Length - offset;
return (int)(bytes - _bytes) + (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(chars, (int)(charsMax - chars), bytes, (int)(bytesMax - bytes));
}
else
{
return (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(chars, charCount, _bytes, buffer.Length - offset);
}

return (int)(bytes - _bytes);
}
}
return 0;
Expand Down
Expand Up @@ -494,6 +494,71 @@ void AssertBytesWritten(Action<XmlDictionaryWriter> action, XmlBinaryNodeType no
}
}

[Fact]
public static void XmlBaseWriter_WriteString()
{
const byte Chars8Text = 152;
const byte Chars16Text = 154;
MemoryStream ms = new MemoryStream();
XmlDictionaryWriter writer = (XmlDictionaryWriter)XmlDictionaryWriter.CreateBinaryWriter(ms);
writer.WriteStartElement("root");

int[] lengths = new[] { 7, 8, 9, 15, 16, 17, 31, 32, 36, 258 };
byte[] buffer = new byte[lengths.Max() + 1];

foreach (var length in lengths)
{
string allAscii = string.Create(length, null, (Span<char> chars, object _) =>
{
for (int i = 0; i < chars.Length; ++i)
chars[i] = (char)(i % 128);
});
string multiByteLast = string.Create(length, null, (Span<char> chars, object _) =>
{
for (int i = 0; i < chars.Length; ++i)
chars[i] = (char)(i % 128);
chars[^1] = '\u00E4'; // 'ä' - Latin Small Letter a with Diaeresis. Latin-1 Supplement.
});

int numBytes = Encoding.UTF8.GetBytes(allAscii, buffer);
Assert.True(numBytes == length, "Test setup wrong - allAscii");
ValidateWriteText(ms, writer, allAscii, expected: buffer.AsSpan(0, numBytes));

numBytes = Encoding.UTF8.GetBytes(multiByteLast, buffer);
Assert.True(numBytes == length + 1, "Test setup wrong - multiByte");
ValidateWriteText(ms, writer, multiByteLast, expected: buffer.AsSpan(0, numBytes));
}

static void ValidateWriteText(MemoryStream ms, XmlDictionaryWriter writer, string text, ReadOnlySpan<byte> expected)
{
writer.Flush();
ms.Seek(0, SeekOrigin.Begin);
ms.SetLength(0);
writer.WriteString(text);
writer.Flush();

ms.TryGetBuffer(out ArraySegment<byte> arraySegment);
ReadOnlySpan<byte> buffer = arraySegment;

if (expected.Length <= byte.MaxValue)
{
Assert.Equal(Chars8Text, buffer[0]);
Assert.Equal(expected.Length, buffer[1]);
buffer = buffer.Slice(2);
}
else if (expected.Length <= ushort.MaxValue)
{
Assert.Equal(Chars16Text, buffer[0]);
Assert.Equal(expected.Length, (int)(buffer[1]) | ((int)buffer[2] << 8));
buffer = buffer.Slice(3);
}
else
Assert.Fail("test use to long length");

AssertExtensions.SequenceEqual(expected, buffer);
}
}

private static bool ReadTest(MemoryStream ms, Encoding encoding, ReaderWriterFactory.ReaderWriterType rwType, byte[] byteArray)
{
ms.Position = 0;
Expand Down