From cd06170c95f5dabd79682becbf278571c1df3269 Mon Sep 17 00:00:00 2001 From: kasperk81 Date: Mon, 19 Sep 2022 13:34:36 +0000 Subject: [PATCH] Remove custom Utf8 encoder implementation. --- .../src/System.Text.Json.csproj | 1 - .../System/Text/Json/Document/JsonDocument.cs | 7 +- .../System/Text/Json/Reader/Utf8JsonReader.cs | 7 +- .../Writer/JsonWriterHelper.Transcoding.cs | 339 ------------------ .../Text/Json/Writer/JsonWriterHelper.cs | 60 ++++ .../Utf8JsonWriter.WriteProperties.Helpers.cs | 4 +- .../Utf8JsonWriter.WriteValues.Comment.cs | 22 +- .../Utf8JsonWriterTests.cs | 17 +- 8 files changed, 91 insertions(+), 366 deletions(-) delete mode 100644 src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Transcoding.cs diff --git a/src/libraries/System.Text.Json/src/System.Text.Json.csproj b/src/libraries/System.Text.Json/src/System.Text.Json.csproj index 635be981ff62ca..6691df19ae1939 100644 --- a/src/libraries/System.Text.Json/src/System.Text.Json.csproj +++ b/src/libraries/System.Text.Json/src/System.Text.Json.csproj @@ -288,7 +288,6 @@ The System.Text.Json library is built-in as part of the shared framework in .NET - diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Document/JsonDocument.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Document/JsonDocument.cs index 45fa176cd2e4db..9042a31b9d07e5 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Document/JsonDocument.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Document/JsonDocument.cs @@ -296,19 +296,16 @@ internal bool TextEquals(int index, ReadOnlySpan otherText, bool isPropert stackalloc byte[JsonConstants.StackallocByteThreshold] : (otherUtf8TextArray = ArrayPool.Shared.Rent(length)); - ReadOnlySpan utf16Text = MemoryMarshal.AsBytes(otherText); - OperationStatus status = JsonWriterHelper.ToUtf8(utf16Text, otherUtf8Text, out int consumed, out int written); + OperationStatus status = JsonWriterHelper.ToUtf8(otherText, otherUtf8Text, out int written); Debug.Assert(status != OperationStatus.DestinationTooSmall); bool result; - if (status > OperationStatus.DestinationTooSmall) // Equivalent to: (status == NeedMoreData || status == InvalidData) + if (status == OperationStatus.InvalidData) { result = false; } else { Debug.Assert(status == OperationStatus.Done); - Debug.Assert(consumed == utf16Text.Length); - result = TextEquals(index, otherUtf8Text.Slice(0, written), isPropertyName, shouldUnescape: true); } diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Reader/Utf8JsonReader.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Reader/Utf8JsonReader.cs index eb8518376681c6..d5d3bf74b822ec 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Reader/Utf8JsonReader.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Reader/Utf8JsonReader.cs @@ -531,19 +531,16 @@ public readonly bool ValueTextEquals(ReadOnlySpan text) otherUtf8Text = stackalloc byte[JsonConstants.StackallocByteThreshold]; } - ReadOnlySpan utf16Text = MemoryMarshal.AsBytes(text); - OperationStatus status = JsonWriterHelper.ToUtf8(utf16Text, otherUtf8Text, out int consumed, out int written); + OperationStatus status = JsonWriterHelper.ToUtf8(text, otherUtf8Text, out int written); Debug.Assert(status != OperationStatus.DestinationTooSmall); bool result; - if (status > OperationStatus.DestinationTooSmall) // Equivalent to: (status == NeedMoreData || status == InvalidData) + if (status == OperationStatus.InvalidData) { result = false; } else { Debug.Assert(status == OperationStatus.Done); - Debug.Assert(consumed == utf16Text.Length); - result = TextEqualsHelper(otherUtf8Text.Slice(0, written)); } diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Transcoding.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Transcoding.cs deleted file mode 100644 index 71514d77d36493..00000000000000 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.Transcoding.cs +++ /dev/null @@ -1,339 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -using System.Buffers; -using System.Diagnostics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; - -namespace System.Text.Json -{ - internal static partial class JsonWriterHelper - { - // TODO: Replace this with publicly shipping implementation: https://github.com/dotnet/runtime/issues/28204 - /// - /// Converts a span containing a sequence of UTF-16 bytes into UTF-8 bytes. - /// - /// This method will consume as many of the input bytes as possible. - /// - /// On successful exit, the entire input was consumed and encoded successfully. In this case, will be - /// equal to the length of the and will equal the total number of bytes written to - /// the . - /// - /// A span containing a sequence of UTF-16 bytes. - /// A span to write the UTF-8 bytes into. - /// On exit, contains the number of bytes that were consumed from the . - /// On exit, contains the number of bytes written to - /// A value representing the state of the conversion. - public static unsafe OperationStatus ToUtf8(ReadOnlySpan utf16Source, Span utf8Destination, out int bytesConsumed, out int bytesWritten) - { - // - // - // KEEP THIS IMPLEMENTATION IN SYNC WITH https://github.com/dotnet/runtime/blob/main/src/libraries/System.Private.CoreLib/src/System/Text/UTF8Encoding.cs#L845 - // - // - fixed (byte* chars = &MemoryMarshal.GetReference(utf16Source)) - fixed (byte* bytes = &MemoryMarshal.GetReference(utf8Destination)) - { - char* pSrc = (char*)chars; - byte* pTarget = bytes; - - char* pEnd = (char*)(chars + utf16Source.Length); - byte* pAllocatedBufferEnd = pTarget + utf8Destination.Length; - - // assume that JIT will enregister pSrc, pTarget and ch - - // Entering the fast encoding loop incurs some overhead that does not get amortized for small - // number of characters, and the slow encoding loop typically ends up running for the last few - // characters anyway since the fast encoding loop needs 5 characters on input at least. - // Thus don't use the fast decoding loop at all if we don't have enough characters. The threashold - // was chosen based on performance testing. - // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop. - while (pEnd - pSrc > 13) - { - // we need at least 1 byte per character, but Convert might allow us to convert - // only part of the input, so try as much as we can. Reduce charCount if necessary - int available = Math.Min(PtrDiff(pEnd, pSrc), PtrDiff(pAllocatedBufferEnd, pTarget)); - - // FASTLOOP: - // - optimistic range checks - // - fallbacks to the slow loop for all special cases, exception throwing, etc. - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates - // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop. - char* pStop = pSrc + available - 5; - if (pSrc >= pStop) - break; - - do - { - int ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - { - goto LongCode; - } - *pTarget = (byte)ch; - pTarget++; - - // get pSrc aligned - if ((unchecked((int)pSrc) & 0x2) != 0) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) - { - goto LongCode; - } - *pTarget = (byte)ch; - pTarget++; - } - - // Run 4 characters at a time! - while (pSrc < pStop) - { - ch = *(int*)pSrc; - int chc = *(int*)(pSrc + 2); - if (((ch | chc) & unchecked((int)0xFF80FF80)) != 0) - { - goto LongCodeWithMask; - } - - // Unfortunately, this is endianness sensitive - if (!BitConverter.IsLittleEndian) - { - *pTarget = (byte)(ch >> 16); - *(pTarget + 1) = (byte)ch; - pSrc += 4; - *(pTarget + 2) = (byte)(chc >> 16); - *(pTarget + 3) = (byte)chc; - pTarget += 4; - } - else - { - *pTarget = (byte)ch; - *(pTarget + 1) = (byte)(ch >> 16); - pSrc += 4; - *(pTarget + 2) = (byte)chc; - *(pTarget + 3) = (byte)(chc >> 16); - pTarget += 4; - } - } - continue; - - LongCodeWithMask: - if (!BitConverter.IsLittleEndian) - { - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); - } - else - { - ch = (char)ch; - } - pSrc++; - - if (ch > 0x7F) - { - goto LongCode; - } - *pTarget = (byte)ch; - pTarget++; - continue; - - LongCode: - // use separate helper variables for slow and fast loop so that the jit optimizations - // won't get confused about the variable lifetimes - int chd; - if (ch <= 0x7FF) - { - // 2 byte encoding - chd = unchecked((sbyte)0xC0) | (ch >> 6); - } - else - { - // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch)) - if (!JsonHelpers.IsInRangeInclusive(ch, JsonConstants.HighSurrogateStart, JsonConstants.LowSurrogateEnd)) - { - // 3 byte encoding - chd = unchecked((sbyte)0xE0) | (ch >> 12); - } - else - { - // 4 byte encoding - high surrogate + low surrogate - // if (!IsHighSurrogate(ch)) - if (ch > JsonConstants.HighSurrogateEnd) - { - // low without high -> bad - goto InvalidData; - } - - chd = *pSrc; - - // if (!IsLowSurrogate(chd)) { - if (!JsonHelpers.IsInRangeInclusive(chd, JsonConstants.LowSurrogateStart, JsonConstants.LowSurrogateEnd)) - { - // high not followed by low -> bad - goto InvalidData; - } - - pSrc++; - - ch = chd + (ch << 10) + - (0x10000 - - JsonConstants.LowSurrogateStart - - (JsonConstants.HighSurrogateStart << 10)); - - *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18)); - // pStop - this byte is compensated by the second surrogate character - // 2 input chars require 4 output bytes. 2 have been anticipated already - // and 2 more will be accounted for by the 2 pStop-- calls below. - pTarget++; - - chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F; - } - *pTarget = (byte)chd; - pStop--; // 3 byte sequence for 1 char, so need pStop-- and the one below too. - pTarget++; - - chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F; - } - *pTarget = (byte)chd; - pStop--; // 2 byte sequence for 1 char so need pStop--. - - *(pTarget + 1) = (byte)(unchecked((sbyte)0x80) | ch & 0x3F); - // pStop - this byte is already included - - pTarget += 2; - } - while (pSrc < pStop); - - Debug.Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd"); - } - - while (pSrc < pEnd) - { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - - // read next char. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - int ch = *pSrc; - pSrc++; - - if (ch <= 0x7F) - { - if (pAllocatedBufferEnd - pTarget <= 0) - goto DestinationFull; - - *pTarget = (byte)ch; - pTarget++; - continue; - } - - int chd; - if (ch <= 0x7FF) - { - if (pAllocatedBufferEnd - pTarget <= 1) - goto DestinationFull; - - // 2 byte encoding - chd = unchecked((sbyte)0xC0) | (ch >> 6); - } - else - { - // if (!IsLowSurrogate(ch) && !IsHighSurrogate(ch)) - if (!JsonHelpers.IsInRangeInclusive(ch, JsonConstants.HighSurrogateStart, JsonConstants.LowSurrogateEnd)) - { - if (pAllocatedBufferEnd - pTarget <= 2) - goto DestinationFull; - - // 3 byte encoding - chd = unchecked((sbyte)0xE0) | (ch >> 12); - } - else - { - if (pAllocatedBufferEnd - pTarget <= 3) - goto DestinationFull; - - // 4 byte encoding - high surrogate + low surrogate - // if (!IsHighSurrogate(ch)) - if (ch > JsonConstants.HighSurrogateEnd) - { - // low without high -> bad - goto InvalidData; - } - - if (pSrc >= pEnd) - goto NeedMoreData; - - chd = *pSrc; - - // if (!IsLowSurrogate(chd)) { - if (!JsonHelpers.IsInRangeInclusive(chd, JsonConstants.LowSurrogateStart, JsonConstants.LowSurrogateEnd)) - { - // high not followed by low -> bad - goto InvalidData; - } - - pSrc++; - - ch = chd + (ch << 10) + - (0x10000 - - JsonConstants.LowSurrogateStart - - (JsonConstants.HighSurrogateStart << 10)); - - *pTarget = (byte)(unchecked((sbyte)0xF0) | (ch >> 18)); - pTarget++; - - chd = unchecked((sbyte)0x80) | (ch >> 12) & 0x3F; - } - *pTarget = (byte)chd; - pTarget++; - - chd = unchecked((sbyte)0x80) | (ch >> 6) & 0x3F; - } - - *pTarget = (byte)chd; - *(pTarget + 1) = (byte)(unchecked((sbyte)0x80) | ch & 0x3F); - - pTarget += 2; - } - - bytesConsumed = (int)((byte*)pSrc - chars); - bytesWritten = (int)(pTarget - bytes); - return OperationStatus.Done; - - InvalidData: - bytesConsumed = (int)((byte*)(pSrc - 1) - chars); - bytesWritten = (int)(pTarget - bytes); - return OperationStatus.InvalidData; - - DestinationFull: - bytesConsumed = (int)((byte*)(pSrc - 1) - chars); - bytesWritten = (int)(pTarget - bytes); - return OperationStatus.DestinationTooSmall; - - NeedMoreData: - bytesConsumed = (int)((byte*)(pSrc - 1) - chars); - bytesWritten = (int)(pTarget - bytes); - return OperationStatus.NeedMoreData; - } - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe int PtrDiff(char* a, char* b) - { - return (int)(((uint)((byte*)a - (byte*)b)) >> 1); - } - - // byte* flavor just for parity - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe int PtrDiff(byte* a, byte* b) - { - return (int)(a - b); - } - } -} diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.cs index 86a9732192ed2a..ffb522cf876a49 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/JsonWriterHelper.cs @@ -1,8 +1,10 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Buffers; using System.Diagnostics; using System.Runtime.CompilerServices; +using System.Text.Unicode; namespace System.Text.Json { @@ -229,5 +231,63 @@ internal static void ValidateNumber(ReadOnlySpan utf8FormattedNumber) nameof(utf8FormattedNumber)); } } + + private static readonly UTF8Encoding s_utf8Encoding = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true); + + public static unsafe bool IsValidUtf8String(ReadOnlySpan bytes) + { + try + { +#if NETCOREAPP + s_utf8Encoding.GetCharCount(bytes); +#else + if (!bytes.IsEmpty) + { + fixed (byte* ptr = bytes) + { + s_utf8Encoding.GetCharCount(ptr, bytes.Length); + } + } +#endif + return true; + } + catch (DecoderFallbackException) + { + return false; + } + } + + internal static unsafe OperationStatus ToUtf8(ReadOnlySpan source, Span destination, out int written) + { +#if NETCOREAPP + OperationStatus status = Utf8.FromUtf16(source, destination, out int charsRead, out written, replaceInvalidSequences: false, isFinalBlock: true); + Debug.Assert(status is OperationStatus.Done or OperationStatus.DestinationTooSmall or OperationStatus.InvalidData); + Debug.Assert(charsRead == source.Length || status is not OperationStatus.Done); + return status; +#else + written = 0; + try + { + if (!source.IsEmpty) + { + fixed (char* charPtr = source) + fixed (byte* destPtr = destination) + { + written = s_utf8Encoding.GetBytes(charPtr, source.Length, destPtr, destination.Length); + } + } + + return OperationStatus.Done; + } + catch (EncoderFallbackException) + { + return OperationStatus.InvalidData; + } + catch (ArgumentException) + { + return OperationStatus.DestinationTooSmall; + } +#endif + } } } diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteProperties.Helpers.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteProperties.Helpers.cs index 0c68ef88f8fb92..cd56c0a99aa9e3 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteProperties.Helpers.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteProperties.Helpers.cs @@ -205,10 +205,8 @@ private void WritePropertyNameIndented(ReadOnlySpan escapedPropertyName, b [MethodImpl(MethodImplOptions.AggressiveInlining)] private void TranscodeAndWrite(ReadOnlySpan escapedPropertyName, Span output) { - ReadOnlySpan byteSpan = MemoryMarshal.AsBytes(escapedPropertyName); - OperationStatus status = JsonWriterHelper.ToUtf8(byteSpan, output.Slice(BytesPending), out int consumed, out int written); + OperationStatus status = JsonWriterHelper.ToUtf8(escapedPropertyName, output.Slice(BytesPending), out int written); Debug.Assert(status == OperationStatus.Done); - Debug.Assert(consumed == byteSpan.Length); BytesPending += written; } } diff --git a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs index 4da708c6eb0430..df318e58717297 100644 --- a/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs +++ b/src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Comment.cs @@ -3,7 +3,6 @@ using System.Buffers; using System.Diagnostics; -using System.Runtime.InteropServices; namespace System.Text.Json { @@ -86,9 +85,13 @@ private void WriteCommentMinimized(ReadOnlySpan value) output[BytesPending++] = JsonConstants.Slash; output[BytesPending++] = JsonConstants.Asterisk; - ReadOnlySpan byteSpan = MemoryMarshal.AsBytes(value); - OperationStatus status = JsonWriterHelper.ToUtf8(byteSpan, output.Slice(BytesPending), out int _, out int written); + OperationStatus status = JsonWriterHelper.ToUtf8(value, output.Slice(BytesPending), out int written); Debug.Assert(status != OperationStatus.DestinationTooSmall); + if (status == OperationStatus.InvalidData) + { + ThrowHelper.ThrowArgumentException_InvalidUTF16(value[written]); + } + BytesPending += written; output[BytesPending++] = JsonConstants.Asterisk; @@ -124,9 +127,13 @@ private void WriteCommentIndented(ReadOnlySpan value) output[BytesPending++] = JsonConstants.Slash; output[BytesPending++] = JsonConstants.Asterisk; - ReadOnlySpan byteSpan = MemoryMarshal.AsBytes(value); - OperationStatus status = JsonWriterHelper.ToUtf8(byteSpan, output.Slice(BytesPending), out int _, out int written); + OperationStatus status = JsonWriterHelper.ToUtf8(value, output.Slice(BytesPending), out int written); Debug.Assert(status != OperationStatus.DestinationTooSmall); + if (status == OperationStatus.InvalidData) + { + ThrowHelper.ThrowArgumentException_InvalidUTF16(value[written]); + } + BytesPending += written; output[BytesPending++] = JsonConstants.Asterisk; @@ -152,6 +159,11 @@ public void WriteCommentValue(ReadOnlySpan utf8Value) ThrowHelper.ThrowArgumentException_InvalidCommentValue(); } + if (!JsonWriterHelper.IsValidUtf8String(utf8Value)) + { + ThrowHelper.ThrowArgumentException_InvalidUTF8(utf8Value); + } + WriteCommentByOptions(utf8Value); } diff --git a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs index 4868c2591071c1..9562636cfce0d6 100644 --- a/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs +++ b/src/libraries/System.Text.Json/tests/System.Text.Json.Tests/Utf8JsonWriterTests.cs @@ -4134,10 +4134,16 @@ public void WriteInvalidComment(bool formatted, bool skipValidation) using var jsonUtf8 = new Utf8JsonWriter(output, options); string comment = "comment is */ invalid"; - Assert.Throws(() => jsonUtf8.WriteCommentValue(comment)); Assert.Throws(() => jsonUtf8.WriteCommentValue(comment.AsSpan())); Assert.Throws(() => jsonUtf8.WriteCommentValue(Encoding.UTF8.GetBytes(comment))); + + comment = "comment with unpaired surrogate \udc00"; + Assert.Throws(() => jsonUtf8.WriteCommentValue(comment)); + Assert.Throws(() => jsonUtf8.WriteCommentValue(comment.AsSpan())); + + var invalidUtf8 = new byte[2] { 0xc3, 0x28 }; + Assert.Throws(() => jsonUtf8.WriteCommentValue(invalidUtf8)); } [Theory] @@ -4162,16 +4168,11 @@ public void WriteCommentsInvalidTextAllowed(bool formatted, bool skipValidation) jsonUtf8.WriteCommentValue(comment.AsSpan()); jsonUtf8.WriteCommentValue(Encoding.UTF8.GetBytes(comment)); - comment = "comment is / * valid even with unpaired surrogate \udc00 this part no longer visible"; + comment = "comment is / * valid"; jsonUtf8.WriteCommentValue(comment); jsonUtf8.WriteCommentValue(comment.AsSpan()); jsonUtf8.Flush(); - - // Explicitly skipping flushing here - var invalidUtf8 = new byte[2] { 0xc3, 0x28 }; - jsonUtf8.WriteCommentValue(invalidUtf8); - string expectedStr = GetCommentExpectedString(prettyPrint: formatted); JsonTestHelper.AssertContents(expectedStr, output); } @@ -4197,7 +4198,7 @@ private static string GetCommentExpectedString(bool prettyPrint) json.WriteComment(comment); json.WriteComment(comment); - comment = "comment is / * valid even with unpaired surrogate "; + comment = "comment is / * valid"; json.WriteComment(comment); json.WriteComment(comment);