Unroll ARM64 ASCII narrowing loop 2x in NarrowUtf16ToAscii #126658
Closed
EgorBo wants to merge 1 commit intodotnet:mainfrom
Closed
Unroll ARM64 ASCII narrowing loop 2x in NarrowUtf16ToAscii#126658EgorBo wants to merge 1 commit intodotnet:mainfrom
EgorBo wants to merge 1 commit intodotnet:mainfrom
Conversation
Contributor
|
Tagging subscribers to this area: @dotnet/area-system-text-json |
Member
Author
|
@EgorBot -linux_azure_arm -linux_aws_amd -linux_aws_arm -profiler
using System.Text;
using System.Text.Json;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
BenchmarkSwitcher.FromAssembly(typeof(Benchmarks).Assembly).Run(args);
[MemoryDiagnoser]
public class Benchmarks
{
// ── TokenSerialization fields ────────────────────────────────────────────
private List<object> _tokenObjects;
[ThreadStatic] static Utf8JsonWriter t_writer;
[ThreadStatic] static MemoryStream t_stream;
/// <summary>
/// Populates <c>_tokenObjects</c> with 200 entries before the benchmark runs.
/// Indices divisible by three hold a raw JSON string from <c>GenerateRecordJson(1)</c>;
/// all other indices hold a three-entry dictionary ("seq", "label", "blob").
/// </summary>
[GlobalSetup]
public void Setup()
{
    const int TokenCount = 200;

    _tokenObjects = new List<object>(TokenCount);

    for (int index = 0; index < TokenCount; index++)
    {
        if (index % 3 != 0)
        {
            // Keys are inserted in the order seq, label, blob.
            var entry = new Dictionary<string, object>();
            entry["seq"] = index;
            entry["label"] = $"item_{index}";
            entry["blob"] = new byte[100];

            _tokenObjects.Add(entry);
            continue;
        }

        // A separate string is generated for every raw-JSON slot; instances are not shared.
        _tokenObjects.Add(GenerateRecordJson(1));
    }
}
/// <summary>
/// Builds the JSON text of a "product" record. After a fixed header, synthetic
/// field descriptors are appended to the "Fields" array for as long as the text
/// is shorter than <c>targetSizeKb * 1024 - 512</c> characters.
/// </summary>
/// <param name="targetSizeKb">Size target in units of 1024 characters; also used to presize the builder.</param>
/// <returns>The generated JSON text.</returns>
private static string GenerateRecordJson(int targetSizeKb = 150)
{
    int targetLength = targetSizeKb * 1024;

    // Field descriptors stop being added once the text is within 512 characters of the target.
    int fillLimit = targetLength - 512;

    var json = new StringBuilder(targetLength + 512);

    // Fixed record header, up to and including the opening bracket of "Fields".
    json.Append('{')
        .Append("\"TypeName\":\"product\",")
        .Append("\"CategoryCode\":1,")
        .Append("\"Label\":\"Product\",")
        .Append("\"IsAction\":false,")
        .Append("\"IsActionMember\":false,")
        .Append("\"IsTrackingEnabled\":true,")
        .Append("\"IsAvailableLocal\":true,")
        .Append("\"IsChildRecord\":false,")
        .Append("\"IsLinksEnabled\":true,")
        .Append("\"IsCustomRecord\":false,")
        .Append("\"PrimaryKeyField\":\"productid\",")
        .Append("\"PrimaryLabelField\":\"title\",")
        .Append("\"Fields\":[");

    for (int ordinal = 0; json.Length < fillLimit; ordinal++)
    {
        // Every descriptor except the first is preceded by a separator.
        if (ordinal > 0)
        {
            json.Append(',');
        }

        json.Append('{');
        json.Append($"\"TypeName\":\"field_{ordinal}\",");
        json.Append($"\"InternalName\":\"Field_{ordinal}\",");
        json.Append("\"FieldType\":\"String\",");
        json.Append($"\"Label\":\"Field {ordinal}\",");
        json.Append("\"MaxSize\":100,")
            .Append("\"IsReadable\":true,")
            .Append("\"IsCreatable\":true,")
            .Append("\"IsUpdatable\":true,")
            .Append("\"IsTrackingEnabled\":false,")
            .Append("\"IsPrimaryKey\":false,")
            .Append("\"IsVirtual\":false,")
            .Append("\"Requirement\":\"None\"")
            .Append('}');
    }

    // Close the "Fields" array and the record object.
    json.Append(']').Append('}');

    return json.ToString();
}
/// <summary>
/// Benchmark body: writes <c>{"Catalog":[ ... ]}</c> to a reused stream, emitting each
/// string token through <c>WriteRawValue</c> and each dictionary token as a JSON object
/// whose values go through <c>JsonSerializer.Serialize</c>.
/// </summary>
/// <exception cref="InvalidOperationException">Nothing was written to the stream.</exception>
[Benchmark]
public void TokenSerialization()
{
    // Reuse the thread-static stream; rewind and truncate it so each invocation starts empty.
    var stream = t_stream ??= new MemoryStream(64 * 1024);
    stream.Position = 0;
    stream.SetLength(0);

    // Reuse the thread-static writer as well; on first use create it with validation skipped.
    var writer = t_writer;
    if (writer is null)
    {
        writer = new Utf8JsonWriter(stream, new JsonWriterOptions { SkipValidation = true });
        t_writer = writer;
    }
    else
    {
        writer.Reset(stream);
    }

    writer.WriteStartObject();
    writer.WriteStartArray("Catalog");

    foreach (var token in _tokenObjects)
    {
        if (token is string strToken)
        {
            // Null or empty strings are skipped rather than written as raw values.
            if (!string.IsNullOrEmpty(strToken))
            {
                writer.WriteRawValue(strToken);
            }
        }
        else if (token is Dictionary<string, object> dictToken)
        {
            writer.WriteStartObject();
            foreach (var kvp in dictToken)
            {
                writer.WritePropertyName(kvp.Key);
                JsonSerializer.Serialize(writer, kvp.Value);
            }
            writer.WriteEndObject();
        }
    }

    writer.WriteEndArray();
    writer.WriteEndObject();
    writer.Flush();

    // Reads the output length after the flush and fails loudly if nothing was produced.
    // A specific exception type is thrown instead of the reserved base System.Exception.
    if (stream.Length == 0)
    {
        throw new InvalidOperationException("unreachable");
    }
}
} |
Contributor
There was a problem hiding this comment.
Pull request overview
This PR optimizes the UTF-16 Utf8JsonWriter.WriteRawValue(string/ReadOnlySpan<char>) path by transcoding directly into the writer’s output buffer and validating in-place, eliminating the temporary buffer + copy used previously.
Changes:
- Removes the temporary UTF-8 buffer (stackalloc/ArrayPool/normal alloc) and the subsequent copy into the writer buffer.
- Implements a delayed-commit approach so writer state is only updated after validation succeeds.
- Adds an early empty-input check in the UTF-16 raw-value path.
src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Raw.cs
Outdated
Show resolved
Hide resolved
src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf8JsonWriter.WriteValues.Raw.cs
Outdated
Show resolved
Hide resolved
…ffer

Eliminate temporary buffer allocation and extra copy in the UTF-16 WriteRawValue path by transcoding directly into the writer's output buffer and validating in-place.

Before: rent temp buffer -> transcode to temp -> validate temp -> copy to output -> return buffer
After: grow output buffer -> transcode to output -> validate in-place -> commit state

This removes per-call ArrayPool rent/clear/return overhead and one full buffer copy. Uses a delayed-commit pattern where no writer state (BytesPending, _tokenType, separator flag) is mutated until validation succeeds, keeping error handling safe.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
77a9e15 to
9a780ae
Compare
This was referenced Apr 8, 2026
Open
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
Unroll the ARM64 inner loop in `NarrowUtf16ToAscii_Intrinsified` to process 32 chars per iteration (4 Vector128 loads → 2 narrows → 2 stores) instead of 16. This better utilizes the dual vector execution units on modern ARM64 cores (Neoverse-V2/N2, Cortex-X series) and matches AVX2 throughput while staying within the 128-bit NEON vector constraint.

Background
Investigating a ~1.7x performance gap between ARM64 (Graviton4/Cobalt100) and x64 (AMD EPYC) on a JSON serialization benchmark (EgorBot/Benchmarks#94), profiling identified `Ascii.NarrowUtf16ToAscii` as a key contributor. The function dispatches to: `_Intrinsified_512`, `_Intrinsified_256`, `_Intrinsified`.

Design
- `AdvSimd.IsSupported` (JIT-time constant), so it's dead-code-eliminated on x64
- `do-while` loop is changed to `while` to safely handle the case where the unrolled loop consumed all available data