-
Notifications
You must be signed in to change notification settings - Fork 4.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[API Proposal]: JsonEncoder (rather than JavaScriptEncoder) #86810
Comments
Tagging subscribers to this area: @dotnet/area-system-text-json, @gregsdennis. Issue Details. Background and motivation: Encoding JSON today is unnecessarily complicated in multiple ways:
A better use of the Strategy Pattern here allows a JSON encoder to decide only within the range of actual valid options for JSON encoding - whether to encode a character, and, if so, which of a limited number of valid encoding variants to use. API Proposalusing System.Text;
using System.Text.Encodings.Web;
using System.Text.Json;
/// <summary>
/// Strategy that chooses which characters of a JSON string are escaped and which
/// escape style is preferred for them.
/// </summary>
public abstract class JsonEncoder
{
/// <summary>
/// Returns the index of the first character in <paramref name="text"/> that this encoder
/// wants escaped, or -1 when there is none.
/// </summary>
public abstract int FindFirstCharacterToEncode(Span<char> text);
/// <summary>
/// Returns the escape style preferences to apply when <paramref name="unicodeScalar"/> is escaped.
/// </summary>
public abstract JsonEncodeStyle EncodeUnicodeScalar(int unicodeScalar);
/// <summary>
/// Returns true when this encoder wants <paramref name="unicodeScalar"/> to be escaped.
/// </summary>
public abstract bool WillEncode(int unicodeScalar);
}
public struct JsonEncodeStyle
{
/// <summary>
/// Creates a style with both preferences turned off.
/// </summary>
/// <remarks>
/// Chains to the two-argument constructor so every field is explicitly assigned;
/// C# 10 rejects an explicit parameterless struct constructor that leaves fields unassigned.
/// </remarks>
public JsonEncodeStyle()
    : this(preferTwoCharacterEscape: false, preferUppercase: false)
{
}
/// <summary>
/// Creates a style with the given preferences.
/// </summary>
/// <param name="preferTwoCharacterEscape">Value stored in <see cref="PreferTwoCharacterEscape"/>.</param>
/// <param name="preferUppercase">Value stored in <see cref="PreferUppercase"/>.</param>
public JsonEncodeStyle(bool preferTwoCharacterEscape, bool preferUppercase)
{
    PreferTwoCharacterEscape = preferTwoCharacterEscape;
    PreferUppercase = preferUppercase;
}
/// <summary>
/// When true, the two-character escape form (for example <c>\n</c>) is preferred over
/// <c>\uXXXX</c> for characters that have one.
/// </summary>
public readonly bool PreferTwoCharacterEscape;
/// <summary>
/// When true, hexadecimal digits in <c>\uXXXX</c> escapes are written as A-F rather than a-f.
/// </summary>
public readonly bool PreferUppercase;
} To integrate with the existing JavaScriptEncoder, code like this could be used, though ideally the JsonSerializer could also accept the new encoder type directly (perhaps Encoder = vs JsonEncoder = on JsonSerializerOptions, though that's a separate question): public class JavaScriptJsonEncoderAdapter : JavaScriptEncoder
{
const int unicodeEscapeLength = 6; // "\\u0000".Length
readonly JsonEncoder inner;
/// <summary>
/// Wraps <paramref name="inner"/> so it can be used wherever a JavaScriptEncoder is expected.
/// </summary>
/// <param name="inner">The JSON encoder whose decisions this adapter applies.</param>
/// <exception cref="ArgumentNullException"><paramref name="inner"/> is null.</exception>
public JavaScriptJsonEncoderAdapter(JsonEncoder inner)
{
    this.inner = inner ?? throw new ArgumentNullException(nameof(inner));
}
public override int MaxOutputCharactersPerInputCharacter => unicodeEscapeLength;
/// <summary>
/// Finds the first character that must be escaped, either because JSON requires it
/// or because the inner encoder asks for it.
/// </summary>
/// <param name="text">Pointer to the first character to examine.</param>
/// <param name="textLength">Number of characters available at <paramref name="text"/>.</param>
/// <returns>The smaller of the two candidate indexes, or -1 when neither source reports one.</returns>
public override unsafe int FindFirstCharacterToEncode(char* text, int textLength)
{
    if (textLength == 0)
    {
        return -1;
    }

    // First position that JSON itself requires to be escaped.
    int mandatoryIndex = -1;
    int position = 0;
    while (position < textLength && mandatoryIndex == -1)
    {
        if (IsAlwaysEscaped(text[position]))
        {
            mandatoryIndex = position;
        }

        position++;
    }

    // First position the wrapped encoder would like escaped.
    int requestedIndex = inner.FindFirstCharacterToEncode(new Span<char>(text, textLength));

    if (requestedIndex == -1)
    {
        return mandatoryIndex;
    }

    if (mandatoryIndex == -1)
    {
        return requestedIndex;
    }

    return mandatoryIndex < requestedIndex ? mandatoryIndex : requestedIndex;
}
/// <summary>
/// Writes the JSON escape sequence for <paramref name="unicodeScalar"/> into <paramref name="buffer"/>,
/// using the style chosen by the inner encoder.
/// </summary>
/// <param name="unicodeScalar">The scalar value to escape.</param>
/// <param name="buffer">Destination for the escaped characters.</param>
/// <param name="bufferLength">Number of characters available at <paramref name="buffer"/>.</param>
/// <param name="numberOfCharactersWritten">Characters written on success; 0 when the buffer is too small.</param>
/// <returns>True when the escape was written; false when <paramref name="buffer"/> is too small.</returns>
public override unsafe bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength,
    out int numberOfCharactersWritten)
{
    // Even the shortest escape (for example \b) needs two characters.
    if (bufferLength < 2)
    {
        numberOfCharactersWritten = 0;
        return false;
    }

    JsonEncodeStyle style = inner.EncodeUnicodeScalar(unicodeScalar);
    Rune scalar = new Rune(unicodeScalar);

    bool shortForm = unicodeScalar < char.MaxValue
        && HasTwoCharacterEscape((char)unicodeScalar)
        && style.PreferTwoCharacterEscape;

    // One \uXXXX group is needed per UTF-16 code unit (two for a surrogate pair).
    int codeUnits = scalar.Utf16SequenceLength;
    int required = shortForm ? 2 : codeUnits * unicodeEscapeLength;
    if (bufferLength < required)
    {
        numberOfCharactersWritten = 0;
        return false;
    }

    Span<char> output = new Span<char>(buffer, bufferLength);
    if (shortForm)
    {
        output[0] = '\\';
        output[1] = GetTwoCharacterEscapeSuffix((char)unicodeScalar);
    }
    else
    {
        bool upper = style.PreferUppercase;
        Span<char> utf16 = stackalloc char[2];
        scalar.EncodeToUtf16(utf16);

        for (int unit = 0; unit < codeUnits; unit++)
        {
            char current = utf16[unit];
            int offset = unit * unicodeEscapeLength;
            output[offset] = '\\';
            output[offset + 1] = 'u';
            output[offset + 2] = ToHexDigit((current >> 12) & 0xf, upper);
            output[offset + 3] = ToHexDigit((current >> 8) & 0xf, upper);
            output[offset + 4] = ToHexDigit((current >> 4) & 0xf, upper);
            output[offset + 5] = ToHexDigit(current & 0xf, upper);
        }
    }

    numberOfCharactersWritten = required;
    return true;
}
/// <summary>
/// Reports whether <paramref name="unicodeScalar"/> will be escaped, either because
/// <see cref="IsAlwaysEscaped"/> says so or because the inner encoder asks for it.
/// </summary>
public override bool WillEncode(int unicodeScalar)
{
    // Mandatory escapes win regardless of what the inner encoder says.
    if (unicodeScalar < char.MaxValue && IsAlwaysEscaped((char)unicodeScalar))
    {
        return true;
    }

    return inner.WillEncode(unicodeScalar);
}
/// <summary>
/// Returns true when <paramref name="value"/> has a two-character escape form in JSON.
/// </summary>
public static bool HasTwoCharacterEscape(char value)
{
    // RFC 8259, Section 7, "char = " BNF
    return value is '"' or '\\' or '/' or '\b' or '\f' or '\n' or '\r' or '\t';
}
/// <summary>
/// Returns true when JSON requires <paramref name="value"/> to be escaped inside a string:
/// the quotation mark, the reverse solidus, and the control characters U+0000 through U+001F.
/// </summary>
public static bool IsAlwaysEscaped(char value)
{
    // RFC 8259, Section 7, first paragraph. The control range is inclusive of U+001F.
    return value == '"' || value == '\\' || value <= '\u001f';
}
/// <summary>
/// Returns the character that follows the backslash in the two-character escape for <paramref name="value"/>.
/// </summary>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="value"/> has no two-character escape.</exception>
static char GetTwoCharacterEscapeSuffix(char value)
{
    // RFC 8259, Section 7, "char = " BNF
    return value switch
    {
        '"' => '"',
        '\\' => '\\',
        '/' => '/',
        '\b' => 'b',
        '\f' => 'f',
        '\n' => 'n',
        '\r' => 'r',
        '\t' => 't',
        _ => throw new ArgumentOutOfRangeException(nameof(value)),
    };
}
/// <summary>
/// Converts a value in the range 0-15 to its hexadecimal digit.
/// </summary>
/// <param name="value">The nibble to convert.</param>
/// <param name="preferUppercase">True to produce A-F, false to produce a-f.</param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="value"/> is outside 0-15.</exception>
static char ToHexDigit(int value, bool preferUppercase)
{
    if (value < 0 || value > 0xf)
    {
        throw new ArgumentOutOfRangeException(nameof(value));
    }

    // Only 0-9 are decimal digits; 10 is the first letter digit.
    if (value < 10)
    {
        return (char)('0' + value);
    }

    char letterBase = preferUppercase ? 'A' : 'a';
    return (char)(letterBase + (value - 0xa));
}
} API Usageusing System.Text;
using System.Text.Encodings.Web;
using System.Text.Json;
/// <summary>
/// Encoder that never asks for any character to be escaped and returns the default style.
/// </summary>
class MinimalJsonEncoder : JsonEncoder
{
    /// <summary>Always reports that nothing needs encoding.</summary>
    public override int FindFirstCharacterToEncode(Span<char> text) => -1;

    /// <summary>Always returns the default style.</summary>
    public override JsonEncodeStyle EncodeUnicodeScalar(int unicodeScalar) => default;

    /// <summary>Always returns false.</summary>
    public override bool WillEncode(int unicodeScalar) => false;
}
/// <summary>
/// Encoder that asks for every character to be escaped and returns the default style.
/// </summary>
class MaximalJsonEncoder : JsonEncoder
{
    /// <summary>Reports index 0 for any non-empty input, and -1 for empty input.</summary>
    public override int FindFirstCharacterToEncode(Span<char> text) => text.Length == 0 ? -1 : 0;

    /// <summary>Always returns the default style.</summary>
    public override JsonEncodeStyle EncodeUnicodeScalar(int unicodeScalar) => default;

    /// <summary>Always returns true.</summary>
    public override bool WillEncode(int unicodeScalar) => true;
}
// Or similarly to choose some in-between set of characters to encode.
class Program
{
/// <summary>
/// Serializes a list of sample strings through the adapter wrapping MinimalJsonEncoder
/// and prints the resulting JSON.
/// </summary>
static void Main()
{
    // Write console output as UTF-8.
    Console.OutputEncoding = Encoding.UTF8;

    List<string> data = new List<string>
    {
        "English",
        "日本語",
        "עברית",
        "ქართული ენა",
        "中文",
        "𑄌𑄋𑄴𑄟𑄳𑄦 𑄃𑄧𑄏𑄛𑄖𑄴",
    };

    JsonSerializerOptions options = new JsonSerializerOptions
    {
        Encoder = new JavaScriptJsonEncoderAdapter(new MinimalJsonEncoder()),
    };

    string actual = JsonSerializer.Serialize(data, options);
    Console.WriteLine(actual);
}
} Alternative DesignsThe adapter above ensures the JSON output is always valid, at the expense of having both FindFirstCharacterToEncode and WillEncode both do some extra processing, to determine if a character must be encoded in order for the JSON to be valid. A less-safe but higher-performance alternative would be to require every JsonEncoder implementation to escape the required characters. (So the correct MinimalJsonEncoder usage above would have to encode ", \, and control characters.) RisksNo response
|
Tagging subscribers to this area: @dotnet/area-system-text-encodings-web. Issue Details. Background and motivation: Encoding JSON today is unnecessarily complicated in multiple ways:
A better use of the Strategy Pattern here allows a JSON encoder to decide only within the range of actual valid options for JSON encoding - whether to encode a character, and, if so, which of a limited number of valid encoding variants to use. API Proposalusing System.Text;
using System.Text.Encodings.Web;
using System.Text.Json;
/// <summary>
/// Strategy that chooses which characters of a JSON string are escaped and which
/// escape style is preferred for them.
/// </summary>
public abstract class JsonEncoder
{
/// <summary>
/// Returns the index of the first character in <paramref name="text"/> that this encoder
/// wants escaped, or -1 when there is none.
/// </summary>
public abstract int FindFirstCharacterToEncode(Span<char> text);
/// <summary>
/// Returns the escape style preferences to apply when <paramref name="unicodeScalar"/> is escaped.
/// </summary>
public abstract JsonEncodeStyle EncodeUnicodeScalar(int unicodeScalar);
/// <summary>
/// Returns true when this encoder wants <paramref name="unicodeScalar"/> to be escaped.
/// </summary>
public abstract bool WillEncode(int unicodeScalar);
}
public struct JsonEncodeStyle
{
/// <summary>
/// Creates a style with both preferences turned off.
/// </summary>
/// <remarks>
/// Chains to the two-argument constructor so every field is explicitly assigned;
/// C# 10 rejects an explicit parameterless struct constructor that leaves fields unassigned.
/// </remarks>
public JsonEncodeStyle()
    : this(preferTwoCharacterEscape: false, preferUppercase: false)
{
}
/// <summary>
/// Creates a style with the given preferences.
/// </summary>
/// <param name="preferTwoCharacterEscape">Value stored in <see cref="PreferTwoCharacterEscape"/>.</param>
/// <param name="preferUppercase">Value stored in <see cref="PreferUppercase"/>.</param>
public JsonEncodeStyle(bool preferTwoCharacterEscape, bool preferUppercase)
{
    PreferTwoCharacterEscape = preferTwoCharacterEscape;
    PreferUppercase = preferUppercase;
}
/// <summary>
/// When true, the two-character escape form (for example <c>\n</c>) is preferred over
/// <c>\uXXXX</c> for characters that have one.
/// </summary>
public readonly bool PreferTwoCharacterEscape;
/// <summary>
/// When true, hexadecimal digits in <c>\uXXXX</c> escapes are written as A-F rather than a-f.
/// </summary>
public readonly bool PreferUppercase;
} To integrate with the existing JavaScriptEncoder, code like this could be used, though ideally the JsonSerializer could also accept the new encoder type directly (perhaps Encoder = vs JsonEncoder = on JsonSerializerOptions, though that's a separate question): public class JavaScriptJsonEncoderAdapter : JavaScriptEncoder
{
const int unicodeEscapeLength = 6; // "\\u0000".Length
readonly JsonEncoder inner;
/// <summary>
/// Wraps <paramref name="inner"/> so it can be used wherever a JavaScriptEncoder is expected.
/// </summary>
/// <param name="inner">The JSON encoder whose decisions this adapter applies.</param>
/// <exception cref="ArgumentNullException"><paramref name="inner"/> is null.</exception>
public JavaScriptJsonEncoderAdapter(JsonEncoder inner)
{
    this.inner = inner ?? throw new ArgumentNullException(nameof(inner));
}
public override int MaxOutputCharactersPerInputCharacter => unicodeEscapeLength;
/// <summary>
/// Finds the first character that must be escaped, either because JSON requires it
/// or because the inner encoder asks for it.
/// </summary>
/// <param name="text">Pointer to the first character to examine.</param>
/// <param name="textLength">Number of characters available at <paramref name="text"/>.</param>
/// <returns>The smaller of the two candidate indexes, or -1 when neither source reports one.</returns>
public override unsafe int FindFirstCharacterToEncode(char* text, int textLength)
{
    if (textLength == 0)
    {
        return -1;
    }

    // First position that JSON itself requires to be escaped.
    int mandatoryIndex = -1;
    int position = 0;
    while (position < textLength && mandatoryIndex == -1)
    {
        if (IsAlwaysEscaped(text[position]))
        {
            mandatoryIndex = position;
        }

        position++;
    }

    // First position the wrapped encoder would like escaped.
    int requestedIndex = inner.FindFirstCharacterToEncode(new Span<char>(text, textLength));

    if (requestedIndex == -1)
    {
        return mandatoryIndex;
    }

    if (mandatoryIndex == -1)
    {
        return requestedIndex;
    }

    return mandatoryIndex < requestedIndex ? mandatoryIndex : requestedIndex;
}
/// <summary>
/// Writes the JSON escape sequence for <paramref name="unicodeScalar"/> into <paramref name="buffer"/>,
/// using the style chosen by the inner encoder.
/// </summary>
/// <param name="unicodeScalar">The scalar value to escape.</param>
/// <param name="buffer">Destination for the escaped characters.</param>
/// <param name="bufferLength">Number of characters available at <paramref name="buffer"/>.</param>
/// <param name="numberOfCharactersWritten">Characters written on success; 0 when the buffer is too small.</param>
/// <returns>True when the escape was written; false when <paramref name="buffer"/> is too small.</returns>
public override unsafe bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength,
    out int numberOfCharactersWritten)
{
    // Even the shortest escape (for example \b) needs two characters.
    if (bufferLength < 2)
    {
        numberOfCharactersWritten = 0;
        return false;
    }

    JsonEncodeStyle style = inner.EncodeUnicodeScalar(unicodeScalar);
    Rune scalar = new Rune(unicodeScalar);

    bool shortForm = unicodeScalar < char.MaxValue
        && HasTwoCharacterEscape((char)unicodeScalar)
        && style.PreferTwoCharacterEscape;

    // One \uXXXX group is needed per UTF-16 code unit (two for a surrogate pair).
    int codeUnits = scalar.Utf16SequenceLength;
    int required = shortForm ? 2 : codeUnits * unicodeEscapeLength;
    if (bufferLength < required)
    {
        numberOfCharactersWritten = 0;
        return false;
    }

    Span<char> output = new Span<char>(buffer, bufferLength);
    if (shortForm)
    {
        output[0] = '\\';
        output[1] = GetTwoCharacterEscapeSuffix((char)unicodeScalar);
    }
    else
    {
        bool upper = style.PreferUppercase;
        Span<char> utf16 = stackalloc char[2];
        scalar.EncodeToUtf16(utf16);

        for (int unit = 0; unit < codeUnits; unit++)
        {
            char current = utf16[unit];
            int offset = unit * unicodeEscapeLength;
            output[offset] = '\\';
            output[offset + 1] = 'u';
            output[offset + 2] = ToHexDigit((current >> 12) & 0xf, upper);
            output[offset + 3] = ToHexDigit((current >> 8) & 0xf, upper);
            output[offset + 4] = ToHexDigit((current >> 4) & 0xf, upper);
            output[offset + 5] = ToHexDigit(current & 0xf, upper);
        }
    }

    numberOfCharactersWritten = required;
    return true;
}
/// <summary>
/// Reports whether <paramref name="unicodeScalar"/> will be escaped, either because
/// <see cref="IsAlwaysEscaped"/> says so or because the inner encoder asks for it.
/// </summary>
public override bool WillEncode(int unicodeScalar)
{
    // Mandatory escapes win regardless of what the inner encoder says.
    if (unicodeScalar < char.MaxValue && IsAlwaysEscaped((char)unicodeScalar))
    {
        return true;
    }

    return inner.WillEncode(unicodeScalar);
}
/// <summary>
/// Returns true when <paramref name="value"/> has a two-character escape form in JSON.
/// </summary>
public static bool HasTwoCharacterEscape(char value)
{
    // RFC 8259, Section 7, "char = " BNF
    return value is '"' or '\\' or '/' or '\b' or '\f' or '\n' or '\r' or '\t';
}
/// <summary>
/// Returns true when JSON requires <paramref name="value"/> to be escaped inside a string:
/// the quotation mark, the reverse solidus, and the control characters U+0000 through U+001F.
/// </summary>
public static bool IsAlwaysEscaped(char value)
{
    // RFC 8259, Section 7, first paragraph. The control range is inclusive of U+001F.
    return value == '"' || value == '\\' || value <= '\u001f';
}
/// <summary>
/// Returns the character that follows the backslash in the two-character escape for <paramref name="value"/>.
/// </summary>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="value"/> has no two-character escape.</exception>
static char GetTwoCharacterEscapeSuffix(char value)
{
    // RFC 8259, Section 7, "char = " BNF
    return value switch
    {
        '"' => '"',
        '\\' => '\\',
        '/' => '/',
        '\b' => 'b',
        '\f' => 'f',
        '\n' => 'n',
        '\r' => 'r',
        '\t' => 't',
        _ => throw new ArgumentOutOfRangeException(nameof(value)),
    };
}
/// <summary>
/// Converts a value in the range 0-15 to its hexadecimal digit.
/// </summary>
/// <param name="value">The nibble to convert.</param>
/// <param name="preferUppercase">True to produce A-F, false to produce a-f.</param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="value"/> is outside 0-15.</exception>
static char ToHexDigit(int value, bool preferUppercase)
{
    if (value < 0 || value > 0xf)
    {
        throw new ArgumentOutOfRangeException(nameof(value));
    }

    // Only 0-9 are decimal digits; 10 is the first letter digit.
    if (value < 10)
    {
        return (char)('0' + value);
    }

    char letterBase = preferUppercase ? 'A' : 'a';
    return (char)(letterBase + (value - 0xa));
}
} API Usageusing System.Text;
using System.Text.Encodings.Web;
using System.Text.Json;
/// <summary>
/// Encoder that never asks for any character to be escaped and returns the default style.
/// </summary>
class MinimalJsonEncoder : JsonEncoder
{
    /// <summary>Always reports that nothing needs encoding.</summary>
    public override int FindFirstCharacterToEncode(Span<char> text) => -1;

    /// <summary>Always returns the default style.</summary>
    public override JsonEncodeStyle EncodeUnicodeScalar(int unicodeScalar) => default;

    /// <summary>Always returns false.</summary>
    public override bool WillEncode(int unicodeScalar) => false;
}
/// <summary>
/// Encoder that asks for every character to be escaped and returns the default style.
/// </summary>
class MaximalJsonEncoder : JsonEncoder
{
    /// <summary>Reports index 0 for any non-empty input, and -1 for empty input.</summary>
    public override int FindFirstCharacterToEncode(Span<char> text) => text.Length == 0 ? -1 : 0;

    /// <summary>Always returns the default style.</summary>
    public override JsonEncodeStyle EncodeUnicodeScalar(int unicodeScalar) => default;

    /// <summary>Always returns true.</summary>
    public override bool WillEncode(int unicodeScalar) => true;
}
// Or similarly to choose some in-between set of characters to encode.
class Program
{
/// <summary>
/// Serializes a list of sample strings through the adapter wrapping MinimalJsonEncoder
/// and prints the resulting JSON.
/// </summary>
static void Main()
{
    // Write console output as UTF-8.
    Console.OutputEncoding = Encoding.UTF8;

    List<string> data = new List<string>
    {
        "English",
        "日本語",
        "עברית",
        "ქართული ენა",
        "中文",
        "𑄌𑄋𑄴𑄟𑄳𑄦 𑄃𑄧𑄏𑄛𑄖𑄴",
    };

    JsonSerializerOptions options = new JsonSerializerOptions
    {
        Encoder = new JavaScriptJsonEncoderAdapter(new MinimalJsonEncoder()),
    };

    string actual = JsonSerializer.Serialize(data, options);
    Console.WriteLine(actual);
}
} Alternative DesignsThe adapter above ensures the JSON output is always valid, at the expense of having both FindFirstCharacterToEncode and WillEncode both do some extra processing, to determine if a character must be encoded in order for the JSON to be valid. A less-safe but higher-performance alternative would be to require every JsonEncoder implementation to escape the required characters. (So the correct MinimalJsonEncoder usage above would have to encode ", \, and control characters.) RisksNo response
|
Superficially it seems like you could achieve the same result by subclassing |
Yes, this proposal is different from #42847 - that issue is about which code points are in the list to encode (or not). This proposal is about how to implement such choices. This could in theory be achieved with a subclass of JavaScriptEncoder, but that would either seal all the methods inherited from that type or lose the benefits of having a strategy pattern just for the decisions that are valid in the JSON space. |
Closing in favor of #87153. |
Background and motivation
Encoding JSON today is unnecessarily complicated in multiple ways:
A better use of the Strategy Pattern here allows a JSON encoder to decide only within the range of actual valid options for JSON encoding - whether to encode a character, and, if so, which of a limited number of valid encoding variants to use.
API Proposal
To integrate with the existing JavaScriptEncoder, code like this could be used, though ideally the JsonSerializer could also accept the new encoder type directly (perhaps Encoder = vs JsonEncoder = on JsonSerializerOptions, though that's a separate question):
API Usage
Alternative Designs
The adapter above ensures the JSON output is always valid, at the expense of having FindFirstCharacterToEncode and WillEncode both do some extra processing to determine whether a character must be encoded for the JSON to be valid.
A less-safe but higher-performance alternative would be to require every JsonEncoder implementation to escape the required characters. (So the correct MinimalJsonEncoder usage above would have to encode ", \, and control characters.)
Risks
No response
The text was updated successfully, but these errors were encountered: