/
CharUnicodeInfo.cs
515 lines (419 loc) · 22 KB
/
CharUnicodeInfo.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Buffers.Binary;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
using System.Text.Unicode;
namespace System.Globalization
{
/// <summary>
/// This class implements a set of methods for retrieving character type
/// information. Character type information is independent of culture
/// and region.
/// </summary>
public static partial class CharUnicodeInfo
{
// UTF-16 surrogate boundaries: high (lead) surrogates occupy U+D800..U+DBFF and
// low (trail) surrogates occupy U+DC00..U+DFFF.
internal const char HIGH_SURROGATE_START = '\ud800';
internal const char HIGH_SURROGATE_END = '\udbff';
internal const char LOW_SURROGATE_START = '\udc00';
internal const char LOW_SURROGATE_END = '\udfff';
// Width of a surrogate range minus one (0xDBFF - 0xD800). Despite the name, the decoding
// helpers in this file also use it to range-check low surrogates, which span the same width.
internal const int HIGH_SURROGATE_RANGE = 0x3FF;
// NOTE(review): the two offsets below are not referenced in the visible part of this file;
// presumably they are consumed elsewhere - confirm before changing or removing them.
internal const int UNICODE_CATEGORY_OFFSET = 0;
internal const int BIDI_CATEGORY_OFFSET = 1;
// The starting codepoint for Unicode plane 1. Plane 1 contains 0x010000 ~ 0x01ffff.
internal const int UNICODE_PLANE01_START = 0x10000;
/*
* GetBidiCategory
* ===============
* Data derived from https://www.unicode.org/reports/tr9/#Bidirectional_Character_Types. This data
* is encoded in DerivedBidiClass.txt. We map "L" to "strong left-to-right"; and we map "R" and "AL"
* to "strong right-to-left". All other (non-strong) code points are "other" for our purposes.
*/
/// <summary>
/// Returns the strong bidi category of the code point that starts at <paramref name="index"/>
/// in <paramref name="s"/>. A well-formed surrogate pair starting at that position is looked
/// up as a single supplementary code point.
/// </summary>
/// <param name="s">The string to read from; a null value is reported through ThrowHelper.</param>
/// <param name="index">Position of the first UTF-16 code unit; must be inside <paramref name="s"/>.</param>
internal static StrongBidiCategory GetBidiCategory(string s, int index)
{
    if (s is null)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.s);
    }

    // Comparing as unsigned folds the "negative" and "past the end" checks into one test.
    if ((uint)index >= (uint)s.Length)
    {
        ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index);
    }

    uint codePoint = (uint)GetCodePointFromString(s, index);
    return GetBidiCategoryNoBoundsChecks(codePoint);
}
/// <summary>
/// Returns the strong bidi category of the code point that starts at <paramref name="index"/>
/// in the <see cref="StringBuilder"/>. Arguments are only validated by debug assertions.
/// </summary>
internal static StrongBidiCategory GetBidiCategory(StringBuilder s, int index)
{
    Debug.Assert(s != null, "s != null");
    Debug.Assert(index >= 0 && index < s.Length, "index < s.Length");

    // Surrogate decoding per Table 3-5 in the Unicode Standard, Sec. 3.9:
    //   high surrogate = 110110wwwwxxxxxx
    //   low surrogate  = 110111xxxxxxxxxx
    int codePoint = (int)s[index];
    int highBits = codePoint - HIGH_SURROGATE_START; // 000000wwwwxxxxxx when this is a high surrogate

    // A pair can only be formed when another char follows the high surrogate.
    if (index < s.Length - 1 && (uint)highBits <= HIGH_SURROGATE_RANGE)
    {
        int lowBits = (int)s[index + 1] - LOW_SURROGATE_START; // 000000xxxxxxxxxx when this is a low surrogate
        if ((uint)lowBits <= HIGH_SURROGATE_RANGE)
        {
            // Concatenate the two 10-bit payloads and rebase onto plane 1
            // (this is where uuuuu = wwww + 1 comes from).
            codePoint = (highBits << 10) + lowBits + UNICODE_PLANE01_START;
        }
    }

    // An unpaired surrogate falls through with its raw UTF-16 value.
    return GetBidiCategoryNoBoundsChecks((uint)codePoint);
}
/// <summary>
/// Looks up the strong bidi category for <paramref name="codePoint"/> without validating
/// the argument beyond the debug assertions performed by the table-offset helper.
/// </summary>
private static StrongBidiCategory GetBidiCategoryNoBoundsChecks(uint codePoint)
{
    nuint valueOffset = GetCategoryCasingTableOffsetNoBoundsChecks(codePoint);
    ref byte tableStart = ref MemoryMarshal.GetReference(CategoriesValues);
    byte entry = Unsafe.AddByteOffset(ref tableStart, valueOffset);

    // Bits 5 - 6 of each 'CategoriesValues' entry carry the strong bidi information;
    // the mask leaves them in place, so the enum values are expected to be pre-shifted.
    StrongBidiCategory category = (StrongBidiCategory)(entry & 0b_0110_0000);
    Debug.Assert(category == StrongBidiCategory.Other || category == StrongBidiCategory.StrongLeftToRight || category == StrongBidiCategory.StrongRightToLeft, "Unknown StrongBidiCategory value.");
    return category;
}
/*
* GetDecimalDigitValue
* ====================
* Data derived from https://www.unicode.org/reports/tr44/#UnicodeData.txt. If Numeric_Type=Decimal,
* then retrieves the Numeric_Value (0..9) for this code point. If Numeric_Type!=Decimal, returns -1.
* This data is encoded in field 6 of UnicodeData.txt.
*/
/// <summary>
/// Gets the decimal digit value (0..9) of <paramref name="ch"/>, or -1 when the character
/// is not a decimal digit.
/// </summary>
/// <param name="ch">The UTF-16 code unit to look up.</param>
/// <returns>The decimal digit value, or -1.</returns>
public static int GetDecimalDigitValue(char ch) => GetDecimalDigitValueInternalNoBoundsCheck(ch);
/// <summary>
/// Gets the decimal digit value (0..9) of the code point that starts at
/// <paramref name="index"/> in <paramref name="s"/>, or -1 when it is not a decimal digit.
/// A well-formed surrogate pair at that position is treated as one code point.
/// </summary>
/// <param name="s">The string containing the character.</param>
/// <param name="index">Position of the character's first UTF-16 code unit.</param>
/// <returns>The decimal digit value, or -1.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside <paramref name="s"/>.</exception>
public static int GetDecimalDigitValue(string s, int index)
{
    if (s is null)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.s);
    }

    // One unsigned comparison covers both index < 0 and index >= s.Length.
    if ((uint)index >= (uint)s.Length)
    {
        ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index);
    }

    uint codePoint = (uint)GetCodePointFromString(s, index);
    return GetDecimalDigitValueInternalNoBoundsCheck(codePoint);
}
/// <summary>
/// Reads the decimal digit value for <paramref name="codePoint"/> from the high nibble of
/// its 'DigitValues' entry. No argument validation is performed here.
/// </summary>
private static int GetDecimalDigitValueInternalNoBoundsCheck(uint codePoint)
{
    nuint valueOffset = GetNumericGraphemeTableOffsetNoBoundsChecks(codePoint);
    ref byte digitTable = ref MemoryMarshal.GetReference(DigitValues);
    int packed = Unsafe.AddByteOffset(ref digitTable, valueOffset);

    // The high nibble is stored biased by one, so a stored 0 ("not a decimal digit value")
    // is normalized to -1 by the subtraction.
    return (packed >> 4) - 1;
}
/*
* GetDigitValue
* =============
* Data derived from https://www.unicode.org/reports/tr44/#UnicodeData.txt. If Numeric_Type=Decimal
* or Numeric_Type=Digit, then retrieves the Numeric_Value (0..9) for this code point. Otherwise
* returns -1. This data is encoded in field 7 of UnicodeData.txt.
*/
/// <summary>
/// Gets the digit value (0..9) of <paramref name="ch"/>, or -1 when the character has no
/// digit value.
/// </summary>
/// <param name="ch">The UTF-16 code unit to look up.</param>
/// <returns>The digit value, or -1.</returns>
public static int GetDigitValue(char ch) => GetDigitValueInternalNoBoundsCheck(ch);
/// <summary>
/// Gets the digit value (0..9) of the code point that starts at <paramref name="index"/>
/// in <paramref name="s"/>, or -1 when it has no digit value. A well-formed surrogate pair
/// at that position is treated as one code point.
/// </summary>
/// <param name="s">The string containing the character.</param>
/// <param name="index">Position of the character's first UTF-16 code unit.</param>
/// <returns>The digit value, or -1.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside <paramref name="s"/>.</exception>
public static int GetDigitValue(string s, int index)
{
    if (s is null)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.s);
    }

    // One unsigned comparison covers both index < 0 and index >= s.Length.
    if ((uint)index >= (uint)s.Length)
    {
        ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index);
    }

    uint codePoint = (uint)GetCodePointFromString(s, index);
    return GetDigitValueInternalNoBoundsCheck(codePoint);
}
/// <summary>
/// Reads the digit value for <paramref name="codePoint"/> from the low nibble of its
/// 'DigitValues' entry. No argument validation is performed here.
/// </summary>
private static int GetDigitValueInternalNoBoundsCheck(uint codePoint)
{
    nuint valueOffset = GetNumericGraphemeTableOffsetNoBoundsChecks(codePoint);
    ref byte digitTable = ref MemoryMarshal.GetReference(DigitValues);
    byte packed = Unsafe.AddByteOffset(ref digitTable, valueOffset);

    // The low nibble is stored biased by one, so a stored 0 ("not a digit value")
    // is normalized to -1 by the subtraction.
    int lowNibble = packed & 0xF;
    return lowNibble - 1;
}
/*
* GetGraphemeClusterBreakType
* ===========================
* Data derived from https://unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table. Represents
* grapheme cluster boundary information for the given code point.
*/
/// <summary>
/// Returns the grapheme cluster break type stored for <paramref name="rune"/> in the
/// 'GraphemeSegmentationValues' table.
/// </summary>
internal static GraphemeClusterBreakType GetGraphemeClusterBreakType(Rune rune)
{
    uint codePoint = (uint)rune.Value;
    nuint valueOffset = GetNumericGraphemeTableOffsetNoBoundsChecks(codePoint);

    // Each table entry is a single byte that is reinterpreted as the enum value.
    ref byte tableStart = ref MemoryMarshal.GetReference(GraphemeSegmentationValues);
    byte entry = Unsafe.AddByteOffset(ref tableStart, valueOffset);
    return (GraphemeClusterBreakType)entry;
}
/*
* GetIsWhiteSpace
* ===========================
* Data derived from https://unicode.org/reports/tr44/#White_Space. Represents whether a code point
* is listed as White_Space per PropList.txt.
*/
/// <summary>
/// Reports whether <paramref name="ch"/> is flagged as white space in the
/// 'CategoriesValues' table.
/// </summary>
internal static bool GetIsWhiteSpace(char ch)
{
    // There is no (string, int) overload because all current white space chars are in the BMP.
    nuint valueOffset = GetCategoryCasingTableOffsetNoBoundsChecks(ch);
    ref byte tableStart = ref MemoryMarshal.GetReference(CategoriesValues);

    // The white space flag is the high bit of the entry; viewing the byte as signed
    // turns "high bit set" into "negative".
    sbyte signedEntry = (sbyte)Unsafe.AddByteOffset(ref tableStart, valueOffset);
    return signedEntry < 0;
}
/*
* GetNumericValue
* ===============
* Data derived from https://www.unicode.org/reports/tr44/#UnicodeData.txt. If Numeric_Type=Decimal
* or Numeric_Type=Digit or Numeric_Type=Numeric, then retrieves the Numeric_Value for this code point.
* Otherwise returns -1. This data is encoded in field 8 of UnicodeData.txt.
*/
/// <summary>
/// Gets the numeric value associated with <paramref name="ch"/>, or -1 when the character
/// has no numeric value.
/// </summary>
/// <param name="ch">The UTF-16 code unit to look up.</param>
/// <returns>The numeric value, or -1.</returns>
public static double GetNumericValue(char ch) => GetNumericValueNoBoundsCheck(ch);
/// <summary>
/// Gets the numeric value associated with <paramref name="codePoint"/>, or -1 when it has
/// no numeric value. Values rejected by UnicodeUtility.IsValidCodePoint are reported
/// through ThrowHelper as an out-of-range argument.
/// </summary>
internal static double GetNumericValue(int codePoint)
{
    uint value = (uint)codePoint;
    if (!UnicodeUtility.IsValidCodePoint(value))
    {
        ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.codePoint);
    }

    return GetNumericValueNoBoundsCheck(value);
}
/// <summary>
/// Gets the numeric value of the code point that starts at <paramref name="index"/> in
/// <paramref name="s"/>, or -1 when it has no numeric value. A well-formed surrogate pair
/// at that position is treated as one code point.
/// </summary>
/// <param name="s">The string containing the character.</param>
/// <param name="index">Position of the character's first UTF-16 code unit.</param>
/// <returns>The numeric value, or -1.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside <paramref name="s"/>.</exception>
public static double GetNumericValue(string s, int index)
{
    if (s is null)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.s);
    }

    // One unsigned comparison covers both index < 0 and index >= s.Length.
    if ((uint)index >= (uint)s.Length)
    {
        ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index);
    }

    double numericValue = GetNumericValueInternal(s, index);
    return numericValue;
}
/// <summary>
/// Same lookup as <see cref="GetNumericValue(string, int)"/> but without the argument
/// checks; the caller is responsible for passing a non-null string and an in-range index.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static double GetNumericValueInternal(string s, int index)
{
    uint codePoint = (uint)GetCodePointFromString(s, index);
    return GetNumericValueNoBoundsCheck(codePoint);
}
/// <summary>
/// Reads the numeric value for <paramref name="codePoint"/> from the 'NumericValues'
/// table, which stores one little-endian 64-bit double per entry.
/// </summary>
private static double GetNumericValueNoBoundsCheck(uint codePoint)
{
    nuint entryIndex = GetNumericGraphemeTableOffsetNoBoundsChecks(codePoint);

    // Scale the entry index to a byte offset.
    ref byte tableStart = ref MemoryMarshal.GetReference(NumericValues);
    ref byte entry = ref Unsafe.AddByteOffset(ref tableStart, entryIndex * 8 /* sizeof(double) */);

    if (!BitConverter.IsLittleEndian)
    {
        // Big-endian host: read the raw bits, swap the bytes, then reinterpret as a double.
        ulong swappedBits = BinaryPrimitives.ReverseEndianness(Unsafe.ReadUnaligned<ulong>(ref entry));
        return BitConverter.UInt64BitsToDouble(swappedBits);
    }

    // Little-endian host: the stored bytes already match the in-memory layout.
    return Unsafe.ReadUnaligned<double>(ref entry);
}
/// <summary>
/// Maps a UTF-16 code unit to upper case by adding the signed 16-bit delta stored for it
/// in the 'UppercaseValues' table.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static char ToUpper(char codePoint)
{
    nuint entryIndex = GetCategoryCasingTableOffsetNoBoundsChecks((uint)codePoint);

    // The table is addressed in 16-bit units, so view the bytes as shorts and index
    // by entry rather than by byte.
    ref byte rawTable = ref MemoryMarshal.GetReference(UppercaseValues);
    short delta = Unsafe.Add(ref Unsafe.As<byte, short>(ref rawTable), (nint)entryIndex);
    if (!BitConverter.IsLittleEndian)
    {
        // The table data is little-endian; swap on big-endian hosts.
        delta = BinaryPrimitives.ReverseEndianness(delta);
    }

    return (char)(codePoint + delta);
}
/// <summary>
/// Maps a code point to upper case using the 16-bit delta stored for it in the
/// 'UppercaseValues' table. Only the low 16 bits of the code point are altered.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static uint ToUpper(uint codePoint)
{
    if (!UnicodeUtility.IsValidCodePoint(codePoint))
    {
        ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.codePoint);
    }

    nuint entryIndex = GetCategoryCasingTableOffsetNoBoundsChecks(codePoint);

    // Deltas are stored as 16-bit values because the cased form is expected to live in
    // the same plane as the input.
    ref byte rawTable = ref MemoryMarshal.GetReference(UppercaseValues);
    ushort delta = Unsafe.Add(ref Unsafe.As<byte, ushort>(ref rawTable), (nint)entryIndex);
    if (!BitConverter.IsLittleEndian)
    {
        // The table data is little-endian; swap on big-endian hosts.
        delta = BinaryPrimitives.ReverseEndianness(delta);
    }

    // Keep the plane bits (mask 0xFFFF0000u) untouched and apply the delta modulo 2^16
    // to the low half only.
    uint planeBits = codePoint & 0xFFFF0000u;
    ushort casedLowBits = (ushort)(codePoint + delta);
    return planeBits | casedLowBits;
}
/// <summary>
/// Maps a UTF-16 code unit to lower case by adding the signed 16-bit delta stored for it
/// in the 'LowercaseValues' table.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static char ToLower(char codePoint)
{
    nuint entryIndex = GetCategoryCasingTableOffsetNoBoundsChecks((uint)codePoint);

    // The table is addressed in 16-bit units, so view the bytes as shorts and index
    // by entry rather than by byte.
    ref byte rawTable = ref MemoryMarshal.GetReference(LowercaseValues);
    short delta = Unsafe.Add(ref Unsafe.As<byte, short>(ref rawTable), (nint)entryIndex);
    if (!BitConverter.IsLittleEndian)
    {
        // The table data is little-endian; swap on big-endian hosts.
        delta = BinaryPrimitives.ReverseEndianness(delta);
    }

    return (char)(codePoint + delta);
}
/// <summary>
/// Maps a code point to lower case using the 16-bit delta stored for it in the
/// 'LowercaseValues' table. Only the low 16 bits of the code point are altered.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static uint ToLower(uint codePoint)
{
    if (!UnicodeUtility.IsValidCodePoint(codePoint))
    {
        ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.codePoint);
    }

    nuint entryIndex = GetCategoryCasingTableOffsetNoBoundsChecks(codePoint);

    // Deltas are stored as 16-bit values because the cased form is expected to live in
    // the same plane as the input.
    ref byte rawTable = ref MemoryMarshal.GetReference(LowercaseValues);
    ushort delta = Unsafe.Add(ref Unsafe.As<byte, ushort>(ref rawTable), (nint)entryIndex);
    if (!BitConverter.IsLittleEndian)
    {
        // The table data is little-endian; swap on big-endian hosts.
        delta = BinaryPrimitives.ReverseEndianness(delta);
    }

    // Keep the plane bits (mask 0xFFFF0000u) untouched and apply the delta modulo 2^16
    // to the low half only.
    uint planeBits = codePoint & 0xFFFF0000u;
    ushort casedLowBits = (ushort)(codePoint + delta);
    return planeBits | casedLowBits;
}
/*
* GetUnicodeCategory
* ==================
* Data derived from https://www.unicode.org/reports/tr44/#UnicodeData.txt. Returns the
* General_Category of this code point as encoded in field 2 of UnicodeData.txt, or "Cn"
* if the code point has not been assigned.
*/
/// <summary>
/// Gets the Unicode general category of <paramref name="ch"/>.
/// </summary>
/// <param name="ch">The UTF-16 code unit to look up.</param>
/// <returns>The category stored for the character in the 'CategoriesValues' table.</returns>
public static UnicodeCategory GetUnicodeCategory(char ch) => GetUnicodeCategoryNoBoundsChecks(ch);
/// <summary>
/// Gets the Unicode general category of <paramref name="codePoint"/>.
/// </summary>
/// <param name="codePoint">The code point to look up.</param>
/// <returns>The category stored for the code point in the 'CategoriesValues' table.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="codePoint"/> is rejected by UnicodeUtility.IsValidCodePoint.
/// </exception>
public static UnicodeCategory GetUnicodeCategory(int codePoint)
{
    // Reinterpreting as unsigned makes negative inputs fail the validity check too.
    uint value = (uint)codePoint;
    if (!UnicodeUtility.IsValidCodePoint(value))
    {
        ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.codePoint);
    }

    return GetUnicodeCategoryNoBoundsChecks(value);
}
/// <summary>
/// Gets the Unicode general category of the code point that starts at
/// <paramref name="index"/> in <paramref name="s"/>. A well-formed surrogate pair at that
/// position is treated as one code point.
/// </summary>
/// <param name="s">The string containing the character.</param>
/// <param name="index">Position of the character's first UTF-16 code unit.</param>
/// <returns>The category stored for the code point in the 'CategoriesValues' table.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside <paramref name="s"/>.</exception>
public static UnicodeCategory GetUnicodeCategory(string s, int index)
{
    if (s is null)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.s);
    }

    // One unsigned comparison covers both index < 0 and index >= s.Length.
    if ((uint)index >= (uint)s.Length)
    {
        ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index);
    }

    UnicodeCategory category = GetUnicodeCategoryInternal(s, index);
    return category;
}
/// <summary>
/// Unchecked counterpart of <see cref="GetUnicodeCategory(string, int)"/>: the arguments
/// are verified only by debug assertions. For internal use only.
/// </summary>
internal static UnicodeCategory GetUnicodeCategoryInternal(string value, int index)
{
    Debug.Assert(value != null, "value can not be null");
    Debug.Assert(index < value.Length, "index < value.Length");

    // Decode a surrogate pair (if any) before consulting the category table.
    uint codePoint = (uint)GetCodePointFromString(value, index);
    return GetUnicodeCategoryNoBoundsChecks(codePoint);
}
/// <summary>
/// Gets the Unicode category of the character that begins at <paramref name="index"/> and
/// reports how many UTF-16 code units it occupies: <paramref name="charLength"/> is 1 for a
/// BMP character and 2 for a valid surrogate pair.
/// </summary>
internal static UnicodeCategory GetUnicodeCategoryInternal(string str, int index, out int charLength)
{
    Debug.Assert(str != null, "str can not be null");
    Debug.Assert(str.Length > 0, "str.Length > 0");
    Debug.Assert(index >= 0 && index < str.Length, "index >= 0 && index < str.Length");

    uint codePoint = (uint)GetCodePointFromString(str, index);
    UnicodeDebug.AssertIsValidCodePoint(codePoint);

    // A decoded value at or above plane 1 can only have come from a surrogate pair.
    if (codePoint >= UNICODE_PLANE01_START)
    {
        charLength = 2; /* surrogate pair */
    }
    else
    {
        charLength = 1; /* BMP char */
    }

    return GetUnicodeCategoryNoBoundsChecks(codePoint);
}
/// <summary>
/// Looks up the Unicode category for <paramref name="codePoint"/> in the
/// 'CategoriesValues' table without validating the argument.
/// </summary>
private static UnicodeCategory GetUnicodeCategoryNoBoundsChecks(uint codePoint)
{
    nuint valueOffset = GetCategoryCasingTableOffsetNoBoundsChecks(codePoint);
    ref byte tableStart = ref MemoryMarshal.GetReference(CategoriesValues);
    byte entry = Unsafe.AddByteOffset(ref tableStart, valueOffset);

    // The UnicodeCategory lives in the low 5 bits of each entry; the remaining bits
    // hold other per-code-point flags and are masked off.
    return (UnicodeCategory)(entry & 0x1F);
}
/*
* HELPER AND TABLE LOOKUP ROUTINES
*/
/// <summary>
/// Decodes the code point that starts at <paramref name="index"/>, combining a high
/// surrogate with the low surrogate that follows it when both are present.
/// Comparable to char.ConvertToUTF32, except that malformed surrogate sequences
/// do not cause an exception.
///
/// WARNING: because nothing is thrown, the result CAN be a lone surrogate value
/// in the range D800-DFFF, which is not a legal scalar value.
/// </summary>
private static int GetCodePointFromString(string s, int index)
{
    Debug.Assert(s != null, "s != null");
    Debug.Assert((uint)index < (uint)s.Length, "index < s.Length");

    // This guard is never expected to fire (see the assertion above); the original
    // code keeps the range test so the JIT can optimize the codegen of this method.
    if ((uint)index >= (uint)s.Length)
    {
        return 0;
    }

    int first = s[index];
    int highBits = first - HIGH_SURROGATE_START;
    if ((uint)highBits > HIGH_SURROGATE_RANGE)
    {
        // Not a high surrogate: the char is the code point.
        return first;
    }

    int nextIndex = index + 1;
    if ((uint)nextIndex >= (uint)s.Length)
    {
        // High surrogate at the very end of the string: return it unpaired.
        return first;
    }

    int lowBits = s[nextIndex] - LOW_SURROGATE_START;
    if ((uint)lowBits > HIGH_SURROGATE_RANGE)
    {
        // High surrogate followed by something other than a low surrogate.
        return first;
    }

    // Combine these surrogate code points into a supplementary code point.
    return (highBits << 10) + lowBits + UNICODE_PLANE01_START;
}
/// <summary>
/// Walks the three-level "CategoryCasing" index to find where the data for
/// <paramref name="codePoint"/> is stored in the associated value tables. The result is
/// used for Unicode category, bidi, white space and casing lookups.
/// </summary>
private static nuint GetCategoryCasingTableOffsetNoBoundsChecks(uint codePoint)
{
    UnicodeDebug.AssertIsValidCodePoint(codePoint);

    // The bit slicing below hard-codes an 11:5:4 split of the code point.
    AssertCategoryCasingTableLevels(11, 5, 4);

    // Level 1: a byte selected by the high 11 bits of the code point.
    ref byte level1Table = ref MemoryMarshal.GetReference(CategoryCasingLevel1Index);
    uint level1Entry = Unsafe.AddByteOffset(ref level1Table, codePoint >> 9);

    // Level 2: the next 5 bits pick a 16-bit entry within the row chosen by level 1.
    // Shifting by 3 and masking with 0b_0011_1110 yields those 5 bits already doubled
    // into a byte offset; the row stride of 64 bytes (<< 6) matches 32 two-byte entries.
    uint level2ByteOffset = (level1Entry << 6) + ((codePoint >> 3) & 0b_0011_1110);
    ref byte level2Table = ref MemoryMarshal.GetReference(CategoryCasingLevel2Index);
    ref byte level2Slot = ref Unsafe.AddByteOffset(ref level2Table, level2ByteOffset);

    ushort level2Entry = Unsafe.ReadUnaligned<ushort>(ref level2Slot);
    if (!BitConverter.IsLittleEndian)
    {
        // The stored entry is little-endian; swap on big-endian hosts.
        level2Entry = BinaryPrimitives.ReverseEndianness(level2Entry);
    }

    // Level 3: the low 4 bits pick the byte within the 16-byte block chosen by level 2.
    // That byte is the offset into the values tables.
    uint level3ByteOffset = ((uint)level2Entry << 4) + (codePoint & 0x0F);
    ref byte level3Table = ref MemoryMarshal.GetReference(CategoryCasingLevel3Index);
    return Unsafe.AddByteOffset(ref level3Table, level3ByteOffset);
}
/// <summary>
/// Walks the three-level "NumericGrapheme" index to find where the data for
/// <paramref name="codePoint"/> is stored in the associated value tables. The result is
/// used for numeric and grapheme boundary lookups.
/// </summary>
private static nuint GetNumericGraphemeTableOffsetNoBoundsChecks(uint codePoint)
{
    UnicodeDebug.AssertIsValidCodePoint(codePoint);

    // The bit slicing below hard-codes an 11:5:4 split of the code point.
    AssertNumericGraphemeTableLevels(11, 5, 4);

    // Level 1: a byte selected by the high 11 bits of the code point.
    ref byte level1Table = ref MemoryMarshal.GetReference(NumericGraphemeLevel1Index);
    uint level1Entry = Unsafe.AddByteOffset(ref level1Table, codePoint >> 9);

    // Level 2: the next 5 bits pick a 16-bit entry within the row chosen by level 1.
    // Shifting by 3 and masking with 0b_0011_1110 yields those 5 bits already doubled
    // into a byte offset; the row stride of 64 bytes (<< 6) matches 32 two-byte entries.
    uint level2ByteOffset = (level1Entry << 6) + ((codePoint >> 3) & 0b_0011_1110);
    ref byte level2Table = ref MemoryMarshal.GetReference(NumericGraphemeLevel2Index);
    ref byte level2Slot = ref Unsafe.AddByteOffset(ref level2Table, level2ByteOffset);

    ushort level2Entry = Unsafe.ReadUnaligned<ushort>(ref level2Slot);
    if (!BitConverter.IsLittleEndian)
    {
        // The stored entry is little-endian; swap on big-endian hosts.
        level2Entry = BinaryPrimitives.ReverseEndianness(level2Entry);
    }

    // Level 3: the low 4 bits pick the byte within the 16-byte block chosen by level 2.
    // That byte is the offset into the values tables.
    uint level3ByteOffset = ((uint)level2Entry << 4) + (codePoint & 0x0F);
    ref byte level3Table = ref MemoryMarshal.GetReference(NumericGraphemeLevel3Index);
    return Unsafe.AddByteOffset(ref level3Table, level3ByteOffset);
}
}
}