-
Notifications
You must be signed in to change notification settings - Fork 54
/
nsUnicodeProperties.h
279 lines (222 loc) · 7.91 KB
/
nsUnicodeProperties.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=4 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef NS_UNICODEPROPERTIES_H
#define NS_UNICODEPROPERTIES_H
#include "nsBidiUtils.h"
#include "nsUGenCategory.h"
#include "nsUnicodeScriptCodes.h"
#include "harfbuzz/hb.h"
#include "unicode/uchar.h"
#include "unicode/uscript.h"
// Returns a reference to the nsCharProps2 record for aCh. Only declared in
// this header; the inline accessors below read its mVertOrient and mIdType
// fields.
const nsCharProps2& GetCharProps2(uint32_t aCh);
namespace mozilla {
namespace unicode {
// Lookup table, defined elsewhere, that GetGenCategory() indexes with the
// result of GetGeneralCategory() to obtain the simplified nsUGenCategory.
extern const nsUGenCategory sDetailedToGeneralCategory[];
/* Vertical orientation classes, as returned by GetVerticalOrientation().
   This MUST match the values assigned by genUnicodePropertyData.pl!
   NOTE(review): the U/R/Tu/Tr suffixes presumably mirror the Unicode
   Vertical_Orientation property values -- confirm against the generator. */
enum VerticalOrientation {
VERTICAL_ORIENTATION_U = 0,
VERTICAL_ORIENTATION_R = 1,
VERTICAL_ORIENTATION_Tu = 2,
VERTICAL_ORIENTATION_Tr = 3
};
/* Bracket classification, as returned by GetPairedBracketType(), which casts
   the ICU UCHAR_BIDI_PAIRED_BRACKET_TYPE property value to this enum.
   This MUST match the values assigned by genUnicodePropertyData.pl! */
enum PairedBracketType {
PAIRED_BRACKET_TYPE_NONE = 0,
PAIRED_BRACKET_TYPE_OPEN = 1,
PAIRED_BRACKET_TYPE_CLOSE = 2
};
/* Flags for Unicode security IdentifierType.txt attributes. Only a subset
   of these is currently checked by Gecko, so we only define flags for the
   ones we need. GetIdentifierType() produces these values from the mIdType
   field of a character's nsCharProps2 record. */
enum IdentifierType {
IDTYPE_RESTRICTED = 0,
IDTYPE_ALLOWED = 1,
};
// Result of GetEmojiPresentation(): TextOnly when the character lacks the ICU
// UCHAR_EMOJI property, EmojiDefault when it also has
// UCHAR_EMOJI_PRESENTATION, and TextDefault otherwise.
enum EmojiPresentation { TextOnly = 0, TextDefault = 1, EmojiDefault = 2 };
// Code points of the two variation selectors U+FE0E and U+FE0F.
const uint32_t kVariationSelector15 = 0xFE0E; // text presentation
const uint32_t kVariationSelector16 = 0xFE0F; // emoji presentation
// Lookup table, defined elsewhere, that GetGeneralCategory() indexes with the
// value returned by ICU's u_charType() to obtain a HarfBuzz general category.
extern const hb_unicode_general_category_t sICUtoHBcategory[];
// Returns the result of ICU's u_charMirror() for aCh.
inline uint32_t GetMirroredChar(uint32_t aCh) {
  return static_cast<uint32_t>(u_charMirror(aCh));
}
inline bool HasMirroredChar(uint32_t aCh) { return u_isMirrored(aCh); }
// Returns the combining class of aCh as reported by ICU's
// u_getCombiningClass().
inline uint8_t GetCombiningClass(uint32_t aCh) {
  const uint8_t combiningClass = u_getCombiningClass(aCh);
  return combiningClass;
}
// Returns the general category of aCh: the ICU u_charType() value is used as
// an index into sICUtoHBcategory, and the table entry is returned as a
// uint8_t.
inline uint8_t GetGeneralCategory(uint32_t aCh) {
  const auto icuCategory = u_charType(aCh);
  return sICUtoHBcategory[icuCategory];
}
// Returns the bidi category of aCh: the value of ICU's u_charDirection()
// converted to nsCharType.
inline nsCharType GetBidiCat(uint32_t aCh) {
  return static_cast<nsCharType>(u_charDirection(aCh));
}
inline int8_t GetNumericValue(uint32_t aCh) {
UNumericType type =
UNumericType(u_getIntPropertyValue(aCh, UCHAR_NUMERIC_TYPE));
return type == U_NT_DECIMAL || type == U_NT_DIGIT
? int8_t(u_getNumericValue(aCh))
: -1;
}
// Returns the ICU UCHAR_LINE_BREAK property value of aCh, narrowed to
// uint8_t.
inline uint8_t GetLineBreakClass(uint32_t aCh) {
  return static_cast<uint8_t>(u_getIntPropertyValue(aCh, UCHAR_LINE_BREAK));
}
// Returns the script of aCh as reported by ICU's uscript_getScript(),
// converted to Script. The ICU error status is not inspected.
inline Script GetScriptCode(uint32_t aCh) {
  UErrorCode status = U_ZERO_ERROR;
  const auto icuScript = uscript_getScript(aCh, &status);
  return Script(icuScript);
}
inline bool HasScript(uint32_t aCh, Script aScript) {
return uscript_hasScript(aCh, UScriptCode(aScript));
}
// Builds a tag with HB_TAG from the first four characters of the ICU short
// name of aScriptCode. If ICU supplies no short name, HB_SCRIPT_UNKNOWN is
// returned instead.
inline uint32_t GetScriptTagForCode(Script aScriptCode) {
  const char* shortName = uscript_getShortName(UScriptCode(aScriptCode));
  if (!shortName) {
    // return UNKNOWN script tag (running with older ICU?)
    return HB_SCRIPT_UNKNOWN;
  }
  return HB_TAG(shortName[0], shortName[1], shortName[2], shortName[3]);
}
// Returns the ICU UCHAR_BIDI_PAIRED_BRACKET_TYPE property value of aCh,
// converted to PairedBracketType.
inline PairedBracketType GetPairedBracketType(uint32_t aCh) {
  const auto bracketType =
      u_getIntPropertyValue(aCh, UCHAR_BIDI_PAIRED_BRACKET_TYPE);
  return PairedBracketType(bracketType);
}
// Returns the result of ICU's u_getBidiPairedBracket() for aCh.
inline uint32_t GetPairedBracket(uint32_t aCh) {
  return static_cast<uint32_t>(u_getBidiPairedBracket(aCh));
}
// Returns the result of ICU's u_toupper() for aCh.
inline uint32_t GetUppercase(uint32_t aCh) {
  return static_cast<uint32_t>(u_toupper(aCh));
}
// Returns the result of ICU's u_tolower() for aCh.
inline uint32_t GetLowercase(uint32_t aCh) {
  return static_cast<uint32_t>(u_tolower(aCh));
}
// Maps lowercase characters to titlecase and leaves everything else
// (including uppercase) unchanged.
inline uint32_t GetTitlecaseForLower(uint32_t aCh) {
  if (!u_isULowercase(aCh)) {
    return aCh;
  }
  return u_totitle(aCh);
}
// Maps both uppercase and lowercase characters to titlecase by returning
// ICU's u_totitle() result unconditionally.
inline uint32_t GetTitlecaseForAll(uint32_t aCh) {
  return static_cast<uint32_t>(u_totitle(aCh));
}
// Case-folds aCh with ICU's default folding, except that U+0130 (dotted
// capital I) and U+0131 (dotless small i) both fold to 'i': we want a
// combination of ordinary case-folding rules and Turkish case-folding rules.
inline uint32_t GetFoldedcase(uint32_t aCh) {
  switch (aCh) {
    case 0x0130:
    case 0x0131:
      return 'i';
    default:
      return u_foldCase(aCh, U_FOLD_CASE_DEFAULT);
  }
}
// True when the East Asian Width of aCh is Fullwidth or Halfwidth, or when it
// is Wide and aCh does not have the ICU UCHAR_EMOJI property. False for every
// other width value.
inline bool IsEastAsianWidthFHWexcludingEmoji(uint32_t aCh) {
  const int32_t width = u_getIntPropertyValue(aCh, UCHAR_EAST_ASIAN_WIDTH);
  if (width == U_EA_FULLWIDTH || width == U_EA_HALFWIDTH) {
    return true;
  }
  if (width == U_EA_WIDE) {
    // Wide characters count only when they are not emoji.
    return !u_hasBinaryProperty(aCh, UCHAR_EMOJI);
  }
  return false;
}
// True when the East Asian Width of aCh is Ambiguous, Fullwidth or Wide;
// false for every other width value.
inline bool IsEastAsianWidthAFW(uint32_t aCh) {
  const int32_t width = u_getIntPropertyValue(aCh, UCHAR_EAST_ASIAN_WIDTH);
  return width == U_EA_AMBIGUOUS || width == U_EA_FULLWIDTH ||
         width == U_EA_WIDE;
}
// Reports whether aCh has the ICU UCHAR_DEFAULT_IGNORABLE_CODE_POINT binary
// property.
inline bool IsDefaultIgnorable(uint32_t aCh) {
  const bool ignorable =
      u_hasBinaryProperty(aCh, UCHAR_DEFAULT_IGNORABLE_CODE_POINT);
  return ignorable;
}
// Classifies aCh by its ICU emoji properties: TextOnly without UCHAR_EMOJI,
// EmojiDefault with both UCHAR_EMOJI and UCHAR_EMOJI_PRESENTATION, and
// TextDefault with UCHAR_EMOJI alone.
inline EmojiPresentation GetEmojiPresentation(uint32_t aCh) {
  if (u_hasBinaryProperty(aCh, UCHAR_EMOJI)) {
    return u_hasBinaryProperty(aCh, UCHAR_EMOJI_PRESENTATION) ? EmojiDefault
                                                              : TextDefault;
  }
  return TextOnly;
}
// Returns the simplified general category (as defined in nsUGenCategory) of
// aCh, looked up in sDetailedToGeneralCategory by its detailed category.
inline nsUGenCategory GetGenCategory(uint32_t aCh) {
  const uint8_t detailedCategory = GetGeneralCategory(aCh);
  return sDetailedToGeneralCategory[detailedCategory];
}
// Returns the mVertOrient field of aCh's nsCharProps2 record, converted to
// VerticalOrientation.
inline VerticalOrientation GetVerticalOrientation(uint32_t aCh) {
  const auto& props = GetCharProps2(aCh);
  return VerticalOrientation(props.mVertOrient);
}
// Returns the mIdType field of aCh's nsCharProps2 record, converted to
// IdentifierType.
inline IdentifierType GetIdentifierType(uint32_t aCh) {
  const auto& props = GetCharProps2(aCh);
  return IdentifierType(props.mIdType);
}
// Declared here and defined elsewhere.
// NOTE(review): presumably maps aCh to its fullwidth form (see the
// GetFullWidthInverse contract) -- confirm against the definition.
uint32_t GetFullWidth(uint32_t aCh);
// This is the inverse of GetFullWidth: it guarantees that for every
// code point c, GetFullWidthInverse(GetFullWidth(c)) == c.
// Note that this function is not guaranteed to convert every wide-form
// character to a possible narrow form.
uint32_t GetFullWidthInverse(uint32_t aCh);
// Two-argument form, defined elsewhere. aCategory is the general category of
// aCh; the one-argument overload passes GetGeneralCategory(aCh) for it.
// NOTE(review): presumably reports whether aCh extends a grapheme cluster --
// confirm against the definition.
bool IsClusterExtender(uint32_t aCh, uint8_t aCategory);
inline bool IsClusterExtender(uint32_t aCh) {
return IsClusterExtender(aCh, GetGeneralCategory(aCh));
}
// Forward iterator over a string of char16_t code units. Each call to Next()
// advances the position by one Unicode grapheme cluster.
class ClusterIterator {
 public:
  // aText is the start of the string; aLength is its length in char16_t
  // units (aText + aLength becomes the end limit).
  ClusterIterator(const char16_t* aText, uint32_t aLength)
      : mPos(aText),
        mLimit(aText + aLength)
#ifdef DEBUG
        ,
        mText(aText)
#endif
  {
  }

  // The iterator converts implicitly to its current position.
  operator const char16_t*() const { return mPos; }

  // True once the current position is at or beyond the end limit.
  bool AtEnd() const { return !(mPos < mLimit); }

  // Advances the position; defined out of line.
  void Next();

 private:
  const char16_t* mPos;    // current position
  const char16_t* mLimit;  // aText + aLength
#ifdef DEBUG
  const char16_t* mText;  // start of the string; present in DEBUG builds only
#endif
};
// Count the number of grapheme clusters in the given string.
// Declared here and defined elsewhere.
// NOTE(review): aLength presumably counts char16_t units, as it does for
// ClusterIterator -- confirm against the definition.
uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength);
// Determine whether a character is a "combining diacritic" for the purpose
// of diacritic-insensitive text search. Examples of such characters include
// European accents and Hebrew niqqud, but not Hangul components or Thaana
// vowels, even though Thaana vowels are combining nonspacing marks that could
// be considered diacritics.
// As an exception to strictly following Unicode properties, we exclude the
// Japanese kana voicing marks
// 3099;COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn;8;NSM
// 309A;COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn;8;NSM
// which users report should not be ignored (bug 1624244).
inline bool IsCombiningDiacritic(uint32_t aCh) {
uint8_t cc = u_getCombiningClass(aCh);
return cc != HB_UNICODE_COMBINING_CLASS_NOT_REORDERED &&
cc != HB_UNICODE_COMBINING_CLASS_KANA_VOICING;
}
// Remove diacritics from a character.
// Declared here and defined elsewhere.
uint32_t GetNaked(uint32_t aCh);
// Reverse iterator over a string of char16_t code units. It starts at the end
// of the string, and each call to Next() advances it by one Unicode grapheme
// cluster.
class ClusterReverseIterator {
 public:
  // aText is the start of the string; aLength is its length in char16_t
  // units. The initial position is aText + aLength.
  ClusterReverseIterator(const char16_t* aText, uint32_t aLength)
      : mPos(aText + aLength),
        mLimit(aText) {}

  // The iterator converts implicitly to its current position.
  operator const char16_t*() const { return mPos; }

  // True once the current position is at or before the start of the string.
  bool AtEnd() const { return !(mPos > mLimit); }

  // Advances the position; defined out of line.
  void Next();

 private:
  const char16_t* mPos;    // current position
  const char16_t* mLimit;  // start of the string
};
} // end namespace unicode
} // end namespace mozilla
#endif /* NS_UNICODEPROPERTIES_H */