-
Notifications
You must be signed in to change notification settings - Fork 4.6k
/
pal_casing.m
183 lines (169 loc) · 7.41 KB
/
pal_casing.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
#include "pal_icushim_internal.h"
#include "pal_casing.h"
#include "pal_errors.h"
#import <Foundation/Foundation.h>
#if !__has_feature(objc_arc)
#error This file relies on ARC for memory management, but ARC is not enabled.
#endif
#if defined(TARGET_MACCATALYST) || defined(TARGET_IOS) || defined(TARGET_TVOS)
/**
 * Tests whether a UTF-16 code unit is a lead (high) surrogate, i.e. lies in U+D800..U+DBFF.
 * Everything above the low ten bits is compared against the lead-surrogate prefix;
 * the argument is evaluated exactly once.
 * @param c 16-bit code unit
 * @return non-zero when c is a lead surrogate, 0 otherwise
 */
#define IS_LEAD(c) (0xd800 == (0xfffffc00 & (c)))
/**
 * Tests whether a UTF-16 code unit is a trail (low) surrogate, i.e. lies in U+DC00..U+DFFF.
 * Everything above the low ten bits is compared against the trail-surrogate prefix;
 * the argument is evaluated exactly once.
 * @param c 16-bit code unit
 * @return non-zero when c is a trail surrogate, 0 otherwise
 */
#define IS_TRAIL(c) (0xdc00 == (0xfffffc00 & (c)))
/**
 * Advance a string offset from one code point boundary to the next.
 * (Post-incrementing forward iteration; the code point itself is not returned.)
 * "Safe" macro: handles unpaired surrogates and checks the string boundary
 * before looking at a possible trail surrogate.
 *
 * If the offset points to a lead surrogate that is immediately followed by a
 * trail surrogate, the offset is advanced past both code units so that the whole
 * supplementary code point can be cased together.
 * In every other case (a BMP code unit, a trail surrogate, or a lead surrogate that is
 * unpaired or is the last unit of the string) the offset is advanced by exactly one code unit.
 *
 * The end-of-string test uses `!=`, so a negative length never matches a valid offset;
 * that is only safe when the string is NUL-terminated.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single statement
 * (e.g. as the unbraced branch of an if/else); it must be followed by a semicolon.
 *
 * @param s const uint16_t* string
 * @param i in/out string offset (modifiable lvalue), must be i<length on entry
 * @param length string length
 */
#define NEXTOFFSET(s, i, length) do { \
    const uint16_t nextOffsetUnit_ = (s)[(i)++]; \
    if (IS_LEAD(nextOffsetUnit_) && (i) != (length) && IS_TRAIL((s)[(i)])) { \
        ++(i); \
    } \
} while (0)
/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment) by the number of code units written.
 * "Safe" macro: checks the code point range and the remaining capacity.
 * Code points up to U+FFFF are stored as a single code unit without further
 * inspection (so a lone surrogate value is copied through unchanged).
 * Code points outside of the Basic Multilingual Plane are converted into the
 * corresponding surrogate pair, but only if BOTH code units fit in the buffer.
 * High surrogate range: 0xD800 - 0xDBFF
 * Low surrogate range: 0xDC00 - 0xDFFF
 * On failure nothing is written, the offset is left unchanged and isError receives:
 *   InsufficientBuffer - the buffer is full, or a surrogate pair needs two units and only one is free
 *   InvalidCodePoint   - the code point is greater than U+10FFFF
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single statement;
 * it must be followed by a semicolon.
 *
 * @param buffer uint16_t* destination string buffer (written to)
 * @param offset in/out string offset (modifiable lvalue)
 * @param capacity size of the string buffer in code units
 * @param codePoint code point to append
 * @param isError output error code (modifiable lvalue), set only when an error occurs, otherwise not modified
 */
#define Append(buffer, offset, capacity, codePoint, isError) do { \
    if ((offset) >= (capacity)) /* insufficiently sized destination buffer */ { \
        (isError) = InsufficientBuffer; \
    } else if ((uint32_t)(codePoint) > 0x10ffff) /* invalid code point */ { \
        (isError) = InvalidCodePoint; \
    } else if ((uint32_t)(codePoint) <= 0xffff) { \
        (buffer)[(offset)++] = (uint16_t)(codePoint); \
    } else if ((capacity) - (offset) < 2) /* trail surrogate would not fit */ { \
        (isError) = InsufficientBuffer; \
    } else { \
        (buffer)[(offset)++] = (uint16_t)(((codePoint) >> 10) + 0xd7c0); \
        (buffer)[(offset)++] = (uint16_t)(((codePoint)&0x3ff) | 0xdc00); \
    } \
} while (0)
/*
Function:
ChangeCaseNative

Performs upper or lower casing of a string into a new buffer, taking into account the specified locale.
The source is processed one code point at a time (one UTF-16 code unit, or two for a surrogate pair).

Two things we are considering here:
1. Prohibiting length changes. Some code points expand when uppercased or lowercased, which may lead
   to an insufficient destination buffer. The cased text is therefore used only when it occupies exactly
   as many UTF-16 code units as the source code point; otherwise the original code units are copied.
   This also keeps characterAtIndex: from being called past the end of a shorter result.
2. Properly handling surrogate pairs. A code point can be comprised of more than one UTF-16 code unit
   (i.e. surrogate pairs like \uD801\uDC37). All code units of a code point are needed to properly change case.

Parameters:
localeName / lNameLength - UTF-16 locale identifier and its length in code units; when localeName is NULL
                           or the length is not positive, [NSLocale systemLocale] is used
lpSrc / cwSrcLength      - source text and its length in code units
lpDst / cwDstLength      - destination buffer and its capacity in code units
bToUpper                 - non-zero to uppercase, zero to lowercase

Returns 0 for success, non-zero on failure see ErrorCodes.
*/
int32_t GlobalizationNative_ChangeCaseNative(const uint16_t* localeName, int32_t lNameLength,
    const uint16_t* lpSrc, int32_t cwSrcLength, uint16_t* lpDst, int32_t cwDstLength, int32_t bToUpper)
{
    @autoreleasepool
    {
        NSLocale *currentLocale;
        // A negative length would be converted to a huge unsigned length by
        // stringWithCharacters:length:, so treat it like a missing locale name.
        if (localeName == NULL || lNameLength <= 0)
        {
            currentLocale = [NSLocale systemLocale];
        }
        else
        {
            NSString *locName = [NSString stringWithCharacters: localeName length: (NSUInteger)lNameLength];
            currentLocale = [NSLocale localeWithLocaleIdentifier:locName];
        }

        int32_t srcIdx = 0, dstIdx = 0, isError = 0;
        while (srcIdx < cwSrcLength)
        {
            // isolate the next code point: 1 code unit, or 2 for a well-formed surrogate pair
            int32_t startIndex = srcIdx;
            NEXTOFFSET(lpSrc, srcIdx, cwSrcLength);
            int32_t srcLength = srcIdx - startIndex;

            NSString *src = [NSString stringWithCharacters: lpSrc + startIndex length: (NSUInteger)srcLength];
            NSString *dst = bToUpper ? [src uppercaseStringWithLocale:currentLocale] : [src lowercaseStringWithLocale:currentLocale];

            // Decide once per code point: accept the cased text only if its length matches the
            // source, so every index below srcLength is valid for the string we read from.
            NSString *result = (dst.length == (NSUInteger)srcLength) ? dst : src;

            // copy all code units of the (possibly surrogate pair) character
            for (int32_t index = 0; index < srcLength; index++)
            {
                uint16_t dstCodepoint = [result characterAtIndex: (NSUInteger)index];
                Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
            }

            if (isError)
                return isError;
        }
        return Success;
    }
}
/*
Function:
ChangeCaseInvariantNative

Performs upper or lower casing of a string into a new buffer using NSString's
uppercaseString / lowercaseString (no locale is supplied).
The source is processed one code point at a time (one UTF-16 code unit, or two for a surrogate pair).

Two things we are considering here:
1. Prohibiting length changes. Some code points expand when uppercased or lowercased, which may lead
   to an insufficient destination buffer. The cased text is therefore used only when it occupies exactly
   as many UTF-16 code units as the source code point; otherwise the original code units are copied.
   This also keeps characterAtIndex: from being called past the end of a shorter result.
2. Properly handling surrogate pairs. A code point can be comprised of more than one UTF-16 code unit
   (i.e. surrogate pairs like \uD801\uDC37). All code units of a code point are needed to properly change case.

Parameters:
lpSrc / cwSrcLength - source text and its length in code units
lpDst / cwDstLength - destination buffer and its capacity in code units
bToUpper            - non-zero to uppercase, zero to lowercase

Returns 0 for success, non-zero on failure see ErrorCodes.
*/
int32_t GlobalizationNative_ChangeCaseInvariantNative(const uint16_t* lpSrc, int32_t cwSrcLength, uint16_t* lpDst, int32_t cwDstLength, int32_t bToUpper)
{
    @autoreleasepool
    {
        int32_t srcIdx = 0, dstIdx = 0, isError = 0;
        while (srcIdx < cwSrcLength)
        {
            // isolate the next code point: 1 code unit, or 2 for a well-formed surrogate pair
            int32_t startIndex = srcIdx;
            NEXTOFFSET(lpSrc, srcIdx, cwSrcLength);
            int32_t srcLength = srcIdx - startIndex;

            NSString *src = [NSString stringWithCharacters: lpSrc + startIndex length: (NSUInteger)srcLength];
            NSString *dst = bToUpper ? src.uppercaseString : src.lowercaseString;

            // Decide once per code point: accept the cased text only if its length matches the
            // source, so every index below srcLength is valid for the string we read from.
            NSString *result = (dst.length == (NSUInteger)srcLength) ? dst : src;

            // copy all code units of the (possibly surrogate pair) character
            for (int32_t index = 0; index < srcLength; index++)
            {
                uint16_t dstCodepoint = [result characterAtIndex: (NSUInteger)index];
                Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
            }

            if (isError)
                return isError;
        }
        return Success;
    }
}
#endif