/
CharacterClassConstructor.cpp
257 lines (228 loc) · 11.1 KB
/
CharacterClassConstructor.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
/*
* Copyright (C) 2008, 2009 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "CharacterClassConstructor.h"
#if ENABLE(WREC)
#include "pcre_internal.h"
#include <wtf/ASCIICType.h>
using namespace WTF;
namespace JSC { namespace WREC {
void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch)
{
unsigned pos = 0;
unsigned range = matches.size();
// binary chop, find position to insert char.
while (range) {
unsigned index = range >> 1;
int val = matches[pos+index] - ch;
if (!val)
return;
else if (val > 0)
range = index;
else {
pos += (index+1);
range -= (index+1);
}
}
if (pos == matches.size())
matches.append(ch);
else
matches.insert(pos, ch);
}
void CharacterClassConstructor::addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
{
unsigned end = ranges.size();
// Simple linear scan - I doubt there are that many ranges anyway...
// feel free to fix this with something faster (eg binary chop).
for (unsigned i = 0; i < end; ++i) {
// does the new range fall before the current position in the array
if (hi < ranges[i].begin) {
// optional optimization: concatenate appending ranges? - may not be worthwhile.
if (hi == (ranges[i].begin - 1)) {
ranges[i].begin = lo;
return;
}
CharacterRange r = {lo, hi};
ranges.insert(i, r);
return;
}
// Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
// If the new range start at or before the end of the last range, then the overlap (if it starts one after the
// end of the last range they concatenate, which is just as good.
if (lo <= (ranges[i].end + 1)) {
// found an intersect! we'll replace this entry in the array.
ranges[i].begin = std::min(ranges[i].begin, lo);
ranges[i].end = std::max(ranges[i].end, hi);
// now check if the new range can subsume any subsequent ranges.
unsigned next = i+1;
// each iteration of the loop we will either remove something from the list, or break the loop.
while (next < ranges.size()) {
if (ranges[next].begin <= (ranges[i].end + 1)) {
// the next entry now overlaps / concatenates this one.
ranges[i].end = std::max(ranges[i].end, ranges[next].end);
ranges.remove(next);
} else
break;
}
return;
}
}
// CharacterRange comes after all existing ranges.
CharacterRange r = {lo, hi};
ranges.append(r);
}
void CharacterClassConstructor::put(UChar ch)
{
// Parsing a regular expression like [a-z], we start in an initial empty state:
// ((m_charBuffer == -1) && !m_isPendingDash)
// When buffer the 'a' sice it may be (and is in this case) part of a range:
// ((m_charBuffer != -1) && !m_isPendingDash)
// Having parsed the hyphen we then record that the dash is also pending:
// ((m_charBuffer != -1) && m_isPendingDash)
// The next change will always take us back to the initial state - either because
// a complete range has been parsed (such as [a-z]), or because a flush is forced,
// due to an early end in the regexp ([a-]), or a character class escape being added
// ([a-\s]). The fourth permutation of m_charBuffer and m_isPendingDash is not permitted.
ASSERT(!((m_charBuffer == -1) && m_isPendingDash));
if (m_charBuffer != -1) {
if (m_isPendingDash) {
// EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range.
UChar lo = m_charBuffer;
UChar hi = ch;
// Reset back to the inital state.
m_charBuffer = -1;
m_isPendingDash = false;
// This is an error, detected lazily. Do not proceed.
if (lo > hi) {
m_isUpsideDown = true;
return;
}
if (lo <= 0x7f) {
char asciiLo = lo;
char asciiHi = std::min(hi, (UChar)0x7f);
addSortedRange(m_ranges, lo, asciiHi);
if (m_isCaseInsensitive) {
if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
if ((asciiLo <= 'z') && (asciiHi >= 'a'))
addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
}
}
if (hi >= 0x80) {
UChar unicodeCurr = std::max(lo, (UChar)0x80);
addSortedRange(m_rangesUnicode, unicodeCurr, hi);
if (m_isCaseInsensitive) {
// we're going to scan along, updating the start of the range
while (unicodeCurr <= hi) {
// Spin forwards over any characters that don't have two cases.
for (; jsc_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) {
// if this was the last character in the range, we're done.
if (unicodeCurr == hi)
return;
}
// if we fall through to here, unicodeCurr <= hi & has another case. Get the other case.
UChar rangeStart = unicodeCurr;
UChar otherCurr = jsc_pcre_ucp_othercase(unicodeCurr);
// If unicodeCurr is not yet hi, check the next char in the range. If it also has another case,
// and if it's other case value is one greater then the othercase value for the current last
// character included in the range, we can include next into the range.
while ((unicodeCurr < hi) && (jsc_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) {
// increment unicodeCurr; it points to the end of the range.
// increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value.
++unicodeCurr;
++otherCurr;
}
// otherChar is the last in the range of other case chars, calculate offset to get back to the start.
addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr);
// unicodeCurr has been added, move on to the next char.
++unicodeCurr;
}
}
}
} else if (ch == '-')
// EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range.
m_isPendingDash = true;
else {
// EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we repace the previously buffered char with the 'a'.
flush();
m_charBuffer = ch;
}
} else
// EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character
// (the hyphen not treated as a special character in this case, same handling for any char).
m_charBuffer = ch;
}
// When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range.
// When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate
// array (either ascii or unicode).
// If the pattern is case insensitive we add entries for both cases.
void CharacterClassConstructor::flush()
{
if (m_charBuffer != -1) {
if (m_charBuffer <= 0x7f) {
if (m_isCaseInsensitive && isASCIILower(m_charBuffer))
addSorted(m_matches, toASCIIUpper(m_charBuffer));
addSorted(m_matches, m_charBuffer);
if (m_isCaseInsensitive && isASCIIUpper(m_charBuffer))
addSorted(m_matches, toASCIILower(m_charBuffer));
} else {
addSorted(m_matchesUnicode, m_charBuffer);
if (m_isCaseInsensitive) {
int other = jsc_pcre_ucp_othercase(m_charBuffer);
if (other != -1)
addSorted(m_matchesUnicode, other);
}
}
m_charBuffer = -1;
}
if (m_isPendingDash) {
addSorted(m_matches, '-');
m_isPendingDash = false;
}
}
void CharacterClassConstructor::append(const CharacterClass& other)
{
// [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]).
// Need to check the spec, really, but think this matches PCRE behaviour.
flush();
if (other.numMatches) {
for (size_t i = 0; i < other.numMatches; ++i)
addSorted(m_matches, other.matches[i]);
}
if (other.numRanges) {
for (size_t i = 0; i < other.numRanges; ++i)
addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end);
}
if (other.numMatchesUnicode) {
for (size_t i = 0; i < other.numMatchesUnicode; ++i)
addSorted(m_matchesUnicode, other.matchesUnicode[i]);
}
if (other.numRangesUnicode) {
for (size_t i = 0; i < other.numRangesUnicode; ++i)
addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end);
}
}
} } // namespace JSC::WREC
#endif // ENABLE(WREC)