-
Notifications
You must be signed in to change notification settings - Fork 0
/
_regex_core.py
4396 lines (3520 loc) · 136 KB
/
_regex_core.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#
# Secret Labs' Regular Expression Engine core module
#
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
#
# This version of the SRE library can be redistributed under CNRI's
# Python 1.6 license. For any other use, please contact Secret Labs
# AB (info@pythonware.com).
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work.
#
# 2010-01-16 mrab Python front-end re-written and extended
import string
import sys
import unicodedata
from collections import defaultdict
import _regex
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
"F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
"POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
"V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
"Scanner"]
# The regex exception.
class error(Exception):
def __init__(self, message, pattern=None, pos=None):
newline = u'\n' if isinstance(pattern, unicode) else '\n'
self.msg = message
self.pattern = pattern
self.pos = pos
if pattern is not None and pos is not None:
self.lineno = pattern.count(newline, 0, pos) + 1
self.colno = pos - pattern.rfind(newline, 0, pos)
message = "%s at position %d" % (message, pos)
if newline in pattern:
message += " (line %d, column %d)" % (self.lineno, self.colno)
Exception.__init__(self, message)
# The exception for when a positional flag has been turned on in the old
# behaviour.
class _UnscopedFlagSet(Exception):
pass
# The exception for when parsing fails and we want to try something else.
class ParseError(Exception):
pass
# The exception for when there isn't a valid first set.
class _FirstSetError(Exception):
pass
# Flags.
A = ASCII = 0x80 # Assume ASCII locale.
B = BESTMATCH = 0x1000 # Best fuzzy match.
D = DEBUG = 0x200 # Print parsed pattern.
E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first
# fuzzy match.
F = FULLCASE = 0x4000 # Unicode full case-folding.
I = IGNORECASE = 0x2 # Ignore case.
L = LOCALE = 0x4 # Assume current 8-bit locale.
M = MULTILINE = 0x8 # Make anchors look for newline.
P = POSIX = 0x10000 # POSIX-style matching (leftmost longest).
R = REVERSE = 0x400 # Search backwards.
S = DOTALL = 0x10 # Make dot match newline.
U = UNICODE = 0x20 # Assume Unicode locale.
V0 = VERSION0 = 0x2000 # Old legacy behaviour.
V1 = VERSION1 = 0x100 # New enhanced behaviour.
W = WORD = 0x800 # Default Unicode word breaks.
X = VERBOSE = 0x40 # Ignore whitespace and comments.
T = TEMPLATE = 0x1 # Template (present because re module has it).
DEFAULT_VERSION = VERSION1
_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE
# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}
# The mask for the flags.
GLOBAL_FLAGS = (_ALL_ENCODINGS | _ALL_VERSIONS | BESTMATCH | DEBUG |
ENHANCEMATCH | POSIX | REVERSE)
SCOPED_FLAGS = FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")
# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8
# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1
# The regular expression flags.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
"i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
"s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
VERBOSE}
# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE
FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
FULLIGNORECASE: FULLIGNORECASE}
# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
# A singleton which indicates a comment within a pattern.
COMMENT = object()
FLAGS = object()
# The names of the opcodes.
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
STRING_SET
STRING_SET_FLD
STRING_SET_FLD_REV
STRING_SET_IGN
STRING_SET_IGN_REV
STRING_SET_REV
"""
# Define the opcodes in a namespace.
class Namespace(object):
pass
OP = Namespace()
for i, op in enumerate(OPCODES.split()):
setattr(OP, op, i)
def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
"""Make room in the given cache.
Args:
cache_dict: The cache dictionary to modify.
args_dict: The dictionary of named list args used by patterns.
max_length: Maximum # of entries in cache_dict before it is shrunk.
divisor: Cache will shrink to max_length - 1/divisor*max_length items.
"""
# Toss out a fraction of the entries at random to make room for new ones.
# A random algorithm was chosen as opposed to simply cache_dict.popitem()
# as popitem could penalize the same regular expression repeatedly based
# on its internal hash value. Being random should spread the cache miss
# love around.
cache_keys = tuple(cache_dict.keys())
overage = len(cache_keys) - max_length
if overage < 0:
# Cache is already within limits. Normally this should not happen
# but it could due to multithreading.
return
number_to_toss = max_length // divisor + overage
# The import is done here to avoid a circular dependency.
import random
if not hasattr(random, 'sample'):
# Do nothing while resolving the circular dependency:
# re->random->warnings->tokenize->string->re
return
for doomed_key in random.sample(cache_keys, number_to_toss):
try:
del cache_dict[doomed_key]
except KeyError:
# Ignore problems if the cache changed from another thread.
pass
# Rebuild the arguments and locale-sensitivity dictionaries.
args_dict.clear()
sensitivity_dict = {}
for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
args_dict[pattern, pattern_type, flags, default_version, locale] = args
try:
sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
except KeyError:
pass
locale_sensitive.clear()
locale_sensitive.update(sensitivity_dict)
def _fold_case(info, string):
"Folds the case of a string."
flags = info.flags
if (flags & _ALL_ENCODINGS) == 0:
flags |= info.guess_encoding
return _regex.fold_case(flags, string)
def is_cased(info, char):
"Checks whether a character is cased."
return len(_regex.get_all_cases(info.flags, char)) > 1
def _compile_firstset(info, fs):
"Compiles the firstset for the pattern."
reverse = bool(info.flags & REVERSE)
fs = _check_firstset(info, reverse, fs)
if not fs:
return []
# Compile the firstset.
return fs.compile(reverse)
def _check_firstset(info, reverse, fs):
"Checks the firstset for the pattern."
if not fs or None in fs:
return None
# If we ignore the case, for simplicity we won't build a firstset.
members = set()
case_flags = NOCASE
for i in fs:
if isinstance(i, Character) and not i.positive:
return None
# if i.case_flags:
# if isinstance(i, Character):
# if is_cased(info, i.value):
# return []
# elif isinstance(i, SetBase):
# return []
case_flags |= i.case_flags
members.add(i.with_flags(case_flags=NOCASE))
if case_flags == (FULLCASE | IGNORECASE):
return None
# Build the firstset.
fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
zerowidth=True)
fs = fs.optimise(info, reverse, in_set=True)
return fs
def _flatten_code(code):
"Flattens the code from a list of tuples."
flat_code = []
for c in code:
flat_code.extend(c)
return flat_code
def make_character(info, value, in_set=False):
"Makes a character literal."
if in_set:
# A character set is built case-sensitively.
return Character(value)
return Character(value, case_flags=info.flags & CASE_FLAGS)
def make_ref_group(info, name, position):
"Makes a group reference."
return RefGroup(info, name, position, case_flags=info.flags & CASE_FLAGS)
def make_string_set(info, name):
"Makes a string set."
return StringSet(info, name, case_flags=info.flags & CASE_FLAGS)
def make_property(info, prop, in_set):
"Makes a property."
if in_set:
return prop
return prop.with_flags(case_flags=info.flags & CASE_FLAGS)
def _parse_pattern(source, info):
"Parses a pattern, eg. 'a|b|c'."
branches = [parse_sequence(source, info)]
while source.match("|"):
branches.append(parse_sequence(source, info))
if len(branches) == 1:
return branches[0]
return Branch(branches)
def parse_sequence(source, info):
"Parses a sequence, eg. 'abc'."
sequence = []
applied = False
while True:
# Get literal characters followed by an element.
characters, case_flags, element = parse_literal_and_element(source,
info)
if not element:
# No element, just a literal. We've also reached the end of the
# sequence.
append_literal(characters, case_flags, sequence)
break
if element is COMMENT or element is FLAGS:
append_literal(characters, case_flags, sequence)
elif type(element) is tuple:
# It looks like we've found a quantifier.
ch, saved_pos = element
counts = parse_quantifier(source, info, ch)
if counts:
# It _is_ a quantifier.
apply_quantifier(source, info, counts, characters, case_flags,
ch, saved_pos, applied, sequence)
applied = True
else:
# It's not a quantifier. Maybe it's a fuzzy constraint.
constraints = parse_fuzzy(source, ch)
if constraints:
# It _is_ a fuzzy constraint.
apply_constraint(source, info, constraints, characters,
case_flags, saved_pos, applied, sequence)
applied = True
else:
# The element was just a literal.
characters.append(ord(ch))
append_literal(characters, case_flags, sequence)
applied = False
else:
# We have a literal followed by something else.
append_literal(characters, case_flags, sequence)
sequence.append(element)
applied = False
return make_sequence(sequence)
def apply_quantifier(source, info, counts, characters, case_flags, ch,
saved_pos, applied, sequence):
if characters:
# The quantifier applies to the last character.
append_literal(characters[ : -1], case_flags, sequence)
element = Character(characters[-1], case_flags=case_flags)
else:
# The quantifier applies to the last item in the sequence.
if applied:
raise error("multiple repeat", source.string, saved_pos)
if not sequence:
raise error("nothing to repeat", source.string, saved_pos)
element = sequence.pop()
min_count, max_count = counts
saved_pos = source.pos
ch = source.get()
if ch == "?":
# The "?" suffix that means it's a lazy repeat.
repeated = LazyRepeat
elif ch == "+":
# The "+" suffix that means it's a possessive repeat.
repeated = PossessiveRepeat
else:
# No suffix means that it's a greedy repeat.
source.pos = saved_pos
repeated = GreedyRepeat
# Ignore the quantifier if it applies to a zero-width item or the number of
# repeats is fixed at 1.
if not element.is_empty() and (min_count != 1 or max_count != 1):
element = repeated(element, min_count, max_count)
sequence.append(element)
def apply_constraint(source, info, constraints, characters, case_flags,
saved_pos, applied, sequence):
if characters:
# The constraint applies to the last character.
append_literal(characters[ : -1], case_flags, sequence)
element = Character(characters[-1], case_flags=case_flags)
sequence.append(Fuzzy(element, constraints))
else:
# The constraint applies to the last item in the sequence.
if applied or not sequence:
raise error("nothing for fuzzy constraint", source.string,
saved_pos)
element = sequence.pop()
# If a group is marked as fuzzy then put all of the fuzzy part in the
# group.
if isinstance(element, Group):
element.subpattern = Fuzzy(element.subpattern, constraints)
sequence.append(element)
else:
sequence.append(Fuzzy(element, constraints))
def append_literal(characters, case_flags, sequence):
if characters:
sequence.append(Literal(characters, case_flags=case_flags))
def PossessiveRepeat(element, min_count, max_count):
"Builds a possessive repeat."
return Atomic(GreedyRepeat(element, min_count, max_count))
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
def parse_quantifier(source, info, ch):
"Parses a quantifier."
q = _QUANTIFIERS.get(ch)
if q:
# It's a quantifier.
return q
if ch == "{":
# Looks like a limited repeated element, eg. 'a{2,3}'.
counts = parse_limited_quantifier(source)
if counts:
return counts
return None
def is_above_limit(count):
"Checks whether a count is above the maximum."
return count is not None and count >= UNLIMITED
def parse_limited_quantifier(source):
"Parses a limited quantifier."
saved_pos = source.pos
min_count = parse_count(source)
if source.match(","):
max_count = parse_count(source)
# No minimum means 0 and no maximum means unlimited.
min_count = int(min_count or 0)
max_count = int(max_count) if max_count else None
if max_count is not None and min_count > max_count:
raise error("min repeat greater than max repeat", source.string,
saved_pos)
else:
if not min_count:
source.pos = saved_pos
return None
min_count = max_count = int(min_count)
if is_above_limit(min_count) or is_above_limit(max_count):
raise error("repeat count too big", source.string, saved_pos)
if not source.match ("}"):
source.pos = saved_pos
return None
return min_count, max_count
def parse_fuzzy(source, ch):
"Parses a fuzzy setting, if present."
if ch != "{":
return None
saved_pos = source.pos
constraints = {}
try:
parse_fuzzy_item(source, constraints)
while source.match(","):
parse_fuzzy_item(source, constraints)
except ParseError:
source.pos = saved_pos
return None
if not source.match("}"):
raise error("expected }", source.string, source.pos)
return constraints
def parse_fuzzy_item(source, constraints):
"Parses a fuzzy setting item."
saved_pos = source.pos
try:
parse_cost_constraint(source, constraints)
except ParseError:
source.pos = saved_pos
parse_cost_equation(source, constraints)
def parse_cost_constraint(source, constraints):
"Parses a cost constraint."
saved_pos = source.pos
ch = source.get()
if ch in ALPHA:
# Syntax: constraint [("<=" | "<") cost]
constraint = parse_constraint(source, constraints, ch)
max_inc = parse_fuzzy_compare(source)
if max_inc is None:
# No maximum cost.
constraints[constraint] = 0, None
else:
# There's a maximum cost.
cost_pos = source.pos
max_cost = parse_cost_limit(source)
# Inclusive or exclusive limit?
if not max_inc:
max_cost -= 1
if max_cost < 0:
raise error("bad fuzzy cost limit", source.string, cost_pos)
constraints[constraint] = 0, max_cost
elif ch in DIGITS:
# Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
source.pos = saved_pos
# Minimum cost.
cost_pos = source.pos
min_cost = parse_cost_limit(source)
min_inc = parse_fuzzy_compare(source)
if min_inc is None:
raise ParseError()
constraint = parse_constraint(source, constraints, source.get())
max_inc = parse_fuzzy_compare(source)
if max_inc is None:
raise ParseError()
# Maximum cost.
cost_pos = source.pos
max_cost = parse_cost_limit(source)
# Inclusive or exclusive limits?
if not min_inc:
min_cost += 1
if not max_inc:
max_cost -= 1
if not 0 <= min_cost <= max_cost:
raise error("bad fuzzy cost limit", source.string, cost_pos)
constraints[constraint] = min_cost, max_cost
else:
raise ParseError()
def parse_cost_limit(source):
"Parses a cost limit."
cost_pos = source.pos
digits = parse_count(source)
try:
return int(digits)
except ValueError:
pass
raise error("bad fuzzy cost limit", source.string, cost_pos)
def parse_constraint(source, constraints, ch):
"Parses a constraint."
if ch not in "deis":
raise error("bad fuzzy constraint", source.string, source.pos)
if ch in constraints:
raise error("repeated fuzzy constraint", source.string, source.pos)
return ch
def parse_fuzzy_compare(source):
"Parses a cost comparator."
if source.match("<="):
return True
elif source.match("<"):
return False
else:
return None
def parse_cost_equation(source, constraints):
"Parses a cost equation."
if "cost" in constraints:
raise error("more than one cost equation", source.string, source.pos)
cost = {}
parse_cost_term(source, cost)
while source.match("+"):
parse_cost_term(source, cost)
max_inc = parse_fuzzy_compare(source)
if max_inc is None:
raise error("missing fuzzy cost limit", source.string, source.pos)
max_cost = int(parse_count(source))
if not max_inc:
max_cost -= 1
if max_cost < 0:
raise error("bad fuzzy cost limit", source.string, source.pos)
cost["max"] = max_cost
constraints["cost"] = cost
def parse_cost_term(source, cost):
"Parses a cost equation term."
coeff = parse_count(source)
ch = source.get()
if ch not in "dis":
raise ParseError()
if ch in cost:
raise error("repeated fuzzy cost", source.string, source.pos)
cost[ch] = int(coeff or 1)
def parse_count(source):
"Parses a quantifier's count, which can be empty."
return source.get_while(DIGITS)
def parse_literal_and_element(source, info):
"""Parses a literal followed by an element. The element is FLAGS if it's an
inline flag or None if it has reached the end of a sequence.
"""
characters = []
case_flags = info.flags & CASE_FLAGS
while True:
saved_pos = source.pos
ch = source.get()
if ch in SPECIAL_CHARS:
if ch in ")|":
# The end of a sequence. At the end of the pattern ch is "".
source.pos = saved_pos
return characters, case_flags, None
elif ch == "\\":
# An escape sequence outside a set.
element = parse_escape(source, info, False)
return characters, case_flags, element
elif ch == "(":
# A parenthesised subpattern or a flag.
element = parse_paren(source, info)
if element and element is not COMMENT:
return characters, case_flags, element
elif ch == ".":
# Any character.
if info.flags & DOTALL:
element = AnyAll()
elif info.flags & WORD:
element = AnyU()
else:
element = Any()
return characters, case_flags, element
elif ch == "[":
# A character set.
element = parse_set(source, info)
return characters, case_flags, element
elif ch == "^":
# The start of a line or the string.
if info.flags & MULTILINE:
if info.flags & WORD:
element = StartOfLineU()
else:
element = StartOfLine()
else:
element = StartOfString()
return characters, case_flags, element
elif ch == "$":
# The end of a line or the string.
if info.flags & MULTILINE:
if info.flags & WORD:
element = EndOfLineU()
else:
element = EndOfLine()
else:
if info.flags & WORD:
element = EndOfStringLineU()
else:
element = EndOfStringLine()
return characters, case_flags, element
elif ch in "?*+{":
# Looks like a quantifier.
return characters, case_flags, (ch, saved_pos)
else:
# A literal.
characters.append(ord(ch))
else:
# A literal.
characters.append(ord(ch))
def parse_paren(source, info):
"""Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
inline flag.
"""
saved_pos = source.pos
ch = source.get()
if ch == "?":
# (?...
saved_pos_2 = source.pos
ch = source.get()
if ch == "<":
# (?<...
saved_pos_3 = source.pos
ch = source.get()
if ch in ("=", "!"):
# (?<=... or (?<!...: lookbehind.
return parse_lookaround(source, info, True, ch == "=")
# (?<...: a named capture group.
source.pos = saved_pos_3
name = parse_name(source)
group = info.open_group(name)
source.expect(">")
saved_flags = info.flags
try:
subpattern = _parse_pattern(source, info)
source.expect(")")
finally:
info.flags = saved_flags
source.ignore_space = bool(info.flags & VERBOSE)
info.close_group()
return Group(info, group, subpattern)
if ch in ("=", "!"):
# (?=... or (?!...: lookahead.
return parse_lookaround(source, info, False, ch == "=")
if ch == "P":
# (?P...: a Python extension.
return parse_extension(source, info)
if ch == "#":
# (?#...: a comment.
return parse_comment(source)
if ch == "(":
# (?(...: a conditional subpattern.
return parse_conditional(source, info)
if ch == ">":
# (?>...: an atomic subpattern.
return parse_atomic(source, info)
if ch == "|":
# (?|...: a common/reset groups branch.
return parse_common(source, info)
if ch == "R" or "0" <= ch <= "9":
# (?R...: probably a call to a group.
return parse_call_group(source, info, ch, saved_pos_2)
if ch == "&":
# (?&...: a call to a named group.
return parse_call_named_group(source, info, saved_pos_2)
# (?...: probably a flags subpattern.
source.pos = saved_pos_2
return parse_flags_subpattern(source, info)
if ch == "*":
# (*...
saved_pos_2 = source.pos
word = source.get_while(set(")>"), include=False)
if word[ : 1].isalpha():
verb = VERBS.get(word)
if not verb:
raise error("unknown verb", source.string, saved_pos_2)
source.expect(")")
return verb
# (...: an unnamed capture group.
source.pos = saved_pos
group = info.open_group()
saved_flags = info.flags
try:
subpattern = _parse_pattern(source, info)
source.expect(")")
finally:
info.flags = saved_flags
source.ignore_space = bool(info.flags & VERBOSE)
info.close_group()
return Group(info, group, subpattern)
def parse_extension(source, info):
"Parses a Python extension."
saved_pos = source.pos
ch = source.get()
if ch == "<":
# (?P<...: a named capture group.
name = parse_name(source)
group = info.open_group(name)
source.expect(">")
saved_flags = info.flags
try:
subpattern = _parse_pattern(source, info)
source.expect(")")
finally:
info.flags = saved_flags
source.ignore_space = bool(info.flags & VERBOSE)
info.close_group()
return Group(info, group, subpattern)
if ch == "=":
# (?P=...: a named group reference.
name = parse_name(source, allow_numeric=True)
source.expect(")")
if info.is_open_group(name):
raise error("cannot refer to an open group", source.string,
saved_pos)
return make_ref_group(info, name, saved_pos)
if ch == ">" or ch == "&":
# (?P>...: a call to a group.
return parse_call_named_group(source, info, saved_pos)
source.pos = saved_pos
raise error("unknown extension", source.string, saved_pos)
def parse_comment(source):
"Parses a comment."
source.skip_while(set(")"), include=False)
source.expect(")")
return COMMENT
def parse_lookaround(source, info, behind, positive):
"Parses a lookaround."
saved_flags = info.flags
try:
subpattern = _parse_pattern(source, info)
source.expect(")")
finally:
info.flags = saved_flags
source.ignore_space = bool(info.flags & VERBOSE)
return LookAround(behind, positive, subpattern)
def parse_conditional(source, info):
"Parses a conditional subpattern."
saved_flags = info.flags
saved_pos = source.pos
ch = source.get()
if ch == "?":
# (?(?...
ch = source.get()
if ch in ("=", "!"):
# (?(?=... or (?(?!...: lookahead conditional.
return parse_lookaround_conditional(source, info, False, ch == "=")
if ch == "<":
# (?(?<...
ch = source.get()
if ch in ("=", "!"):
# (?(?<=... or (?(?<!...: lookbehind conditional.
return parse_lookaround_conditional(source, info, True, ch ==
"=")
source.pos = saved_pos
raise error("expected lookaround conditional", source.string,
source.pos)
source.pos = saved_pos
try:
group = parse_name(source, True)
source.expect(")")
yes_branch = parse_sequence(source, info)
if source.match("|"):
no_branch = parse_sequence(source, info)
else:
no_branch = Sequence()
source.expect(")")
finally:
info.flags = saved_flags
source.ignore_space = bool(info.flags & VERBOSE)
if yes_branch.is_empty() and no_branch.is_empty():
return Sequence()
return Conditional(info, group, yes_branch, no_branch, saved_pos)
def parse_lookaround_conditional(source, info, behind, positive):
saved_flags = info.flags
try:
subpattern = _parse_pattern(source, info)
source.expect(")")
finally:
info.flags = saved_flags
source.ignore_space = bool(info.flags & VERBOSE)
yes_branch = parse_sequence(source, info)
if source.match("|"):
no_branch = parse_sequence(source, info)
else:
no_branch = Sequence()
source.expect(")")
return LookAroundConditional(behind, positive, subpattern, yes_branch,
no_branch)
def parse_atomic(source, info):
"Parses an atomic subpattern."
saved_flags = info.flags
try:
subpattern = _parse_pattern(source, info)
source.expect(")")
finally:
info.flags = saved_flags
source.ignore_space = bool(info.flags & VERBOSE)
return Atomic(subpattern)
def parse_common(source, info):
"Parses a common groups branch."