diff --git a/spec/std/regex_spec.cr b/spec/std/regex_spec.cr index f8cb2c926549..fc8d5a708648 100644 --- a/spec/std/regex_spec.cr +++ b/spec/std/regex_spec.cr @@ -19,8 +19,8 @@ describe "Regex" do {% if Regex::Engine.resolve.name == "Regex::PCRE" %} Regex.new("^/foo$", Regex::Options.new(0x00000020)).matches?("/foo\n").should be_false {% else %} - expect_raises ArgumentError, "Unknown Regex::Option value: 32" do - Regex.new("", Regex::Options.new(32)) + expect_raises ArgumentError, "Unknown Regex::Option value: 64" do + Regex.new("", Regex::Options.new(0x00000040)) end {% end %} end diff --git a/src/regex.cr b/src/regex.cr index 136671e2f65c..8f8c1484a6e6 100644 --- a/src/regex.cr +++ b/src/regex.cr @@ -208,9 +208,10 @@ class Regex } @[Flags] - enum Options + enum Options : UInt64 # Case insensitive match. - IGNORE_CASE = 1 + IGNORE_CASE = 0x0000_0001 + # PCRE native `PCRE_MULTILINE` flag is `2`, and `PCRE_DOTALL` is `4` # - `PCRE_DOTALL` changes the "`.`" meaning # - `PCRE_MULTILINE` changes "`^`" and "`$`" meanings @@ -218,19 +219,35 @@ class Regex # Crystal modifies this meaning to have essentially one unique "`m`" # flag that activates both behaviours, so here we do the same by # mapping `MULTILINE` to `PCRE_MULTILINE | PCRE_DOTALL`. - MULTILINE = 6 + # The same applies for PCRE2 except that the native values are 0x200 and 0x400. + + # Multiline matching. + # + # Equivalent to `MULTILINE | DOTALL` in PCRE and PCRE2. + MULTILINE = 0x0000_0006 + + DOTALL = 0x0000_0002 + # Ignore white space and `#` comments. - EXTENDED = 8 + EXTENDED = 0x0000_0008 + # Force pattern anchoring. 
- ANCHORED = 16 + ANCHORED = 0x0000_0010 + + DOLLAR_ENDONLY = 0x0000_0020 + FIRSTLINE = 0x0004_0000 + # :nodoc: - UTF_8 = 0x00000800 + UTF_8 = 0x0000_0800 # :nodoc: - NO_UTF8_CHECK = 0x00002000 + NO_UTF8_CHECK = 0x0000_2000 # :nodoc: - DUPNAMES = 0x00080000 + DUPNAMES = 0x0008_0000 # :nodoc: - UCP = 0x20000000 + UCP = 0x2000_0000 + + ENDANCHORED = 0x8000_0000 + NO_JIT end # Returns a `Regex::Options` representing the optional flags applied to this `Regex`. diff --git a/src/regex/lib_pcre.cr b/src/regex/lib_pcre.cr index cf32142c5358..c5811d348b99 100644 --- a/src/regex/lib_pcre.cr +++ b/src/regex/lib_pcre.cr @@ -2,15 +2,84 @@ lib LibPCRE alias Int = LibC::Int - CASELESS = 0x00000001 - MULTILINE = 0x00000002 - DOTALL = 0x00000004 - EXTENDED = 0x00000008 - ANCHORED = 0x00000010 - UTF8 = 0x00000800 - NO_UTF8_CHECK = 0x00002000 - DUPNAMES = 0x00080000 - UCP = 0x20000000 + # Public options. Some are compile-time only, some are run-time only, and some + # are both. Most of the compile-time options are saved with the compiled regex so + # that they can be inspected during studying (and therefore JIT compiling). Note + # that pcre_study() has its own set of options. Originally, all the options + # defined here used distinct bits. However, almost all the bits in a 32-bit word + # are now used, so in order to conserve them, option bits that were previously + # only recognized at matching time (i.e. by pcre_exec() or pcre_dfa_exec()) may + # also be used for compile-time options that affect only compiling and are not + # relevant for studying or JIT compiling. + + # Some options for pcre_compile() change its behaviour but do not affect the + # behaviour of the execution functions. Other options are passed through to the + # execution functions and affect their behaviour, with or without affecting the + # behaviour of pcre_compile(). 
+ + # Options that can be passed to pcre_compile() are tagged Cx below, with these + # variants: + + # C1 Affects compile only + # C2 Does not affect compile; affects exec, dfa_exec + # C3 Affects compile, exec, dfa_exec + # C4 Affects compile, exec, dfa_exec, study + # C5 Affects compile, exec, study + + # Options that can be set for pcre_exec() and/or pcre_dfa_exec() are flagged with + # E and D, respectively. They take precedence over C3, C4, and C5 settings passed + # from pcre_compile(). Those that are compatible with JIT execution are flagged + # with J. + + CASELESS = 0x00000001 + MULTILINE = 0x00000002 + DOTALL = 0x00000004 + EXTENDED = 0x00000008 + ANCHORED = 0x00000010 + DOLLAR_ENDONLY = 0x00000020 + + EXTRA = 0x00000040 # C1 + NOTBOL = 0x00000080 # E D J + NOTEOL = 0x00000100 # E D J + UNGREEDY = 0x00000200 # C1 + NOTEMPTY = 0x00000400 # E D J + UTF8 = 0x00000800 # C4 ) + UTF16 = 0x00000800 # C4 ) Synonyms + UTF32 = 0x00000800 # C4 ) + NO_AUTO_CAPTURE = 0x00001000 # C1 + NO_UTF8_CHECK = 0x00002000 # C1 E D J ) + NO_UTF16_CHECK = 0x00002000 # C1 E D J ) Synonyms + NO_UTF32_CHECK = 0x00002000 # C1 E D J ) + AUTO_CALLOUT = 0x00004000 # C1 + PARTIAL_SOFT = 0x00008000 # E D J ) Synonyms + PARTIAL = 0x00008000 # E D J ) + + # These three use the same bit. + NEVER_UTF = 0x00010000 # C1 ) Overlaid + DFA_SHORTEST = 0x00010000 # D ) Overlaid + NOTBOS = 0x00010000 # D ) Overlaid + + # These three use the same bit. 
+ NO_AUTO_POSSESS = 0x00020000 # C1 ) Overlaid + DFA_RESTART = 0x00020000 # D ) Overlaid + NOTEOS = 0x00020000 # D ) Overlaid + + FIRSTLINE = 0x00040000 # C3 + DUPNAMES = 0x00080000 # C1 + NEWLINE_CR = 0x00100000 # C3 E D + NEWLINE_LF = 0x00200000 # C3 E D + NEWLINE_CRLF = 0x00300000 # C3 E D + NEWLINE_ANY = 0x00400000 # C3 E D + NEWLINE_ANYCRLF = 0x00500000 # C3 E D + BSR_ANYCRLF = 0x00800000 # C3 E D + BSR_UNICODE = 0x01000000 # C3 E D + JAVASCRIPT_COMPAT = 0x02000000 # C5 + NO_START_OPTIMIZE = 0x04000000 # C2 E D ) Synonyms + NO_START_OPTIMISE = 0x04000000 # C2 E D ) + PARTIAL_HARD = 0x08000000 # E D J + NOTEMPTY_ATSTART = 0x10000000 # E D J + UCP = 0x20000000 # C3 + NOTGPOS = 0x40000000 # C3 type Pcre = Void* type PcreExtra = Void* diff --git a/src/regex/lib_pcre2.cr b/src/regex/lib_pcre2.cr index 5fac84326a13..87e584702246 100644 --- a/src/regex/lib_pcre2.cr +++ b/src/regex/lib_pcre2.cr @@ -4,37 +4,98 @@ lib LibPCRE2 UNSET = ~LibC::SizeT.new(0) - ANCHORED = 0x80000000 - NO_UTF_CHECK = 0x40000000 - ENDANCHORED = 0x20000000 - - ALLOW_EMPTY_CLASS = 0x00000001 - ALT_BSUX = 0x00000002 - AUTO_CALLOUT = 0x00000004 - CASELESS = 0x00000008 - DOLLAR_ENDONLY = 0x00000010 - DOTALL = 0x00000020 - DUPNAMES = 0x00000040 - EXTENDED = 0x00000080 - FIRSTLINE = 0x00000100 - MATCH_UNSET_BACKREF = 0x00000200 - MULTILINE = 0x00000400 - NEVER_UCP = 0x00000800 - NEVER_UTF = 0x00001000 - NO_AUTO_CAPTURE = 0x00002000 - NO_AUTO_POSSESS = 0x00004000 - NO_DOTSTAR_ANCHOR = 0x00008000 - NO_START_OPTIMIZE = 0x00010000 - UCP = 0x00020000 - UNGREEDY = 0x00040000 - UTF = 0x00080000 - NEVER_BACKSLASH_C = 0x00100000 - ALT_CIRCUMFLEX = 0x00200000 - ALT_VERBNAMES = 0x00400000 - USE_OFFSET_LIMIT = 0x00800000 - EXTENDED_MORE = 0x01000000 - LITERAL = 0x02000000 - MATCH_INVALID_UTF = 0x04000000 + # The following option bits can be passed to pcre2_compile(), pcre2_match(), + # or pcre2_dfa_match(). PCRE2_NO_UTF_CHECK affects only the function to which it + # is passed. 
Put these bits at the most significant end of the options word so + # others can be added next to them + + ANCHORED = 0x80000000_u32 + NO_UTF_CHECK = 0x40000000_u32 + ENDANCHORED = 0x20000000_u32 + + # The following option bits can be passed only to pcre2_compile(). However, + # they may affect compilation, JIT compilation, and/or interpretive execution. + # The following tags indicate which: + + # C alters what is compiled by pcre2_compile() + # J alters what is compiled by pcre2_jit_compile() + # M is inspected during pcre2_match() execution + # D is inspected during pcre2_dfa_match() execution + + ALLOW_EMPTY_CLASS = 0x00000001_u32 # C + ALT_BSUX = 0x00000002_u32 # C + AUTO_CALLOUT = 0x00000004_u32 # C + CASELESS = 0x00000008_u32 # C + DOLLAR_ENDONLY = 0x00000010_u32 # J M D + DOTALL = 0x00000020_u32 # C + DUPNAMES = 0x00000040_u32 # C + EXTENDED = 0x00000080_u32 # C + FIRSTLINE = 0x00000100_u32 # J M D + MATCH_UNSET_BACKREF = 0x00000200_u32 # C J M + MULTILINE = 0x00000400_u32 # C + NEVER_UCP = 0x00000800_u32 # C + NEVER_UTF = 0x00001000_u32 # C + NO_AUTO_CAPTURE = 0x00002000_u32 # C + NO_AUTO_POSSESS = 0x00004000_u32 # C + NO_DOTSTAR_ANCHOR = 0x00008000_u32 # C + NO_START_OPTIMIZE = 0x00010000_u32 # J M D + UCP = 0x00020000_u32 # C J M D + UNGREEDY = 0x00040000_u32 # C + UTF = 0x00080000_u32 # C J M D + NEVER_BACKSLASH_C = 0x00100000_u32 # C + ALT_CIRCUMFLEX = 0x00200000_u32 # J M D + ALT_VERBNAMES = 0x00400000_u32 # C + USE_OFFSET_LIMIT = 0x00800000_u32 # J M D + EXTENDED_MORE = 0x01000000_u32 # C + LITERAL = 0x02000000_u32 # C + MATCH_INVALID_UTF = 0x04000000_u32 # J M D + + # An additional compile options word is available in the compile context. 
+ + EXTRA_ALLOW_SURROGATE_ESCAPES = 0x00000001_u32 # C + EXTRA_BAD_ESCAPE_IS_LITERAL = 0x00000002_u32 # C + EXTRA_MATCH_WORD = 0x00000004_u32 # C + EXTRA_MATCH_LINE = 0x00000008_u32 # C + EXTRA_ESCAPED_CR_IS_LF = 0x00000010_u32 # C + EXTRA_ALT_BSUX = 0x00000020_u32 # C + EXTRA_ALLOW_LOOKAROUND_BSK = 0x00000040_u32 # C + EXTRA_CASELESS_RESTRICT = 0x00000080_u32 # C + EXTRA_ASCII_BSD = 0x00000100_u32 # C + EXTRA_ASCII_BSS = 0x00000200_u32 # C + EXTRA_ASCII_BSW = 0x00000400_u32 # C + EXTRA_ASCII_POSIX = 0x00000800_u32 # C + + # These are for pcre2_jit_compile(). + + JIT_COMPLETE = 0x00000001_u32 # For full matching + JIT_PARTIAL_SOFT = 0x00000002_u32 + JIT_PARTIAL_HARD = 0x00000004_u32 + JIT_INVALID_UTF = 0x00000100_u32 + + # These are for pcre2_match(), pcre2_dfa_match(), pcre2_jit_match(), and + # pcre2_substitute(). Some are allowed only for one of the functions, and in + # these cases it is noted below. Note that PCRE2_ANCHORED, PCRE2_ENDANCHORED and + # PCRE2_NO_UTF_CHECK can also be passed to these functions (though + # pcre2_jit_match() ignores the latter since it bypasses all sanity checks). + + NOTBOL = 0x00000001_u32 + NOTEOL = 0x00000002_u32 + NOTEMPTY = 0x00000004_u32 # ) These two must be kept + NOTEMPTY_ATSTART = 0x00000008_u32 # ) adjacent to each other. 
+ PARTIAL_SOFT = 0x00000010_u32 + PARTIAL_HARD = 0x00000020_u32 + DFA_RESTART = 0x00000040_u32 # pcre2_dfa_match() only + DFA_SHORTEST = 0x00000080_u32 # pcre2_dfa_match() only + SUBSTITUTE_GLOBAL = 0x00000100_u32 # pcre2_substitute() only + SUBSTITUTE_EXTENDED = 0x00000200_u32 # pcre2_substitute() only + SUBSTITUTE_UNSET_EMPTY = 0x00000400_u32 # pcre2_substitute() only + SUBSTITUTE_UNKNOWN_UNSET = 0x00000800_u32 # pcre2_substitute() only + SUBSTITUTE_OVERFLOW_LENGTH = 0x00001000_u32 # pcre2_substitute() only + NO_JIT = 0x00002000_u32 # Not for pcre2_dfa_match() + COPY_MATCHED_SUBJECT = 0x00004000_u32 + SUBSTITUTE_LITERAL = 0x00008000_u32 # pcre2_substitute() only + SUBSTITUTE_MATCHED = 0x00010000_u32 # pcre2_substitute() only + SUBSTITUTE_REPLACEMENT_ONLY = 0x00020000_u32 # pcre2_substitute() only enum Error # "Expected" matching error codes: no match and partial match. @@ -185,10 +246,6 @@ lib LibPCRE2 type MatchContext = Void* fun match_context_create = pcre2_match_context_create_8(gcontext : Void*) : MatchContext* - JIT_COMPLETE = 0x00000001_u32 # For full matching - JIT_PARTIAL_SOFT = 0x00000002_u32 - JIT_PARTIAL_HARD = 0x00000004_u32 - JIT_INVALID_UTF = 0x00000100_u32 fun jit_compile = pcre2_jit_compile_8(code : Code*, options : UInt32) : Int type JITStack = Void diff --git a/src/regex/pcre.cr b/src/regex/pcre.cr index 9d006c59dcd8..f8a337090230 100644 --- a/src/regex/pcre.cr +++ b/src/regex/pcre.cr @@ -7,7 +7,7 @@ module Regex::PCRE source = source.gsub('\u{0}', "\\0") @source = source - @re = LibPCRE.compile(@source, pcre_options(options) | LibPCRE::UTF8 | LibPCRE::NO_UTF8_CHECK | LibPCRE::DUPNAMES | LibPCRE::UCP, out errptr, out erroffset, nil) + @re = LibPCRE.compile(@source, pcre_compile_options(options) | LibPCRE::UTF8 | LibPCRE::NO_UTF8_CHECK | LibPCRE::DUPNAMES | LibPCRE::UCP, out errptr, out erroffset, nil) raise ArgumentError.new("#{String.new(errptr)} at #{erroffset}") if @re.null? 
@extra = LibPCRE.study(@re, LibPCRE::STUDY_JIT_COMPILE, out studyerrptr) if @extra.null? && studyerrptr @@ -19,19 +19,55 @@ module Regex::PCRE LibPCRE.full_info(@re, nil, LibPCRE::INFO_CAPTURECOUNT, out @captures) end - private def pcre_options(options) + private def pcre_compile_options(options) flag = 0 Regex::Options.each do |option| if options.includes?(option) flag |= case option - when .ignore_case? then LibPCRE::CASELESS - when .multiline? then LibPCRE::DOTALL | LibPCRE::MULTILINE - when .extended? then LibPCRE::EXTENDED - when .anchored? then LibPCRE::ANCHORED - when .utf_8? then LibPCRE::UTF8 - when .no_utf8_check? then LibPCRE::NO_UTF8_CHECK - when .dupnames? then LibPCRE::DUPNAMES - when .ucp? then LibPCRE::UCP + when .ignore_case? then LibPCRE::CASELESS + when .multiline? then LibPCRE::DOTALL | LibPCRE::MULTILINE + when .dotall? then LibPCRE::DOTALL + when .extended? then LibPCRE::EXTENDED + when .anchored? then LibPCRE::ANCHORED + when .dollar_endonly? then LibPCRE::DOLLAR_ENDONLY + when .firstline? then LibPCRE::FIRSTLINE + when .utf_8? then LibPCRE::UTF8 + when .no_utf8_check? then LibPCRE::NO_UTF8_CHECK + when .dupnames? then LibPCRE::DUPNAMES + when .ucp? then LibPCRE::UCP + when .endanchored? then raise ArgumentError.new("Regex::Option::ENDANCHORED is not supported with PCRE") + when .no_jit? then raise ArgumentError.new("Invalid regex option NO_JIT for `pcre_compile`") + else + raise "unreachable" + end + options &= ~option + end + end + + # Unnamed values are explicitly used PCRE options, just pass them through: + flag |= options.value + + flag + end + + private def pcre_match_options(options) + flag = 0 + Regex::Options.each do |option| + if options.includes?(option) + flag |= case option + when .ignore_case? then raise ArgumentError.new("Invalid regex option IGNORE_CASE for `pcre_exec`") + when .multiline? then raise ArgumentError.new("Invalid regex option MULTILINE for `pcre_exec`") + when .dotall? 
then raise ArgumentError.new("Invalid regex option DOTALL for `pcre_exec`") + when .extended? then raise ArgumentError.new("Invalid regex option EXTENDED for `pcre_exec`") + when .anchored? then LibPCRE::ANCHORED + when .dollar_endonly? then raise ArgumentError.new("Invalid regex option DOLLAR_ENDONLY for `pcre_exec`") + when .firstline? then raise ArgumentError.new("Invalid regex option FIRSTLINE for `pcre_exec`") + when .utf_8? then raise ArgumentError.new("Invalid regex option UTF_8 for `pcre_exec`") + when .no_utf8_check? then LibPCRE::NO_UTF8_CHECK + when .dupnames? then raise ArgumentError.new("Invalid regex option DUPNAMES for `pcre_exec`") + when .ucp? then raise ArgumentError.new("Invalid regex option UCP for `pcre_exec`") + when .endanchored? then raise ArgumentError.new("Regex::Option::ENDANCHORED is not supported with PCRE") + when .no_jit? then raise ArgumentError.new("Regex::Option::NO_JIT is not supported with PCRE") else raise "unreachable" end @@ -106,7 +142,7 @@ module Regex::PCRE # Calls `pcre_exec` C function, and handles returning value. private def internal_matches?(str, byte_index, options, ovector, ovector_size) - ret = LibPCRE.exec(@re, @extra, str, str.bytesize, byte_index, pcre_options(options) | LibPCRE::NO_UTF8_CHECK, ovector, ovector_size) + ret = LibPCRE.exec(@re, @extra, str, str.bytesize, byte_index, pcre_match_options(options) | LibPCRE::NO_UTF8_CHECK, ovector, ovector_size) # TODO: when `ret < -1`, it means PCRE error. It should handle correctly. 
ret >= 0 end diff --git a/src/regex/pcre2.cr b/src/regex/pcre2.cr index ff9b1300d40b..96d863aea73d 100644 --- a/src/regex/pcre2.cr +++ b/src/regex/pcre2.cr @@ -8,7 +8,7 @@ module Regex::PCRE2 # :nodoc: def initialize(*, _source @source : String, _options @options) - @re = PCRE2.compile(source, pcre2_options(options) | LibPCRE2::UTF | LibPCRE2::NO_UTF_CHECK | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message| + @re = PCRE2.compile(source, pcre2_compile_options(options) | LibPCRE2::UTF | LibPCRE2::NO_UTF_CHECK | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message| raise ArgumentError.new(error_message) end @@ -41,19 +41,54 @@ module Regex::PCRE2 end end - private def pcre2_options(options) + private def pcre2_compile_options(options) flag = 0 Regex::Options.each do |option| if options.includes?(option) flag |= case option - when .ignore_case? then LibPCRE2::CASELESS - when .multiline? then LibPCRE2::DOTALL | LibPCRE2::MULTILINE - when .extended? then LibPCRE2::EXTENDED - when .anchored? then LibPCRE2::ANCHORED - when .utf_8? then LibPCRE2::UTF - when .no_utf8_check? then LibPCRE2::NO_UTF_CHECK - when .dupnames? then LibPCRE2::DUPNAMES - when .ucp? then LibPCRE2::UCP + when .ignore_case? then LibPCRE2::CASELESS + when .multiline? then LibPCRE2::DOTALL | LibPCRE2::MULTILINE + when .dotall? then LibPCRE2::DOTALL + when .extended? then LibPCRE2::EXTENDED + when .anchored? then LibPCRE2::ANCHORED + when .dollar_endonly? then LibPCRE2::DOLLAR_ENDONLY + when .firstline? then LibPCRE2::FIRSTLINE + when .utf_8? then LibPCRE2::UTF + when .no_utf8_check? then LibPCRE2::NO_UTF_CHECK + when .dupnames? then LibPCRE2::DUPNAMES + when .ucp? then LibPCRE2::UCP + when .endanchored? then LibPCRE2::ENDANCHORED + when .no_jit? then raise ArgumentError.new("Invalid regex option NO_JIT for `pcre2_compile`") + else + raise "unreachable" + end + options &= ~option + end + end + unless options.none? 
+ raise ArgumentError.new("Unknown Regex::Option value: #{options}") + end + flag + end + + private def pcre2_match_options(options) + flag = 0 + Regex::Options.each do |option| + if options.includes?(option) + flag |= case option + when .ignore_case? then raise ArgumentError.new("Invalid regex option IGNORE_CASE for `pcre2_match`") + when .multiline? then raise ArgumentError.new("Invalid regex option MULTILINE for `pcre2_match`") + when .dotall? then raise ArgumentError.new("Invalid regex option DOTALL for `pcre2_match`") + when .extended? then raise ArgumentError.new("Invalid regex option EXTENDED for `pcre2_match`") + when .anchored? then LibPCRE2::ANCHORED + when .dollar_endonly? then raise ArgumentError.new("Invalid regex option DOLLAR_ENDONLY for `pcre2_match`") + when .firstline? then raise ArgumentError.new("Invalid regex option FIRSTLINE for `pcre2_match`") + when .utf_8? then raise ArgumentError.new("Invalid regex option UTF_8 for `pcre2_match`") + when .no_utf8_check? then LibPCRE2::NO_UTF_CHECK + when .dupnames? then raise ArgumentError.new("Invalid regex option DUPNAMES for `pcre2_match`") + when .ucp? then raise ArgumentError.new("Invalid regex option UCP for `pcre2_match`") + when .endanchored? then LibPCRE2::ENDANCHORED + when .no_jit? then LibPCRE2::NO_JIT else raise "unreachable" end @@ -186,7 +221,7 @@ module Regex::PCRE2 private def match_data(str, byte_index, options) match_data = self.match_data - match_count = LibPCRE2.match(@re, str, str.bytesize, byte_index, pcre2_options(options) | LibPCRE2::NO_UTF_CHECK, match_data, PCRE2.match_context) + match_count = LibPCRE2.match(@re, str, str.bytesize, byte_index, pcre2_match_options(options) | LibPCRE2::NO_UTF_CHECK, match_data, PCRE2.match_context) if match_count < 0 case error = LibPCRE2::Error.new(match_count)