Skip to content

Commit

Permalink
Add more members to Regex::Options (#13223)
Browse files Browse the repository at this point in the history
  • Loading branch information
straight-shoota committed Mar 31, 2023
1 parent 38155d1 commit b31b07d
Show file tree
Hide file tree
Showing 6 changed files with 291 additions and 77 deletions.
4 changes: 2 additions & 2 deletions spec/std/regex_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ describe "Regex" do
{% if Regex::Engine.resolve.name == "Regex::PCRE" %}
Regex.new("^/foo$", Regex::Options.new(0x00000020)).matches?("/foo\n").should be_false
{% else %}
expect_raises ArgumentError, "Unknown Regex::Option value: 32" do
Regex.new("", Regex::Options.new(32))
expect_raises ArgumentError, "Unknown Regex::Option value: 64" do
Regex.new("", Regex::Options.new(0x00000040))
end
{% end %}
end
Expand Down
35 changes: 26 additions & 9 deletions src/regex.cr
Original file line number Diff line number Diff line change
Expand Up @@ -208,29 +208,46 @@ class Regex
}

# Option flags accepted by `Regex`. The values are Crystal's own numbering;
# each engine translates the named members to its native option bits.
@[Flags]
enum Options
enum Options : UInt64
# Case insensitive match.
IGNORE_CASE = 1
IGNORE_CASE = 0x0000_0001

# PCRE native `PCRE_MULTILINE` flag is `2`, and `PCRE_DOTALL` is `4`
# - `PCRE_DOTALL` changes the "`.`" meaning
# - `PCRE_MULTILINE` changes "`^`" and "`$`" meanings
#
# Crystal modifies this meaning to have essentially one unique "`m`"
# flag that activates both behaviours, so here we do the same by
# mapping `MULTILINE` to `PCRE_MULTILINE | PCRE_DOTALL`.
MULTILINE = 6
# The same applies for PCRE2 except that the native values are
# `0x400` (`PCRE2_MULTILINE`) and `0x20` (`PCRE2_DOTALL`).

# Multiline matching.
#
# Equivalent to `MULTILINE | DOTALL` in PCRE and PCRE2.
MULTILINE = 0x0000_0006

# Changes the "`.`" meaning only (the native `DOTALL` behaviour).
#
# Its bit (`0x2`) is also part of `MULTILINE` (`0x6`).
DOTALL = 0x0000_0002

# Ignore white space and `#` comments.
EXTENDED = 8
EXTENDED = 0x0000_0008

# Force pattern anchoring.
ANCHORED = 16
ANCHORED = 0x0000_0010

# Compile-time option: the PCRE engine maps it to `LibPCRE::DOLLAR_ENDONLY`
# and rejects it as a match-time option.
DOLLAR_ENDONLY = 0x0000_0020
# Compile-time option: the PCRE engine maps it to `LibPCRE::FIRSTLINE`
# and rejects it as a match-time option.
FIRSTLINE = 0x0004_0000

# :nodoc:
UTF_8 = 0x00000800
UTF_8 = 0x0000_0800
# :nodoc:
NO_UTF8_CHECK = 0x00002000
NO_UTF8_CHECK = 0x0000_2000
# :nodoc:
DUPNAMES = 0x00080000
DUPNAMES = 0x0008_0000
# :nodoc:
UCP = 0x20000000
UCP = 0x2000_0000

# Not supported by the PCRE engine, which raises `ArgumentError` for it.
ENDANCHORED = 0x8000_0000
# Rejected by the PCRE engine both at compile time and at match time.
#
# NOTE(review): no explicit value is given; presumably it receives the next
# flag bit after `ENDANCHORED`, which would not fit in 32 bits and would
# explain the `UInt64` base type — confirm.
NO_JIT
end

# Returns a `Regex::Options` representing the optional flags applied to this `Regex`.
Expand Down
87 changes: 78 additions & 9 deletions src/regex/lib_pcre.cr
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,84 @@
lib LibPCRE
alias Int = LibC::Int

CASELESS = 0x00000001
MULTILINE = 0x00000002
DOTALL = 0x00000004
EXTENDED = 0x00000008
ANCHORED = 0x00000010
UTF8 = 0x00000800
NO_UTF8_CHECK = 0x00002000
DUPNAMES = 0x00080000
UCP = 0x20000000
# Public options. Some are compile-time only, some are run-time only, and some
# are both. Most of the compile-time options are saved with the compiled regex so
# that they can be inspected during studying (and therefore JIT compiling). Note
# that pcre_study() has its own set of options. Originally, all the options
# defined here used distinct bits. However, almost all the bits in a 32-bit word
# are now used, so in order to conserve them, option bits that were previously
# only recognized at matching time (i.e. by pcre_exec() or pcre_dfa_exec()) may
# also be used for compile-time options that affect only compiling and are not
# relevant for studying or JIT compiling.

# Some options for pcre_compile() change its behaviour but do not affect the
# behaviour of the execution functions. Other options are passed through to the
# execution functions and affect their behaviour, with or without affecting the
# behaviour of pcre_compile().

# Options that can be passed to pcre_compile() are tagged Cx below, with these
# variants:

# C1 Affects compile only
# C2 Does not affect compile; affects exec, dfa_exec
# C3 Affects compile, exec, dfa_exec
# C4 Affects compile, exec, dfa_exec, study
# C5 Affects compile, exec, study

# Options that can be set for pcre_exec() and/or pcre_dfa_exec() are flagged with
# E and D, respectively. They take precedence over C3, C4, and C5 settings passed
# from pcre_compile(). Those that are compatible with JIT execution are flagged
# with J.

CASELESS = 0x00000001
MULTILINE = 0x00000002
DOTALL = 0x00000004
EXTENDED = 0x00000008
ANCHORED = 0x00000010
DOLLAR_ENDONLY = 0x00000020

EXTRA = 0x00000040 # C1
NOTBOL = 0x00000080 # E D J
NOTEOL = 0x00000100 # E D J
UNGREEDY = 0x00000200 # C1
NOTEMPTY = 0x00000400 # E D J
UTF8 = 0x00000800 # C4 )
UTF16 = 0x00000800 # C4 ) Synonyms
UTF32 = 0x00000800 # C4 )
NO_AUTO_CAPTURE = 0x00001000 # C1
NO_UTF8_CHECK = 0x00002000 # C1 E D J )
NO_UTF16_CHECK = 0x00002000 # C1 E D J ) Synonyms
NO_UTF32_CHECK = 0x00002000 # C1 E D J )
AUTO_CALLOUT = 0x00004000 # C1
PARTIAL_SOFT = 0x00008000 # E D J ) Synonyms
PARTIAL = 0x00008000 # E D J )

# These three options use the same bit.
NEVER_UTF = 0x00010000 # C1 ) Overlaid
DFA_SHORTEST = 0x00010000 # D ) Overlaid
NOTBOS = 0x00010000 # D ) Overlaid

# These three options use the same bit.
NO_AUTO_POSSESS = 0x00020000 # C1 ) Overlaid
DFA_RESTART = 0x00020000 # D ) Overlaid
NOTEOS = 0x00020000 # D ) Overlaid

FIRSTLINE = 0x00040000 # C3
DUPNAMES = 0x00080000 # C1
NEWLINE_CR = 0x00100000 # C3 E D
NEWLINE_LF = 0x00200000 # C3 E D
NEWLINE_CRLF = 0x00300000 # C3 E D
NEWLINE_ANY = 0x00400000 # C3 E D
NEWLINE_ANYCRLF = 0x00500000 # C3 E D
BSR_ANYCRLF = 0x00800000 # C3 E D
BSR_UNICODE = 0x01000000 # C3 E D
JAVASCRIPT_COMPAT = 0x02000000 # C5
NO_START_OPTIMIZE = 0x04000000 # C2 E D ) Synonyms
NO_START_OPTIMISE = 0x04000000 # C2 E D )
PARTIAL_HARD = 0x08000000 # E D J
NOTEMPTY_ATSTART = 0x10000000 # E D J
UCP = 0x20000000 # C3
NOTGPOS = 0x40000000 # C3

type Pcre = Void*
type PcreExtra = Void*
Expand Down
127 changes: 92 additions & 35 deletions src/regex/lib_pcre2.cr
Original file line number Diff line number Diff line change
Expand Up @@ -4,37 +4,98 @@ lib LibPCRE2

UNSET = ~LibC::SizeT.new(0)

ANCHORED = 0x80000000
NO_UTF_CHECK = 0x40000000
ENDANCHORED = 0x20000000

ALLOW_EMPTY_CLASS = 0x00000001
ALT_BSUX = 0x00000002
AUTO_CALLOUT = 0x00000004
CASELESS = 0x00000008
DOLLAR_ENDONLY = 0x00000010
DOTALL = 0x00000020
DUPNAMES = 0x00000040
EXTENDED = 0x00000080
FIRSTLINE = 0x00000100
MATCH_UNSET_BACKREF = 0x00000200
MULTILINE = 0x00000400
NEVER_UCP = 0x00000800
NEVER_UTF = 0x00001000
NO_AUTO_CAPTURE = 0x00002000
NO_AUTO_POSSESS = 0x00004000
NO_DOTSTAR_ANCHOR = 0x00008000
NO_START_OPTIMIZE = 0x00010000
UCP = 0x00020000
UNGREEDY = 0x00040000
UTF = 0x00080000
NEVER_BACKSLASH_C = 0x00100000
ALT_CIRCUMFLEX = 0x00200000
ALT_VERBNAMES = 0x00400000
USE_OFFSET_LIMIT = 0x00800000
EXTENDED_MORE = 0x01000000
LITERAL = 0x02000000
MATCH_INVALID_UTF = 0x04000000
# The following option bits can be passed to pcre2_compile(), pcre2_match(),
# or pcre2_dfa_match(). PCRE2_NO_UTF_CHECK affects only the function to which it
# is passed. Put these bits at the most significant end of the options word so
# others can be added next to them.

ANCHORED = 0x80000000_u32
NO_UTF_CHECK = 0x40000000_u32
ENDANCHORED = 0x20000000_u32

# The following option bits can be passed only to pcre2_compile(). However,
# they may affect compilation, JIT compilation, and/or interpretive execution.
# The following tags indicate which:

# C alters what is compiled by pcre2_compile()
# J alters what is compiled by pcre2_jit_compile()
# M is inspected during pcre2_match() execution
# D is inspected during pcre2_dfa_match() execution

ALLOW_EMPTY_CLASS = 0x00000001_u32 # C
ALT_BSUX = 0x00000002_u32 # C
AUTO_CALLOUT = 0x00000004_u32 # C
CASELESS = 0x00000008_u32 # C
DOLLAR_ENDONLY = 0x00000010_u32 # J M D
DOTALL = 0x00000020_u32 # C
DUPNAMES = 0x00000040_u32 # C
EXTENDED = 0x00000080_u32 # C
FIRSTLINE = 0x00000100_u32 # J M D
MATCH_UNSET_BACKREF = 0x00000200_u32 # C J M
MULTILINE = 0x00000400_u32 # C
NEVER_UCP = 0x00000800_u32 # C
NEVER_UTF = 0x00001000_u32 # C
NO_AUTO_CAPTURE = 0x00002000_u32 # C
NO_AUTO_POSSESS = 0x00004000_u32 # C
NO_DOTSTAR_ANCHOR = 0x00008000_u32 # C
NO_START_OPTIMIZE = 0x00010000_u32 # J M D
UCP = 0x00020000_u32 # C J M D
UNGREEDY = 0x00040000_u32 # C
UTF = 0x00080000_u32 # C J M D
NEVER_BACKSLASH_C = 0x00100000_u32 # C
ALT_CIRCUMFLEX = 0x00200000_u32 # J M D
ALT_VERBNAMES = 0x00400000_u32 # C
USE_OFFSET_LIMIT = 0x00800000_u32 # J M D
EXTENDED_MORE = 0x01000000_u32 # C
LITERAL = 0x02000000_u32 # C
MATCH_INVALID_UTF = 0x04000000_u32 # J M D

# An additional compile options word is available in the compile context.

EXTRA_ALLOW_SURROGATE_ESCAPES = 0x00000001_u32 # C
EXTRA_BAD_ESCAPE_IS_LITERAL = 0x00000002_u32 # C
EXTRA_MATCH_WORD = 0x00000004_u32 # C
EXTRA_MATCH_LINE = 0x00000008_u32 # C
EXTRA_ESCAPED_CR_IS_LF = 0x00000010_u32 # C
EXTRA_ALT_BSUX = 0x00000020_u32 # C
EXTRA_ALLOW_LOOKAROUND_BSK = 0x00000040_u32 # C
EXTRA_CASELESS_RESTRICT = 0x00000080_u32 # C
EXTRA_ASCII_BSD = 0x00000100_u32 # C
EXTRA_ASCII_BSS = 0x00000200_u32 # C
EXTRA_ASCII_BSW = 0x00000400_u32 # C
EXTRA_ASCII_POSIX = 0x00000800_u32 # C

# These are for pcre2_jit_compile().

JIT_COMPLETE = 0x00000001_u32 # For full matching
JIT_PARTIAL_SOFT = 0x00000002_u32
JIT_PARTIAL_HARD = 0x00000004_u32
JIT_INVALID_UTF = 0x00000100_u32

# These are for pcre2_match(), pcre2_dfa_match(), pcre2_jit_match(), and
# pcre2_substitute(). Some are allowed only for one of the functions, and in
# these cases it is noted below. Note that PCRE2_ANCHORED, PCRE2_ENDANCHORED and
# PCRE2_NO_UTF_CHECK can also be passed to these functions (though
# pcre2_jit_match() ignores the latter since it bypasses all sanity checks).

NOTBOL = 0x00000001_u32
NOTEOL = 0x00000002_u32
NOTEMPTY = 0x00000004_u32 # ) These two must be kept
NOTEMPTY_ATSTART = 0x00000008_u32 # ) adjacent to each other.
PARTIAL_SOFT = 0x00000010_u32
PARTIAL_HARD = 0x00000020_u32
DFA_RESTART = 0x00000040_u32 # pcre2_dfa_match() only
DFA_SHORTEST = 0x00000080_u32 # pcre2_dfa_match() only
SUBSTITUTE_GLOBAL = 0x00000100_u32 # pcre2_substitute() only
SUBSTITUTE_EXTENDED = 0x00000200_u32 # pcre2_substitute() only
SUBSTITUTE_UNSET_EMPTY = 0x00000400_u32 # pcre2_substitute() only
SUBSTITUTE_UNKNOWN_UNSET = 0x00000800_u32 # pcre2_substitute() only
SUBSTITUTE_OVERFLOW_LENGTH = 0x00001000_u32 # pcre2_substitute() only
NO_JIT = 0x00002000_u32 # Not for pcre2_dfa_match()
COPY_MATCHED_SUBJECT = 0x00004000_u32
SUBSTITUTE_LITERAL = 0x00008000_u32 # pcre2_substitute() only
SUBSTITUTE_MATCHED = 0x00010000_u32 # pcre2_substitute() only
SUBSTITUTE_REPLACEMENT_ONLY = 0x00020000_u32 # pcre2_substitute() only

enum Error
# "Expected" matching error codes: no match and partial match.
Expand Down Expand Up @@ -185,10 +246,6 @@ lib LibPCRE2
type MatchContext = Void*
fun match_context_create = pcre2_match_context_create_8(gcontext : Void*) : MatchContext*

JIT_COMPLETE = 0x00000001_u32 # For full matching
JIT_PARTIAL_SOFT = 0x00000002_u32
JIT_PARTIAL_HARD = 0x00000004_u32
JIT_INVALID_UTF = 0x00000100_u32
fun jit_compile = pcre2_jit_compile_8(code : Code*, options : UInt32) : Int

type JITStack = Void
Expand Down
58 changes: 47 additions & 11 deletions src/regex/pcre.cr
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ module Regex::PCRE
source = source.gsub('\u{0}', "\\0")
@source = source

@re = LibPCRE.compile(@source, pcre_options(options) | LibPCRE::UTF8 | LibPCRE::NO_UTF8_CHECK | LibPCRE::DUPNAMES | LibPCRE::UCP, out errptr, out erroffset, nil)
@re = LibPCRE.compile(@source, pcre_compile_options(options) | LibPCRE::UTF8 | LibPCRE::NO_UTF8_CHECK | LibPCRE::DUPNAMES | LibPCRE::UCP, out errptr, out erroffset, nil)
raise ArgumentError.new("#{String.new(errptr)} at #{erroffset}") if @re.null?
@extra = LibPCRE.study(@re, LibPCRE::STUDY_JIT_COMPILE, out studyerrptr)
if @extra.null? && studyerrptr
Expand All @@ -19,19 +19,55 @@ module Regex::PCRE
LibPCRE.full_info(@re, nil, LibPCRE::INFO_CAPTURECOUNT, out @captures)
end

# Translates `Regex::Options` into the integer option word passed to
# `LibPCRE.compile`.
#
# Each named member set in *options* is mapped to its `LibPCRE` constant and
# then cleared from *options*; whatever bits remain afterwards are OR-ed into
# the result unchanged.
#
# Raises `ArgumentError` for `ENDANCHORED` and `NO_JIT`.
#
# NOTE(review): the ENDANCHORED message spells the type `Regex::Option::`
# while the enum is named `Regex::Options` — confirm before changing, since
# callers or specs may match on the text.
private def pcre_options(options)
private def pcre_compile_options(options)
flag = 0
Regex::Options.each do |option|
if options.includes?(option)
flag |= case option
when .ignore_case? then LibPCRE::CASELESS
when .multiline? then LibPCRE::DOTALL | LibPCRE::MULTILINE
when .extended? then LibPCRE::EXTENDED
when .anchored? then LibPCRE::ANCHORED
when .utf_8? then LibPCRE::UTF8
when .no_utf8_check? then LibPCRE::NO_UTF8_CHECK
when .dupnames? then LibPCRE::DUPNAMES
when .ucp? then LibPCRE::UCP
when .ignore_case? then LibPCRE::CASELESS
when .multiline? then LibPCRE::DOTALL | LibPCRE::MULTILINE
when .dotall? then LibPCRE::DOTALL
when .extended? then LibPCRE::EXTENDED
when .anchored? then LibPCRE::ANCHORED
when .dollar_endonly? then LibPCRE::DOLLAR_ENDONLY
when .firstline? then LibPCRE::FIRSTLINE
when .utf_8? then LibPCRE::UTF8
when .no_utf8_check? then LibPCRE::NO_UTF8_CHECK
when .dupnames? then LibPCRE::DUPNAMES
when .ucp? then LibPCRE::UCP
when .endanchored? then raise ArgumentError.new("Regex::Option::ENDANCHORED is not supported with PCRE")
when .no_jit? then raise ArgumentError.new("Invalid regex option NO_JIT for `pcre_compile`")
else
raise "unreachable"
end
# Clear the flag just handled so that only unnamed bits are left for the
# pass-through below.
options &= ~option
end
end

# Unnamed values are explicitly used PCRE options, just pass them through:
flag |= options.value

flag
end

private def pcre_match_options(options)
flag = 0
Regex::Options.each do |option|
if options.includes?(option)
flag |= case option
when .ignore_case? then raise ArgumentError.new("Invalid regex option IGNORE_CASE for `pcre_exec`")
when .multiline? then raise ArgumentError.new("Invalid regex option MULTILINE for `pcre_exec`")
when .dotall? then raise ArgumentError.new("Invalid regex option DOTALL for `pcre_exec`")
when .extended? then raise ArgumentError.new("Invalid regex option EXTENDED for `pcre_exec`")
when .anchored? then LibPCRE::ANCHORED
when .dollar_endonly? then raise ArgumentError.new("Invalid regex option DOLLAR_ENDONLY for `pcre_exec`")
when .firstline? then raise ArgumentError.new("Invalid regex option FIRSTLINE for `pcre_exec`")
when .utf_8? then raise ArgumentError.new("Invalid regex option UTF_8 for `pcre_exec`")
when .no_utf8_check? then LibPCRE::NO_UTF8_CHECK
when .dupnames? then raise ArgumentError.new("Invalid regex option DUPNAMES for `pcre_exec`")
when .ucp? then raise ArgumentError.new("Invalid regex option UCP for `pcre_exec`")
when .endanchored? then raise ArgumentError.new("Regex::Option::ENDANCHORED is not supported with PCRE")
when .no_jit? then raise ArgumentError.new("Regex::Option::NO_JIT is not supported with PCRE")
else
raise "unreachable"
end
Expand Down Expand Up @@ -106,7 +142,7 @@ module Regex::PCRE

# Calls the `pcre_exec` C function on *str*, starting at *byte_index*, and
# returns `true` when its return value is non-negative.
#
# *options* is translated to native match options, and
# `LibPCRE::NO_UTF8_CHECK` is always added to them.
private def internal_matches?(str, byte_index, options, ovector, ovector_size)
ret = LibPCRE.exec(@re, @extra, str, str.bytesize, byte_index, pcre_options(options) | LibPCRE::NO_UTF8_CHECK, ovector, ovector_size)
ret = LibPCRE.exec(@re, @extra, str, str.bytesize, byte_index, pcre_match_options(options) | LibPCRE::NO_UTF8_CHECK, ovector, ovector_size)
# TODO: `ret < -1` means a PCRE error; it is currently reported the same
# way as "no match" and should be handled properly.
ret >= 0
end
Expand Down
Loading

0 comments on commit b31b07d

Please sign in to comment.