From d0ec5a7cfec67a08735b720a956c92d1440b3789 Mon Sep 17 00:00:00 2001 From: James Ring Date: Sun, 4 Mar 2018 16:33:28 -0800 Subject: [PATCH] Named group support. This changes the parser to annotate the returned Regexp with information about named capture groups. During compilation, this information is passed along to the RE2 instance and thence to the Matcher. The parser will throw PatternSyntaxException if duplicate group names are specified. --- java/com/google/re2j/Matcher.java | 49 ++++++++++++++++++++++ java/com/google/re2j/Parser.java | 8 ++++ java/com/google/re2j/RE2.java | 3 ++ java/com/google/re2j/Regexp.java | 3 ++ javatests/com/google/re2j/MatcherTest.java | 30 +++++++++++++ javatests/com/google/re2j/ParserTest.java | 2 + 6 files changed, 95 insertions(+) diff --git a/java/com/google/re2j/Matcher.java b/java/com/google/re2j/Matcher.java index eb848610..55b0ac4f 100644 --- a/java/com/google/re2j/Matcher.java +++ b/java/com/google/re2j/Matcher.java @@ -2,6 +2,8 @@ package com.google.re2j; +import java.util.Map; + /** * A stateful iterator that interprets a regex {@code Pattern} on a specific input. Its interface * mimics the JDK 1.4.2 {@code java.util.regex.Matcher}. @@ -37,6 +39,8 @@ public final class Matcher { // The group indexes, in [start, end) pairs. Zeroth pair is overall match. private final int[] groups; + private final Map namedGroups; + // The number of submatches (groups) in the pattern. private final int groupCount; @@ -66,6 +70,7 @@ private Matcher(Pattern pattern) { RE2 re2 = pattern.re2(); groupCount = re2.numberOfCapturingGroups(); groups = new int[2 + 2 * groupCount]; + namedGroups = re2.namedGroups; } /** Creates a new {@code Matcher} with the given pattern and input. */ @@ -137,6 +142,21 @@ public int start(int group) { return groups[2 * group]; } + /** + * Returns the start of the named group of the most recent match, or -1 if the group was not + * matched. 
+ * + * @param group the group name + * @throws IllegalArgumentException if no group with that name exists + */ + public int start(String group) { + Integer g = namedGroups.get(group); + if (g == null) { + throw new IllegalArgumentException("group '" + group + "' not found"); + } + return start(g); + } + /** * Returns the end position of a subgroup of the most recent match. * @@ -149,6 +169,21 @@ public int end(int group) { return groups[2 * group + 1]; } + /** + * Returns the end of the named group of the most recent match, or -1 if the group was not + * matched. + * + * @param group the group name + * @throws IllegalArgumentException if no group with that name exists + */ + public int end(String group) { + Integer g = namedGroups.get(group); + if (g == null) { + throw new IllegalArgumentException("group '" + group + "' not found"); + } + return end(g); + } + /** * Returns the most recent match. * @@ -174,6 +209,20 @@ public String group(int group) { return substring(start, end); } + /** + * Returns the named group of the most recent match, or {@code null} if the group was not matched. + * + * @param group the group name + * @throws IllegalArgumentException if no group with that name exists + */ + public String group(String group) { + Integer g = namedGroups.get(group); + if (g == null) { + throw new IllegalArgumentException("group '" + group + "' not found"); + } + return group(g); + } + /** * Returns the number of subgroups in this pattern. * diff --git a/java/com/google/re2j/Parser.java b/java/com/google/re2j/Parser.java index 2b965052..5cbc8b2d 100644 --- a/java/com/google/re2j/Parser.java +++ b/java/com/google/re2j/Parser.java @@ -13,7 +13,9 @@ import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import java.util.Map; /** * A parser of regular expression patterns. 
@@ -38,6 +40,7 @@ class Parser { private static final String ERR_MISSING_REPEAT_ARGUMENT = "missing argument to repetition operator"; private static final String ERR_TRAILING_BACKSLASH = "trailing backslash at end of expression"; + private static final String ERR_DUPLICATE_NAMED_CAPTURE = "duplicate capture group name"; // Hack to expose ArrayList.removeRange(). private static class Stack extends ArrayList { @@ -56,6 +59,7 @@ public void removeRange(int fromIndex, int toIndex) { private final Stack stack = new Stack(); private Regexp free; private int numCap = 0; // number of capturing groups seen + private Map namedGroups = new HashMap(); Parser(String wholeRegexp, int flags) { this.wholeRegexp = wholeRegexp; @@ -972,6 +976,7 @@ private Regexp parseInternal() throws PatternSyntaxException { if (n != 1) { throw new PatternSyntaxException(ERR_MISSING_PAREN, wholeRegexp); } + stack.get(0).namedGroups = namedGroups; return stack.get(0); } @@ -1062,6 +1067,9 @@ private void parsePerlFlags(StringIterator t) throws PatternSyntaxException { // Like ordinary capture, but named. Regexp re = op(Regexp.Op.LEFT_PAREN); re.cap = ++numCap; + if (namedGroups.put(name, numCap) != null) { + throw new PatternSyntaxException(ERR_DUPLICATE_NAMED_CAPTURE, name); + } re.name = name; return; } diff --git a/java/com/google/re2j/RE2.java b/java/com/google/re2j/RE2.java index a20bd4d2..980fbd71 100644 --- a/java/com/google/re2j/RE2.java +++ b/java/com/google/re2j/RE2.java @@ -23,6 +23,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Map; import java.util.Queue; /** @@ -116,6 +117,7 @@ class RE2 { // Accesses must be serialized using |this| monitor. // @GuardedBy("this") private final Queue machine = new ArrayDeque(); + public Map namedGroups; // This is visible for testing. 
RE2(String expr) { @@ -195,6 +197,7 @@ static RE2 compileImpl(String expr, int mode, boolean longest) throws PatternSyn if (!re2.prefix.isEmpty()) { re2.prefixRune = re2.prefix.codePointAt(0); } + re2.namedGroups = re.namedGroups; return re2; } diff --git a/java/com/google/re2j/Regexp.java b/java/com/google/re2j/Regexp.java index 26f0bbe3..a4f36947 100644 --- a/java/com/google/re2j/Regexp.java +++ b/java/com/google/re2j/Regexp.java @@ -8,6 +8,7 @@ package com.google.re2j; import java.util.Arrays; +import java.util.Map; /** * Regular expression abstract syntax tree. Produced by parser, used by compiler. NB, this @@ -56,6 +57,7 @@ boolean isPseudo() { int min, max; // min, max for REPEAT int cap; // capturing index, for CAPTURE String name; // capturing name, for CAPTURE + Map namedGroups; // map of group name -> capturing index // Do update copy ctor when adding new fields! Regexp(Op op) { @@ -72,6 +74,7 @@ boolean isPseudo() { this.max = that.max; this.cap = that.cap; this.name = that.name; + this.namedGroups = that.namedGroups; } void reinit() { diff --git a/javatests/com/google/re2j/MatcherTest.java b/javatests/com/google/re2j/MatcherTest.java index 7d624e9f..443009ac 100644 --- a/javatests/com/google/re2j/MatcherTest.java +++ b/javatests/com/google/re2j/MatcherTest.java @@ -431,4 +431,34 @@ public void testMutableCharSequence() { b.replace(b.indexOf("ban"), start + 3, "b"); assertTrue(m.find(b.indexOf("ban"))); } + + @Test + public void testNamedGroups() { + Pattern p = + Pattern.compile( + "(?Pf(?Pb*a(?Pr+)){0,10})" + "(?Pbag)?(?Pzzz)?"); + Matcher m = p.matcher("fbbarrrrrbag"); + assertTrue(m.matches()); + assertEquals("fbbarrrrr", m.group("baz")); + assertEquals("bbarrrrr", m.group("foo")); + assertEquals("rrrrr", m.group("another")); + assertEquals(0, m.start("baz")); + assertEquals(1, m.start("foo")); + assertEquals(4, m.start("another")); + assertEquals(9, m.end("baz")); + assertEquals(9, m.end("foo")); + assertEquals("bag", m.group("bag")); + 
assertEquals(9, m.start("bag")); + assertEquals(12, m.end("bag")); + assertEquals(null, m.group("nomatch")); + assertEquals(-1, m.start("nomatch")); + assertEquals(-1, m.end("nomatch")); + + try { + m.group("nonexistent"); + fail("Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // Expected + } + } } diff --git a/javatests/com/google/re2j/ParserTest.java b/javatests/com/google/re2j/ParserTest.java index 3536c3ee..2317ee66 100644 --- a/javatests/com/google/re2j/ParserTest.java +++ b/javatests/com/google/re2j/ParserTest.java @@ -528,6 +528,8 @@ private static String runesToString(int[] runes) { "(?i)[a-Z]", "a{100000}", "a{100000,}", + // Group names may not be repeated + "(?Pbar)(?Pbaz)", }; private static final String[] ONLY_PERL = {