Skip to content

Commit

Permalink
Merge pull request #4232 from DmitryOlshansky/regex-inline-flags
Browse files Browse the repository at this point in the history
[std.regex] Support for inline flags
  • Loading branch information
9il committed Apr 30, 2016
2 parents 0bd8254 + 680f690 commit f6f61f9
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 33 deletions.
48 changes: 37 additions & 11 deletions std/regex/internal/backtracking.d
Original file line number Diff line number Diff line change
Expand Up @@ -311,8 +311,7 @@ template BacktrackingMatcher(bool CTregex)
next();
break;
case IR.Any:
if (atEnd || (!(re.flags & RegexOption.singleline)
&& (front == '\r' || front == '\n')))
if (atEnd)
goto L_backtrack;
pc += IRL!(IR.Any);
next();
Expand Down Expand Up @@ -373,27 +372,37 @@ template BacktrackingMatcher(bool CTregex)
}
pc += IRL!(IR.Wordboundary);
break;
case IR.Bof:
    // Bof succeeds only at the very start of the input; unlike the Bol
    // handler it never looks back for a preceding line break.
    if (!atStart)
        goto L_backtrack;
    // Step pc by Bof's own length instead of borrowing Bol's.
    pc += IRL!(IR.Bof);
    break;
case IR.Bol:
dchar back;
DataIndex bi;
if (atStart)
pc += IRL!(IR.Bol);
else if ((re.flags & RegexOption.multiline)
&& s.loopBack(index).nextChar(back,bi)
else if (s.loopBack(index).nextChar(back,bi)
&& endOfLine(back, front == '\n'))
{
pc += IRL!(IR.Bol);
}
else
goto L_backtrack;
break;
case IR.Eof:
    // Eof succeeds only when the input is exhausted; unlike the Eol
    // handler it does not accept a position in front of a line break.
    if (!atEnd)
        goto L_backtrack;
    // Step pc by Eof's own length instead of borrowing Eol's.
    pc += IRL!(IR.Eof);
    break;
case IR.Eol:
dchar back;
DataIndex bi;
debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index..s.lastIndex]);
//no matching inside \r\n
if (atEnd || ((re.flags & RegexOption.multiline)
&& endOfLine(front, s.loopBack(index).nextChar(back,bi)
if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi)
&& back == '\r')))
{
pc += IRL!(IR.Eol);
Expand Down Expand Up @@ -1333,8 +1342,7 @@ struct CtContext
code ~= ctSub(`
dchar back;
DataIndex bi;
if (atStart || ((re.flags & RegexOption.multiline)
&& s.loopBack(index).nextChar(back,bi)
if (atStart || (s.loopBack(index).nextChar(back,bi)
&& endOfLine(back, front == '\n')))
{
debug(std_regex_matcher) writeln("BOL matched");
Expand All @@ -1344,22 +1352,40 @@ struct CtContext
$$`, nextInstr, bailOut);

break;
case IR.Bof:
// Append the generated matcher code for Bof: a single atStart test with no
// look-back, in contrast to the Bol snippet emitted above.
// NOTE(review): presumably ctSub replaces the two $$ placeholders in
// argument order (nextInstr for the success branch, bailOut for the else
// branch) - confirm against ctSub's definition.
code ~= ctSub(`
if (atStart)
{
debug(std_regex_matcher) writeln("BOF matched");
$$
}
else
$$`, nextInstr, bailOut);
break;
case IR.Eol:
code ~= ctSub(`
dchar back;
DataIndex bi;
debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index..s.lastIndex]);
//no matching inside \r\n
if (atEnd || ((re.flags & RegexOption.multiline)
&& endOfLine(front, s.loopBack(index).nextChar(back,bi)
if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi)
&& back == '\r')))
{
debug(std_regex_matcher) writeln("EOL matched");
$$
}
else
$$`, nextInstr, bailOut);

break;
case IR.Eof:
// Append the generated matcher code for Eof: a single atEnd test with no
// line-break handling, in contrast to the Eol snippet emitted above.
// The debug trace now says "EOF matched"; it previously printed
// "BOF matched", copied from the Bof case.
// NOTE(review): presumably ctSub replaces the two $$ placeholders in
// argument order (nextInstr for the success branch, bailOut for the else
// branch) - confirm against ctSub's definition.
code ~= ctSub(`
if (atEnd)
{
debug(std_regex_matcher) writeln("EOF matched");
$$
}
else
$$`, nextInstr, bailOut);
break;
case IR.GroupStart:
code ~= ctSub(`
Expand Down
13 changes: 7 additions & 6 deletions std/regex/internal/ir.d
Original file line number Diff line number Diff line change
Expand Up @@ -126,15 +126,17 @@ enum IR:uint {
OrChar = 0b1_00100_00,
Nop = 0b1_00101_00, //no operation (padding)
End = 0b1_00110_00, //end of program
Bol = 0b1_00111_00, //beginning of a string ^
Eol = 0b1_01000_00, //end of a string $
Bol = 0b1_00111_00, //beginning of a line ^
Eol = 0b1_01000_00, //end of a line $
Wordboundary = 0b1_01001_00, //boundary of a word
Notwordboundary = 0b1_01010_00, //not a word boundary
Backref = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index)
GroupStart = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit))
GroupEnd = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit))
Option = 0b1_01110_00, //start of an option within an alternation x | y (length)
GotoEndOr = 0b1_01111_00, //end of an option (length of the rest)
Bof = 0b1_10000_00, //beginning of "file" (string) ^
Eof = 0b1_10001_00, //end of "file" (string) $
//... any additional atoms here

OrStart = 0b1_00000_01, //start of alternation group (length)
Expand Down Expand Up @@ -531,17 +533,16 @@ package(std.regex):
//check if searching is not needed
void checkIfOneShot()
{
if (flags & RegexOption.multiline)
return;
L_CheckLoop:
for (uint i = 0; i < ir.length; i += ir[i].length)
{
switch (ir[i].code)
{
case IR.Bol:
case IR.Bof:
flags |= RegexInfo.oneShot;
break L_CheckLoop;
case IR.GroupStart, IR.GroupEnd, IR.Eol, IR.Wordboundary, IR.Notwordboundary:
case IR.GroupStart, IR.GroupEnd, IR.Bol, IR.Eol, IR.Eof,
IR.Wordboundary, IR.Notwordboundary:
break;
default:
break L_CheckLoop;
Expand Down
4 changes: 2 additions & 2 deletions std/regex/internal/kickstart.d
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ public:
case IR.GroupStart, IR.GroupEnd:
i += IRL!(IR.GroupStart);
break;
case IR.Bol, IR.Wordboundary, IR.Notwordboundary:
case IR.Bof, IR.Bol, IR.Wordboundary, IR.Notwordboundary:
i += IRL!(IR.Bol);
break;
default:
Expand Down Expand Up @@ -357,7 +357,7 @@ public:
case IR.GroupStart, IR.GroupEnd:
t.pc += IRL!(IR.GroupStart);
break;
case IR.Bol, IR.Wordboundary, IR.Notwordboundary:
case IR.Bof, IR.Bol, IR.Wordboundary, IR.Notwordboundary:
t.pc += IRL!(IR.Bol);
break;
case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
Expand Down
62 changes: 58 additions & 4 deletions std/regex/internal/parser.d
Original file line number Diff line number Diff line change
Expand Up @@ -781,7 +781,49 @@ struct Parser(R, Generator)
next();
break;
default:
error(" ':', '=', '<', 'P' or '!' expected after '(?' ");
uint enableFlags, disableFlags;
bool enable = true;
do
{
switch(current)
{
case 's':
if (enable)
enableFlags |= RegexOption.singleline;
else
disableFlags |= RegexOption.singleline;
break;
case 'x':
if (enable)
enableFlags |= RegexOption.freeform;
else
disableFlags |= RegexOption.freeform;
break;
case 'i':
if (enable)
enableFlags |= RegexOption.casefold;
else
disableFlags |= RegexOption.casefold;
break;
case 'm':
if (enable)
enableFlags |= RegexOption.multiline;
else
disableFlags |= RegexOption.multiline;
break;
case '-':
if (!enable)
error(" unexpected second '-' in flags");
enable = false;
break;
default:
error(" 's', 'x', 'i', 'm' or '-' expected after '(?' ");
}
next();
}while (current != ')');
next();
re_flags |= enableFlags;
re_flags &= ~disableFlags;
}
}
else
Expand Down Expand Up @@ -885,7 +927,13 @@ struct Parser(R, Generator)
error("'*', '+', '?', '{', '}' not allowed in atom");
break;
case '.':
g.put(Bytecode(IR.Any, 0));
if (re_flags & RegexOption.singleline)
g.put(Bytecode(IR.Any, 0));
else
{
CodepointSet set;
g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted);
}
next();
break;
case '[':
Expand All @@ -896,11 +944,17 @@ struct Parser(R, Generator)
parseEscape();
break;
case '^':
g.put(Bytecode(IR.Bol, 0));
if (re_flags & RegexOption.multiline)
g.put(Bytecode(IR.Bol, 0));
else
g.put(Bytecode(IR.Bof, 0));
next();
break;
case '$':
g.put(Bytecode(IR.Eol, 0));
if (re_flags & RegexOption.multiline)
g.put(Bytecode(IR.Eol, 0));
else
g.put(Bytecode(IR.Eof, 0));
next();
break;
default:
Expand Down
5 changes: 5 additions & 0 deletions std/regex/internal/tests.d
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,11 @@ unittest
//mixed lookaround
TestVectors( `a(?<=a(?=b))b`, "ab", "y", "$&", "ab"),
TestVectors( `a(?<=a(?!b))c`, "ac", "y", "$&", "ac"),
TestVectors( `a(?i)bc`, "aBc", "y", "$&", "aBc"),
TestVectors( `a(?i)bc`, "Abc", "n", "$&", "-"),
TestVectors( `(?i)a(?-i)bc`, "aBcAbc", "y", "$&", "Abc"),
TestVectors( `(?s).(?-s).`, "\n\n\na", "y", "$&", "\na"),
TestVectors( `(?m)^a(?-m)$`, "\na", "y", "$&", "a")
];
string produceExpected(M,String)(auto ref M m, String fmt)
{
Expand Down
45 changes: 35 additions & 10 deletions std/regex/internal/thompson.d
Original file line number Diff line number Diff line change
Expand Up @@ -167,15 +167,30 @@ template ThompsonOps(E, S, bool withInput:true)
return true;
}

static bool op(IR code:IR.Bof)(E* e, S* state)
{
    // Bof: the current thread survives only at the very start of the input.
    with(e) with(state)
    {
        // Anywhere else, give up on this thread and let popState decide
        // whether matching continues.
        if (!atStart)
            return popState(e);

        t.pc += IRL!(IR.Bof);
        return true;
    }
}

static bool op(IR code:IR.Bol)(E* e, S* state)
{
with(e) with(state)
{
dchar back;
DataIndex bi;
if (atStart
||( (re.flags & RegexOption.multiline)
&& s.loopBack(index).nextChar(back,bi)
||(s.loopBack(index).nextChar(back,bi)
&& startOfLine(back, front == '\n')))
{
t.pc += IRL!(IR.Bol);
Expand All @@ -188,16 +203,30 @@ template ThompsonOps(E, S, bool withInput:true)
}
}

static bool op(IR code:IR.Eof)(E* e, S* state)
{
    // Eof: the current thread survives only when the input is exhausted.
    with(e) with(state)
    {
        if (atEnd)
        {
            // Step by Eof's own length (was IRL!(IR.Eol)), matching how
            // the Bof op uses IRL!(IR.Bof).
            t.pc += IRL!(IR.Eof);
            return true;
        }
        else
        {
            // Not at the end: give up on this thread and let popState
            // decide whether matching continues.
            return popState(e);
        }
    }
}

static bool op(IR code:IR.Eol)(E* e, S* state)
{
with(e) with(state)
{
debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index..s.lastIndex]);
dchar back;
DataIndex bi;
//no matching inside \r\n
if (atEnd || ((re.flags & RegexOption.multiline)
&& endOfLine(front, s.loopBack(index).nextChar(back, bi)
if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back, bi)
&& back == '\r')))
{
t.pc += IRL!(IR.Eol);
Expand Down Expand Up @@ -598,11 +627,7 @@ template ThompsonOps(E, S, bool withInput:true)
with(e) with(state)
{
t.pc += IRL!(IR.Any);
if (!(re.flags & RegexOption.singleline)
&& (front == '\r' || front == '\n'))
recycle(t);
else
nlist.insertBack(t);
nlist.insertBack(t);
t = worklist.fetch();
return t != null;
}
Expand Down

0 comments on commit f6f61f9

Please sign in to comment.