Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

std.regex fix .* optimization issues (issue 6072) #66

Merged
merged 2 commits into from
May 31, 2011
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 8 additions & 1 deletion changelog.dd
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,26 @@ $(VERSION 053, ddd mm, 2011, =================================================,
$(LI Added unsigned to std.traits)
)
$(LIBBUGSFIXED
$(LI $(BUGZILLA 4367): std.regex: Captures is not a random access range)
$(LI $(BUGZILLA 4574): std.regex: breaks with empty string regex)
$(LI $(BUGZILLA 4644): assertExceptionThrown to assert that a particular exception was thrown)
$(LI $(BUGZILLA 4944): Missing tzname even though we have tzset)
$(LI $(BUGZILLA 5019): In std.regex, empty capture at end of string causes error)
$(LI $(BUGZILLA 5169): Add (?:) non-capturing parentheses group support to std.regex)
$(LI $(BUGZILLA 5451): Three ideas for RedBlackTree)
$(LI $(BUGZILLA 5474): unaryFun byRef is borked for custom parameter name)
$(LI $(BUGZILLA 5485): TLS sections handled incorrectly in FreeBSD)
$(LI $(BUGZILLA 5511): std.regex optional capture with no-match cause error)
$(LI $(BUGZILLA 5616): std.datetime: not cross-platform)
$(LI $(BUGZILLA 5654): BigInt returns ZERO with strings of single digit number with leading zeros)
$(LI $(BUGZILLA 5661): std.algorithm.move does not work on elaborate struct)
$(LI $(BUGZILLA 5661): std.algorithm.move does not work on elaborate struct)
$(LI $(BUGZILLA 5731): std.datetime.SysTime prints UTC offsets backwards)
$(LI $(BUGZILLA 5761): std.datetime: Date.this(int day) conversion fails for Dec 30 of leap years)
$(LI $(BUGZILLA 5780): [patch] std.traits.hasIndirections incorrectly handles static arrays)
$(LI $(BUGZILLA 5781): std.datetime: On Windows, times off by one hour in some years due to DST rule changes)
$(LI $(BUGZILLA 5794): std.datetime StopWatch (and perhaps benchmark) examples need a small fix)
$(LI $(BUGZILLA 5857): std.regex (...){n,m} is bogus when (...) contains repetitions)
$(LI $(BUGZILLA 6076): std.regex: "c.*|d" matches "mm")
)

)
173 changes: 139 additions & 34 deletions std/regex.d
Original file line number Diff line number Diff line change
Expand Up @@ -345,48 +345,149 @@ Returns the number of parenthesized captures
debug(std_regex) writefln("error: %s", msg);
throw new Exception(msg);
}
//adjust jumps, after removing instructions at 'place'
void fixup(ubyte[] prog, size_t place, uint change)
{
for (size_t pc=0;pc<prog.length;)
{
switch (prog[pc])
{
case REend:
return;

case REcounter: //jump forward
if(pc < place)
{
auto dest = cast(uint *)&prog[pc + 1 + uint.sizeof];
if (pc + *dest > place)
*dest -= change;
}
pc += 1 + 2*uint.sizeof;
break;

case REloop, REloopg: //jump back
if (pc > place)
{
auto dest = cast(uint *)&prog[pc + 1 + 2*uint.sizeof];
if (pc + *dest > place)
*dest += change;
}
pc += 1 + 3*uint.sizeof;
break;

case REneglookahead://jump or call forward
case RElookahead:
case REor:
case REgoto:
if (pc < place)
{
auto dest = cast(uint *)&prog[pc+1];
if (pc + *dest > place)
*dest -= change;
}
pc += 1 + uint.sizeof;
break;

case REret:
case REanychar:
case REanystarg:
case REanystar:
case REbol:
case REeol:
case REwordboundary:
case REnotwordboundary:
case REdigit:
case REnotdigit:
case REspace:
case REnotspace:
case REword:
case REnotword:
pc++;
break;

case REchar:
case REichar:
case REbackref:
pc += 2;
break;

case REdchar:
case REidchar:
pc += 1 + dchar.sizeof;
break;

case REstring:
case REistring:
auto len = *cast(size_t *)&prog[pc + 1];
assert(len % E.sizeof == 0);
pc += 1 + size_t.sizeof + len;
break;

case REtestbit:
case REbit:
case REnotbit:
auto pu = cast(ushort *)&prog[pc + 1];
auto len = pu[1];
pc += 1 + 2 * ushort.sizeof + len;
break;

case RErange:
case REnotrange:
auto len = *cast(uint *)&prog[pc + 1];
pc += 1 + uint.sizeof + len;
break;

case REsave:
pc += 1 + uint.sizeof;
break;

default:
writeln("%d",prog[pc]);
assert(0);
}
}
}
//Fixup counter numbers, simplify instructions
private void postprocess(ubyte[] prog)
{
uint counter = 0;
size_t len;
ushort* pu;
nCounters = 0;

size_t pc = 0;
for (;;)
{
switch (prog.front)
switch (prog[pc])
{
case REend:
return;

case REcounter:
size_t offs = 1 + 2*uint.sizeof;
size_t offs = pc + 1 + 2*uint.sizeof;
bool anyloop = counter == 0 && prog[offs] == REanychar
&& (prog[offs+1] == REloop || prog[offs+1] == REloopg);
uint* puint = cast(uint*)&prog[offs+2];
if (anyloop && puint[0] == 0 && puint[1] == inf)
{
prog[0] = prog[offs+1] == REloop ? REanystar : REanystarg;
std.array.replaceInPlace(
prog, 1,
2*(1 + uint.sizeof) + 1 + 3*uint.sizeof,
cast(ubyte[])[]);
prog.popFront();
prog[pc] = prog[offs+1] == REloop ? REanystar : REanystarg;
uint change = 2*(1 + uint.sizeof) + 1 + 3*uint.sizeof - 1;
std.array.replaceInPlace(prog, pc + 1,
pc + change + 1, cast(ubyte[])[]);
fixup(prog, pc, change);
pc++;
}
else
{
*cast(uint*)&prog[1] = counter;
*cast(uint *)&prog[pc+1] = counter;
counter++;
nCounters = max(nCounters, counter);
prog.popFrontN(1 + 2*uint.sizeof);
pc += 1 + 2*uint.sizeof;
}
break;

case REloop, REloopg:
counter--;
prog.popFrontN(1 + 3*uint.sizeof);
pc += 1 + 3*uint.sizeof;
break;

case REret:
Expand All @@ -401,57 +502,52 @@ Returns the number of parenthesized captures
case REnotspace:
case REword:
case REnotword:
prog.popFront();
pc++;
break;

case REbackref:
prog.popFrontN(2);
break;

case REchar:
case REichar:
prog.popFrontN(2);
pc += 2;
break;

case REdchar:
case REidchar:
prog.popFrontN(1+dchar.sizeof);
pc += 1 + dchar.sizeof;
break;

case REstring:
case REistring:
len = *cast(size_t *)&prog[1];
len = *cast(size_t *)&prog[pc+1];
assert(len % E.sizeof == 0);
prog.popFrontN(1 + size_t.sizeof + len);
pc += 1 + size_t.sizeof + len;
break;

case REtestbit:
case REbit:
case REnotbit:
pu = cast(ushort *)&prog[1];
pu = cast(ushort *)&prog[pc+1];
len = pu[1];
prog.popFrontN(1 + 2 * ushort.sizeof + len);
pc += 1 + 2 * ushort.sizeof + len;
break;

case RErange:
case REnotrange:
len = *cast(uint *)&prog[1];
prog.popFrontN(1 + uint.sizeof + len);
len = *cast(uint *)&prog[pc+1];
pc += 1 + uint.sizeof + len;
break;


case REneglookahead:
case RElookahead:
case REor:
case REgoto:
prog.popFrontN(1 + uint.sizeof);
pc += 1 + uint.sizeof;
break;

case REsave:
prog.popFrontN(1 + uint.sizeof);
pc += 1 + uint.sizeof;
break;

case REneglookahead:
case RElookahead:
prog.popFrontN(1 + uint.sizeof);
break;
default:
assert(0);
}
Expand Down Expand Up @@ -1459,7 +1555,7 @@ struct RegexMatch(Range = string)
// Engine
alias .Regex!(Unqual!E) Regex;
private alias Regex.regmatch_t regmatch_t;
enum stackSize = 640*1024;
enum stackSize = 32*1024;
/**
Get or set the engine of the match.
*/
Expand Down Expand Up @@ -1965,7 +2061,7 @@ Returns $(D hit) (converted to $(D string) if necessary).
auto stateSize = (counters.empty ? 0 : (curCounter+1)*uint.sizeof)
+ matchesToSave*regmatch_t.sizeof;
if (memory.length < lastState + stateSize + StateTail.sizeof)
memory.length += memory.length/2; //reallocates on heap
memory.length += memory.length; //reallocates on heap
auto matchPtr = cast(regmatch_t*)&memory[lastState];
matchPtr[0..matchesToSave] = pmatch[1..matchesToSave+1];
if (!counters.empty)
Expand Down Expand Up @@ -2240,7 +2336,7 @@ Returns $(D hit) (converted to $(D string) if necessary).
src = input.length;
else
{
auto p = memchr(&input[src],'\n', input.length-src);
auto p = memchr(input.ptr+src,'\n', input.length-src);
src = p ? p - &input[src] : input.length;
}
while (src > ss)
Expand Down Expand Up @@ -3489,3 +3585,12 @@ unittest
assert(pres == array(retro(heads)));
assert(posts == tails);
}

// Regression test for issue 6076 (the ".*" optimization):
// "c.*|d" must not match "mm".
unittest
{
    auto pattern = regex("c.*|d");
    assert(match("mm", pattern).empty);
}