233 changes: 134 additions & 99 deletions std/regex/internal/backtracking.d

Large diffs are not rendered by default.

741 changes: 741 additions & 0 deletions std/regex/internal/bitnfa.d

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions std/regex/internal/generator.d
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@ module std.regex.internal.generator;
case IR.CodepointSet:
case IR.Trie:
auto set = re.charsets[re.ir[pc].data];
auto x = rand(cast(uint)set.byInterval.length);
auto y = rand(set.byInterval[x].b - set.byInterval[x].a);
formattedWrite(app, "%s", cast(dchar)(set.byInterval[x].a+y));
auto x = rand(cast(uint)set.length);
auto y = rand(set[x].b - set[x].a);
formattedWrite(app, "%s", cast(dchar)(set[x].a+y));
pc += IRL!(IR.CodepointSet);
break;
case IR.Any:
Expand Down
220 changes: 168 additions & 52 deletions std/regex/internal/ir.d
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ module std.regex.internal.ir;

package(std.regex):

import std.exception, std.uni, std.meta, std.traits, std.range.primitives;
import std.exception, std.uni, std.meta, std.traits, std.typecons, std.range.primitives;

debug(std_regex_parser) import std.stdio;
// just a common trait, may be moved elsewhere
Expand All @@ -28,25 +28,6 @@ alias makeTrie = codepointSetTrie!(13, 8);

CharMatcher[CodepointSet] matcherCache;

//accessor with caching
@trusted CharMatcher getMatcher(CodepointSet set)
{// @@@BUG@@@ 6357 almost all properties of AA are not @safe
    // the cache is bypassed during CTFE and when caching is switched off
    if (__ctfe || maxCachedMatchers == 0)
        return CharMatcher(set);

    // cache hit: hand back the stored matcher
    if (auto cached = set in matcherCache)
        return *cached;

    // cache is full: drop every cached matcher before adding the new one
    if (matcherCache.length == maxCachedMatchers)
        matcherCache = null;

    return (matcherCache[set] = CharMatcher(set));
}

@trusted auto memoizeExpr(string expr)()
{
if (__ctfe)
Expand All @@ -63,17 +44,30 @@ CharMatcher[CodepointSet] matcherCache;
}

//property for \w character class
@property CodepointSet wordCharacter()
@property CodepointSet wordCharacter() pure
{
return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc
| unicode.Me | unicode.Nd | unicode.Pc")();
return unicode.Alphabetic | unicode.Mn | unicode.Mc
| unicode.Me | unicode.Nd | unicode.Pc;
}

// matcher for the \w character class, built as CharMatcher(wordCharacter)
// NOTE(review): memoizeExpr presumably caches the constructed value so the
// matcher is built only once outside of CTFE - confirm against memoizeExpr.
@property CharMatcher wordMatcher()
{
return memoizeExpr!("CharMatcher(wordCharacter)")();
}

// Linear membership test: returns true when `ch` lies inside one of the
// half-open intervals [a, b) of `ivals`, false otherwise.
package bool scanFor()(const(Interval)[] ivals, dchar ch)
{
    foreach (const ref iv; ivals)
    {
        // the early exit relies on the intervals being in ascending order:
        // once a lower bound is above ch no later interval can contain it
        if (ch < iv.a)
            return false;
        if (ch < iv.b)
            return true;
    }
    return false;
}

// some special Unicode white space characters
private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029';

Expand All @@ -97,7 +91,7 @@ enum RegexOption: uint {
//do not reorder this list
alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's');
static assert( RegexOption.max < 0x80);
// flags that allow guide execution of engine
// flags that guide execution of engine
enum RegexInfo : uint { oneShot = 0x80 }

// IR bit pattern: 0b1_xxxxx_yy
Expand Down Expand Up @@ -173,7 +167,8 @@ template IRL(IR code)
static assert (IRL!(IR.LookaheadStart) == 3);

//how many parameters follow the IR, should be optimized fixing some IR bits
int immediateParamsIR(IR i){
int immediateParamsIR(IR i) pure
{
switch (i)
{
case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd:
Expand All @@ -190,43 +185,43 @@ int immediateParamsIR(IR i){
}

//full length of IR instruction including all parameters that might follow it
int lengthOfIR(IR i)
int lengthOfIR(IR i) pure
{
return 1 + immediateParamsIR(i);
}

//full length of the paired IR instruction including all parameters that might follow it
int lengthOfPairedIR(IR i)
int lengthOfPairedIR(IR i) pure
{
return 1 + immediateParamsIR(pairedIR(i));
}

//if the operation has a merge point (this relies on the order of the ops)
bool hasMerge(IR i)
bool hasMerge(IR i) pure
{
return (i&0b11)==0b10 && i <= IR.RepeatQEnd;
}

//is an IR that opens a "group"
bool isStartIR(IR i)
bool isStartIR(IR i) pure
{
return (i&0b11)==0b01;
}

//is an IR that ends a "group"
bool isEndIR(IR i)
bool isEndIR(IR i) pure
{
return (i&0b11)==0b10;
}

//is a standalone IR
bool isAtomIR(IR i)
bool isAtomIR(IR i) pure
{
return (i&0b11)==0b00;
}

//makes respective pair out of IR i, swapping start/end bits of instruction
IR pairedIR(IR i)
IR pairedIR(IR i) pure
{
assert(isStartIR(i) || isEndIR(i));
return cast(IR)(i ^ 0b11);
Expand All @@ -235,6 +230,7 @@ IR pairedIR(IR i)
//encoded IR instruction
struct Bytecode
{
pure:
uint raw;
//natural constraints
enum maxSequence = 2+4;
Expand Down Expand Up @@ -381,7 +377,7 @@ struct Group(DataIndex)
}

//debugging tool, prints out instruction along with opcodes
@trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[])
@trusted pure string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[])
{
import std.array : appender;
import std.format : formattedWrite;
Expand Down Expand Up @@ -452,6 +448,129 @@ struct Group(DataIndex)
writeln("\t", disassemble(slice, pc, dict));
}

/+
Generic interface for kickstart engine components.
The goal of kickstart is to advance input to the next potential match,
the more accurate & fast the better.
+/
interface Kickstart(Char){
@trusted:
// Scan forward from the current position of `input` for a potential match.
// NOTE(review): in the ShiftOr implementation, true means `input` was moved
// to the candidate position and false means it was moved to the end of the
// input - confirm every implementation follows the same contract.
bool search(ref Input!Char input) const;
// NOTE(review): presumably tests for a potential match at the current
// position without scanning ahead; ShiftOr.match is still a stub that
// always returns false - confirm the intended contract.
bool match(ref Input!Char input) const;
// postprocess() replaces or drops a kickstart whose `empty` is true.
@property bool empty() const pure;
}

//basic stack, just in case it gets used anywhere else than Parser
@trusted struct Stack(T)
{
pure:
    T[] data; // backing storage, the last element is the top of the stack

    // true when no elements are stored
    @property bool empty(){ return data.length == 0; }

    // number of stored elements
    @property size_t length(){ return data.length; }

    // puts val on top of the stack
    void push(T val){ data ~= val; }

    // removes the top element and returns it; the stack must not be empty
    T pop()
    {
        assert(!empty);
        immutable last = data.length - 1;
        auto popped = data[last];
        data = data[0 .. last];
        //if (!__ctfe)
        //    cast(void)data.assumeSafeAppend();
        return popped;
    }

    // reference to the top element; the stack must not be empty
    @property ref T top()
    {
        assert(!empty);
        return data[data.length - 1];
    }
}

// Reverses the order of instructions in `code`, in place.
// The reversed program is assembled in a scratch array `rev` that is filled
// from its end towards its start (revPc is the exclusive upper bound of the
// next write) and is copied back over `code` at the very end.
// Alternation branches are not handled inline: each branch is queued on
// `stack` as (start, end, revPc) and processed by a later pass of the outer loop.
@trusted void reverseBytecode()(Bytecode[] code) pure
{
import std.typecons;
Bytecode[] rev = new Bytecode[code.length];
uint revPc = cast(uint)rev.length;
// pending work items: (first pc, end pc, reverse position) of a code range
Stack!(Tuple!(uint, uint, uint)) stack;
uint start = 0;
uint end = cast(uint)code.length;
for (;;)
{
for (uint pc = start; pc < end; )
{
uint len = code[pc].length;
if (code[pc].code == IR.GotoEndOr)
break; //pick next alternation branch
if (code[pc].isAtom)
{
// standalone instruction: copy it into the slot just below revPc
rev[revPc - len .. revPc] = code[pc .. pc + len];
revPc -= len;
pc += len;
}
else if (code[pc].isStart || code[pc].isEnd)
{
//skip over other embedded lookbehinds: they are reversed,
//so the whole block is copied verbatim
if (code[pc].code == IR.LookbehindStart
|| code[pc].code == IR.NeglookbehindStart)
{
uint blockLen = len + code[pc].data
+ code[pc].pairedLength;
rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen];
pc += blockLen;
revPc -= blockLen;
continue;
}
// emit the partner of the current instruction at the mirrored position
uint second = code[pc].indexOfPair(pc);
uint secLen = code[second].length;
rev[revPc - secLen .. revPc] = code[second .. second + secLen];
revPc -= secLen;
if (code[pc].code == IR.OrStart)
{
//we pass len bytes forward, but secLen in reverse
uint revStart = revPc - (second + len - secLen - pc);
// r walks the reversed copy in step with i walking the original:
// Option headers keep their relative layout inside the alternation
uint r = revStart;
uint i = pc + IRL!(IR.OrStart);
while (code[i].code == IR.Option)
{
// every Option except the first is preceded by the GotoEndOr
// that closes the previous branch; keep it right before the Option
if (code[i - 1].code != IR.OrStart)
{
assert(code[i - 1].code == IR.GotoEndOr);
rev[r - 1] = code[i - 1];
}
rev[r] = code[i];
auto newStart = i + IRL!(IR.Option);
auto newEnd = newStart + code[i].data;
auto newRpc = r + code[i].data + IRL!(IR.Option);
// a branch that is not followed by OrEnd gives up its last slot:
// that slot receives the GotoEndOr on the next iteration
if (code[newEnd].code != IR.OrEnd)
{
newRpc--;
}
// the branch body itself is reversed later, by the outer loop
stack.push(tuple(newStart, newEnd, newRpc));
r += code[i].data + IRL!(IR.Option);
i += code[i].data + IRL!(IR.Option);
}
// continue at OrEnd; its partner OrStart is then written just below revStart
pc = i;
revPc = revStart;
assert(code[pc].code == IR.OrEnd);
}
else
pc += len;
}
}
// take the next queued alternation branch, or finish when none is left
if (stack.empty)
break;
start = stack.top[0];
end = stack.top[1];
revPc = stack.top[2];
stack.pop();
}
code[] = rev[];
}

package alias Interval = Tuple!(uint,"a",uint, "b");

/++
$(D Regex) object holds regular expression pattern in compiled form.
Instances of this object are constructed via calls to $(D regex).
Expand All @@ -460,11 +579,7 @@ struct Group(DataIndex)
+/
struct Regex(Char)
{
//temporary workaround for identifier lookup
CodepointSet[] charsets; //
Bytecode[] ir; //compiled bytecode of pattern


pure:
@safe @property bool empty() const nothrow { return ir is null; }

@safe @property auto namedCaptures()
Expand Down Expand Up @@ -513,15 +628,16 @@ struct Regex(Char)
}

package(std.regex):
import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
Bytecode[] ir; // compiled bytecode of pattern
NamedGroup[] dict; // maps name -> user group number
uint ngroup; // number of internal groups
uint maxCounterDepth; // max depth of nested {n,m} repetitions
uint hotspotTableSize; // number of entries in merge table
uint threadCount; // upper bound on number of Thompson VM threads
uint flags; // global regex flags
public const(CharMatcher)[] matchers; // tables that represent character sets
public const(BitTable)[] filters; // bloom filters for conditional loops
Interval[][] charsets; // intervals of characters
const(CharMatcher)[] matchers; // tables that represent character sets
const(BitTable)[] filters; // bloom filters for conditional loops
uint[] backrefed; // bit array of backreferenced submatches
Kickstart!Char kickstart;

Expand Down Expand Up @@ -558,10 +674,10 @@ package(std.regex):
{//@@@BUG@@@ write is system
for (uint i = 0; i < ir.length; i += ir[i].length)
{
writefln("%d\t%s ", i, disassemble(ir, i, dict));
debug(std_regex_parser) writefln("%d\t%s ", i, disassemble(ir, i, dict));
}
writeln("Total merge table size: ", hotspotTableSize);
writeln("Max counter nesting depth: ", maxCounterDepth);
debug(std_regex_parser) writeln("Total merge table size: ", hotspotTableSize);
debug(std_regex_parser) writeln("Max counter nesting depth: ", maxCounterDepth);
}

}
Expand All @@ -577,11 +693,10 @@ package(std.regex):
public:
Regex!Char _regex;
alias _regex this;
this(Regex!Char re, MatchFn fn)
this(immutable Regex!Char re, MatchFn fn) immutable
{
_regex = re;
nativeFn = fn;

}

}
Expand Down Expand Up @@ -622,10 +737,10 @@ struct Input(Char)
@property bool atEnd(){
return _index == _origin.length;
}
bool search(Kickstart)(ref Kickstart kick, ref dchar res, ref size_t pos)

bool search(const Kickstart!Char kick, ref dchar res, ref size_t pos)
{
size_t idx = kick.search(_origin, _index);
_index = idx;
kick.search(this);
return nextChar(res, pos);
}

Expand Down Expand Up @@ -705,8 +820,8 @@ template BackLooper(E)
}

//
@trusted uint lookupNamedGroup(String)(NamedGroup[] dict, String name)
{//equal is @system?
@safe uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name)
{
import std.range : assumeSorted;
import std.conv : text;
import std.algorithm.iteration : map;
Expand Down Expand Up @@ -742,6 +857,7 @@ public class RegexException : Exception

// simple 128-entry bit-table used with a hash function
struct BitTable {
pure:
uint[4] filter;

this(CodepointSet set){
Expand Down Expand Up @@ -770,7 +886,7 @@ struct BitTable {
struct CharMatcher {
BitTable ascii; // fast path for ASCII
Trie trie; // slow path for Unicode

pure:
this(CodepointSet set)
{
auto asciiSet = set & unicode.ASCII;
Expand Down
154 changes: 30 additions & 124 deletions std/regex/internal/parser.d
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
*/
module std.regex.internal.parser;

import std.regex.internal.ir;
import std.regex.internal.ir, std.regex.internal.shiftor,
std.regex.internal.bitnfa;
import std.range.primitives, std.uni, std.meta,
std.traits, std.typecons, std.exception;
std.traits, std.typecons, std.exception, std.range;
static import std.ascii;

// package relevant info from parser into a regex object
auto makeRegex(S, CG)(Parser!(S, CG) p)
auto makeRegex(S, CG)(Parser!(S, CG) p) pure
{
Regex!(BasicElementOf!S) re;
auto g = p.g;
Expand All @@ -21,7 +22,10 @@ auto makeRegex(S, CG)(Parser!(S, CG) p)
ngroup = g.ngroup;
maxCounterDepth = g.counterDepth;
flags = p.re_flags;
charsets = g.charsets;
charsets = g.charsets
.map!(x =>
x.byInterval.map!(x=>Interval(x.a,x.b)).array
).array;
matchers = g.matchers;
backrefed = g.backrefed;
re.postprocess();
Expand Down Expand Up @@ -76,87 +80,6 @@ unittest
assert(nc.equal(cp[1 .. $ - 1]));
}


// Reverses the order of instructions in `code`, in place.
// The reversed program is assembled in a scratch array `rev` that is filled
// from its end towards its start (revPc is the exclusive upper bound of the
// next write) and is copied back over `code` at the very end.
// Alternation branches are not handled inline: each branch is queued on
// `stack` as (start, end, revPc) and processed by a later pass of the outer loop.
@trusted void reverseBytecode()(Bytecode[] code)
{
Bytecode[] rev = new Bytecode[code.length];
uint revPc = cast(uint)rev.length;
// pending work items: (first pc, end pc, reverse position) of a code range
Stack!(Tuple!(uint, uint, uint)) stack;
uint start = 0;
uint end = cast(uint)code.length;
for (;;)
{
for (uint pc = start; pc < end; )
{
immutable len = code[pc].length;
if (code[pc].code == IR.GotoEndOr)
break; //pick next alternation branch
if (code[pc].isAtom)
{
// standalone instruction: copy it into the slot just below revPc
rev[revPc - len .. revPc] = code[pc .. pc + len];
revPc -= len;
pc += len;
}
else if (code[pc].isStart || code[pc].isEnd)
{
//skip over other embedded lookbehinds: they are reversed,
//so the whole block is copied verbatim
if (code[pc].code == IR.LookbehindStart
|| code[pc].code == IR.NeglookbehindStart)
{
immutable blockLen = len + code[pc].data
+ code[pc].pairedLength;
rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen];
pc += blockLen;
revPc -= blockLen;
continue;
}
// emit the partner of the current instruction at the mirrored position
immutable second = code[pc].indexOfPair(pc);
immutable secLen = code[second].length;
rev[revPc - secLen .. revPc] = code[second .. second + secLen];
revPc -= secLen;
if (code[pc].code == IR.OrStart)
{
//we pass len bytes forward, but secLen in reverse
immutable revStart = revPc - (second + len - secLen - pc);
// r walks the reversed copy in step with i walking the original:
// Option headers keep their relative layout inside the alternation
uint r = revStart;
uint i = pc + IRL!(IR.OrStart);
while (code[i].code == IR.Option)
{
// every Option except the first is preceded by the GotoEndOr
// that closes the previous branch; keep it right before the Option
if (code[i - 1].code != IR.OrStart)
{
assert(code[i - 1].code == IR.GotoEndOr);
rev[r - 1] = code[i - 1];
}
rev[r] = code[i];
auto newStart = i + IRL!(IR.Option);
auto newEnd = newStart + code[i].data;
auto newRpc = r + code[i].data + IRL!(IR.Option);
// a branch that is not followed by OrEnd gives up its last slot:
// that slot receives the GotoEndOr on the next iteration
if (code[newEnd].code != IR.OrEnd)
{
newRpc--;
}
// the branch body itself is reversed later, by the outer loop
stack.push(tuple(newStart, newEnd, newRpc));
r += code[i].data + IRL!(IR.Option);
i += code[i].data + IRL!(IR.Option);
}
// continue at OrEnd; its partner OrStart is then written just below revStart
pc = i;
revPc = revStart;
assert(code[pc].code == IR.OrEnd);
}
else
pc += len;
}
}
// take the next queued alternation branch, or finish when none is left
if (stack.empty)
break;
start = stack.top[0];
end = stack.top[1];
revPc = stack.top[2];
stack.pop();
}
code[] = rev[];
}

//test if a given string starts with hex number of maxDigit that's a valid codepoint
//returns its value and skips these maxDigit chars on success, throws on failure
dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit)
Expand All @@ -181,7 +104,7 @@ dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit)
return val;
}

@system unittest //BUG canFind is system
@safe unittest
{
import std.algorithm.searching : canFind;
string[] non_hex = [ "000j", "000z", "FffG", "0Z"];
Expand Down Expand Up @@ -211,7 +134,7 @@ auto caseEnclose(CodepointSet set)
/+
fetch codepoint set corresponding to a name (InBlock or binary property)
+/
@trusted CodepointSet getUnicodeSet(in char[] name, bool negated, bool casefold)
@trusted CodepointSet getUnicodeSet(in char[] name, bool negated, bool casefold) pure
{
CodepointSet s = unicode(name);
//FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
Expand All @@ -222,35 +145,9 @@ auto caseEnclose(CodepointSet set)
return s;
}

//basic stack, just in case it gets used anywhere else than Parser
@trusted struct Stack(T)
{
    T[] data; // backing storage, the last element is the top of the stack

    // true when no elements are stored
    @property bool empty(){ return data.length == 0; }

    // number of stored elements
    @property size_t length(){ return data.length; }

    // puts val on top of the stack
    void push(T val){ data ~= val; }

    // removes the top element and returns it; the stack must not be empty
    T pop()
    {
        assert(!empty);
        immutable last = data.length - 1;
        auto popped = data[last];
        data = data[0 .. last];
        // mark the shrunken slice as safe to append to in place (skipped during CTFE)
        if (!__ctfe)
            cast(void)data.assumeSafeAppend();
        return popped;
    }

    // reference to the top element; the stack must not be empty
    @property ref T top()
    {
        assert(!empty);
        return data[data.length - 1];
    }
}

struct CodeGen
{
pure:
Bytecode[] ir; // resulting bytecode
Stack!(uint) fixupStack; // stack of opened start instructions
NamedGroup[] dict; // maps name -> user group number
Expand Down Expand Up @@ -335,7 +232,7 @@ struct CodeGen
}
if (ivals.length*2 > maxCharsetUsed)
{
auto t = getMatcher(set);
auto t = CharMatcher(set);
put(Bytecode(IR.Trie, cast(uint)matchers.length));
matchers ~= t;
debug(std_regex_allocation) writeln("Trie generated");
Expand Down Expand Up @@ -616,6 +513,7 @@ enum infinite = ~0u;
struct Parser(R, Generator)
if (isForwardRange!R && is(ElementType!R : dchar))
{
pure:
dchar _current;
bool empty;
R pat, origin; //keep full pattern for pretty printing error messages
Expand Down Expand Up @@ -728,8 +626,6 @@ struct Parser(R, Generator)

while (!empty)
{
debug(std_regex_parser)
__ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data);
switch (current)
{
case '(':
Expand Down Expand Up @@ -1484,11 +1380,13 @@ struct Parser(R, Generator)
if (current >= privateUseStart && current <= privateUseEnd)
{
g.endPattern(current - privateUseStart + 1);
break;
}
auto op = Bytecode(IR.Char, current);
else
{
auto op = Bytecode(IR.Char, current);
g.put(op);
}
next();
g.put(op);
}
}

Expand Down Expand Up @@ -1542,7 +1440,7 @@ struct Parser(R, Generator)
/+
Postprocess the IR, then optimize.
+/
@trusted void postprocess(Char)(ref Regex!Char zis)
@trusted void postprocess(Char)(ref Regex!Char zis) pure
{//@@@BUG@@@ write is @system
with(zis)
{
Expand Down Expand Up @@ -1604,7 +1502,15 @@ struct Parser(R, Generator)
}
checkIfOneShot();
if (!(flags & RegexInfo.oneShot))
kickstart = Kickstart!Char(zis, new uint[](256));
{
kickstart = new ShiftOr!Char(zis);
if (kickstart.empty)
{
kickstart = new BitMatcher!Char(zis);
if (kickstart.empty)
kickstart = null;
}
}
debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount);
optimize(zis);
}
Expand Down Expand Up @@ -1654,7 +1560,7 @@ void fixupBytecode()(Bytecode[] ir)
assert(fixups.empty);
}

void optimize(Char)(ref Regex!Char zis)
void optimize(Char)(ref Regex!Char zis) pure
{
import std.array : insertInPlace;
CodepointSet nextSet(uint idx)
Expand All @@ -1671,7 +1577,7 @@ void optimize(Char)(ref Regex!Char zis)
goto default;
//TODO: OrChar
case Trie, CodepointSet:
set = zis.charsets[ir[i].data];
set = .CodepointSet(zis.charsets[ir[i].data]);
goto default;
case GroupStart,GroupEnd:
break;
Expand Down
207 changes: 105 additions & 102 deletions std/regex/internal/kickstart.d → std/regex/internal/shiftor.d
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
/*
Kickstart is a coarse-grained "filter" engine that finds likely matches
to be verified by full-blown matcher.
ShiftOr is a kickstart engine, a coarse-grained "filter" engine that finds
potential matches to be verified by a full-blown matcher.
*/
module std.regex.internal.kickstart;
module std.regex.internal.shiftor;

package(std.regex):

Expand All @@ -26,9 +26,10 @@ uint effectiveSize(Char)()
Kickstart engine using ShiftOr algorithm,
a bit parallel technique for inexact string searching.
*/
struct ShiftOr(Char)
class ShiftOr(Char) : Kickstart!Char
{
private:
pure:
uint[] table;
uint fChar;
uint n_length;
Expand Down Expand Up @@ -115,8 +116,8 @@ private:
{
auto t = worklist[$-1];
worklist.length -= 1;
if (!__ctfe)
cast(void)worklist.assumeSafeAppend();
//if (!__ctfe)
// cast(void)worklist.assumeSafeAppend();
return t;
}

Expand All @@ -127,13 +128,13 @@ private:
}

public:
@trusted this(ref Regex!Char re, uint[] memory)
@trusted this(ref Regex!Char re)
{
static import std.algorithm.comparison;
import std.algorithm.searching : countUntil;
import std.conv : text;
import std.range : assumeSorted;
assert(memory.length == 256);
uint[] memory = new uint[256];
fChar = uint.max;
// FNV-1a flavored hash (uses 32bits at a time)
ulong hash(uint[] tab)
Expand Down Expand Up @@ -241,9 +242,9 @@ public:
static immutable codeBounds = [0x0, 0x7F, 0x80, 0x7FF, 0x800, 0xFFFF, 0x10000, 0x10FFFF];
else //== 2
static immutable codeBounds = [0x0, 0xFFFF, 0x10000, 0x10FFFF];
uint[] arr = new uint[set.byInterval.length * 2];
uint[] arr = new uint[set.length * 2];
size_t ofs = 0;
foreach (ival; set.byInterval)
foreach (ival; set)
{
arr[ofs++] = ival.a;
arr[ofs++] = ival.b;
Expand All @@ -262,7 +263,8 @@ public:
auto chars = set.length;
if (chars > charsetThreshold)
goto L_StopThread;
foreach (ch; set.byCodepoint)
foreach (ival; set)
foreach (ch; ival.a..ival.b)
{
//avoid surrogate pairs
if (0xD800 <= ch && ch <= 0xDFFF)
Expand Down Expand Up @@ -339,25 +341,6 @@ public:
t.pc += IRL!(IR.RepeatEnd);
}
break;
case IR.InfiniteStart, IR.InfiniteQStart:
t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart);
goto case IR.InfiniteEnd; //both Q and non-Q
case IR.InfiniteEnd:
case IR.InfiniteQEnd:
auto slot = re.ir[t.pc+1].raw+t.counter;
auto val = hash(t.tab);
if (val in merge[slot])
goto L_StopThread; // merge equivalent
merge[slot][val] = true;
uint len = re.ir[t.pc].data;
uint pc1, pc2; //branches to take in priority order
if (++t.hops == 32)
goto L_StopThread;
pc1 = t.pc + IRL!(IR.InfiniteEnd);
pc2 = t.pc - len;
trs ~= fork(t, pc2, t.counter);
t.pc = pc1;
break;
case IR.GroupStart, IR.GroupEnd:
t.pc += IRL!(IR.GroupStart);
break;
Expand All @@ -370,7 +353,6 @@ public:
default:
L_StopThread:
assert(re.ir[t.pc].code >= 0x80, text(re.ir[t.pc].code));
debug (fred_search) writeln("ShiftOr stumbled on ",re.ir[t.pc].mnemonic);
n_length = std.algorithm.comparison.min(t.idx, n_length);
break L_Eval_Thread;
}
Expand All @@ -385,22 +367,23 @@ public:
}
}

@property bool empty() const { return n_length == 0; }
final @property bool empty() const { return n_length < 3 && fChar == uint.max; }

@property uint length() const{ return n_length/charSize; }
final @property uint length() const{ return n_length/charSize; }

// lookup compatible bit pattern in haystack, return starting index
// has a useful trait: if supplied with valid UTF indexes,
// returns only valid UTF indexes
// (that given the haystack in question is valid UTF string)
@trusted size_t search(const(Char)[] haystack, size_t idx)
final @trusted bool search(ref Input!Char s) const
{//@BUG: apparently assumes little endian machines
import std.conv : text;
import core.stdc.string : memchr;
assert(!empty);
auto p = cast(const(ubyte)*)(haystack.ptr+idx);
auto haystack = s._origin;
uint state = uint.max;
uint limit = 1u<<(n_length - 1u);
auto p = cast(const(ubyte)*)(haystack.ptr+s._index);
debug(std_regex_search) writefln("Limit: %32b",limit);
if (fChar != uint.max)
{
Expand All @@ -415,11 +398,17 @@ public:
assert(p <= end, text(p," vs ", end));
p = cast(ubyte*)memchr(p, fChar, end - p);
if (!p)
return haystack.length;
{
s._index = haystack.length;
return false;
}
if ((cast(size_t)p & (Char.sizeof-1)) == orginalAlign)
break;
if (++p == end)
return haystack.length;
{
s._index = haystack.length;
return false;
}
}
state = ~1u;
assert((cast(size_t)p & (Char.sizeof-1)) == orginalAlign);
Expand All @@ -433,8 +422,10 @@ public:
p++;
//first char is tested, see if that's all
if (!(state & limit))
return (p-cast(ubyte*)haystack.ptr)/Char.sizeof
-length;
{
s._index = (p-cast(ubyte*)haystack.ptr)/Char.sizeof-length;
return true;
}
}
else
{//have some bits/states for possible matches,
Expand All @@ -452,8 +443,10 @@ public:
p++;
}
if (!(state & limit))
return (p-cast(ubyte*)haystack.ptr)/Char.sizeof
-length;
{
s._index = (p-cast(ubyte*)haystack.ptr)/Char.sizeof-length;
return true;
}
}
debug(std_regex_search) writefln("State: %32b", state);
}
Expand All @@ -471,8 +464,10 @@ public:
state = (state<<1) | table[p[2]];
p += 4;
if (!(state & limit))//division rounds down for dchar
return (p-cast(ubyte*)haystack.ptr)/Char.sizeof
-length;
{
s._index = (p-cast(ubyte*)haystack.ptr)/Char.sizeof-length;
return true;
}
}
}
else
Expand All @@ -483,97 +478,105 @@ public:
{
state = (state<<1) | table[p[i++]];
if (!(state & limit))
return idx+i/Char.sizeof-length;
{
s._index += i/Char.sizeof-length;
return true;
}
}
while (i < len)
{
state = (state<<1) | table[p[i++]];
if (!(state & limit))
return idx+i/Char.sizeof
-length;
{
s._index += i/Char.sizeof-length;
return true;
}
state = (state<<1) | table[p[i++]];
if (!(state & limit))
return idx+i/Char.sizeof
-length;
{
s._index += i/Char.sizeof-length;
return true;
}
debug(std_regex_search) writefln("State: %32b", state);
}
}
}
return haystack.length;
s._index = haystack.length;
return false;
}

final @trusted bool match(ref Input!Char s) const
{
//TODO: stub
return false;
}

@system debug static void dump(uint[] table)
{//@@@BUG@@@ writef(ln) is @system
import std.stdio : writefln;
for (size_t i = 0; i < table.length; i += 4)
{
writefln("%32b %32b %32b %32b",table[i], table[i+1], table[i+2], table[i+3]);
debug writefln("%32b %32b %32b %32b",table[i], table[i+1], table[i+2], table[i+3]);
}
}
}

unittest
{
import std.conv, std.regex;
@trusted void test_fixed(alias Kick)()
auto shiftOrLength(C)(const(C)[] pat, uint length)
{
foreach (i, v; AliasSeq!(char, wchar, dchar))
auto r = regex(pat, "s");
auto kick = new ShiftOr!C(r);
assert(kick.length == length, text(C.stringof, " == ", kick.length));
return kick;
}
auto searches(C)(const (C)[] source, ShiftOr!C kick, uint[] results...)
{
auto inp = Input!C(source);
foreach (r; results)
{
alias Char = v;
alias String = immutable(v)[];
auto r = regex(to!String(`abc$`));
auto kick = Kick!Char(r, new uint[256]);
assert(kick.length == 3, text(Kick.stringof," ",v.stringof, " == ", kick.length));
auto r2 = regex(to!String(`(abc){2}a+`));
kick = Kick!Char(r2, new uint[256]);
assert(kick.length == 7, text(Kick.stringof,v.stringof," == ", kick.length));
auto r3 = regex(to!String(`\b(a{2}b{3}){2,4}`));
kick = Kick!Char(r3, new uint[256]);
assert(kick.length == 10, text(Kick.stringof,v.stringof," == ", kick.length));
auto r4 = regex(to!String(`\ba{2}c\bxyz`));
kick = Kick!Char(r4, new uint[256]);
assert(kick.length == 6, text(Kick.stringof,v.stringof, " == ", kick.length));
auto r5 = regex(to!String(`\ba{2}c\b`));
kick = Kick!Char(r5, new uint[256]);
size_t x = kick.search("aabaacaa", 0);
assert(x == 3, text(Kick.stringof,v.stringof," == ", kick.length));
x = kick.search("aabaacaa", x+1);
assert(x == 8, text(Kick.stringof,v.stringof," == ", kick.length));
kick.search(inp);
dchar ch;
size_t idx;
assert(inp._index == r, text(inp._index, " vs ", r));
inp.nextChar(ch, idx);
}
}
@trusted void test_flex(alias Kick)()


foreach (i, Char; AliasSeq!(char, wchar, dchar))
{
foreach (i, v; AliasSeq!(char, wchar, dchar))
{
alias Char = v;
alias String = immutable(v)[];
auto r = regex(to!String(`abc[a-z]`));
auto kick = Kick!Char(r, new uint[256]);
auto x = kick.search(to!String("abbabca"), 0);
assert(x == 3, text("real x is ", x, " ",v.stringof));
alias String = immutable(Char)[];
shiftOrLength(`abc`.to!String, 3);
shiftOrLength(`abc$`.to!String, 3);
shiftOrLength(`(abc){2}a+`.to!String, 7);
shiftOrLength(`\b(a{2}b{3}){2,4}`.to!String, 10);
shiftOrLength(`\ba{2}c\bxyz`.to!String, 6);
auto kick = shiftOrLength(`\ba{2}c\b`.to!String, 3);
auto inp = Input!Char("aabaacaa");
assert(kick.search(inp));
assert(inp._index == 3, text(Char.stringof," == ", kick.length));
dchar ch;
size_t idx;
inp.nextChar(ch, idx);
assert(!kick.search(inp));
assert(inp._index == 8, text(Char.stringof," == ", kick.length));
}

auto r2 = regex(to!String(`(ax|bd|cdy)`));
String s2 = to!String("abdcdyabax");
kick = Kick!Char(r2, new uint[256]);
x = kick.search(s2, 0);
assert(x == 1, text("real x is ", x));
x = kick.search(s2, x+1);
assert(x == 3, text("real x is ", x));
x = kick.search(s2, x+1);
assert(x == 8, text("real x is ", x));
auto rdot = regex(to!String(`...`));
kick = Kick!Char(rdot, new uint[256]);
assert(kick.length == 0);
auto rN = regex(to!String(`a(b+|c+)x`));
kick = Kick!Char(rN, new uint[256]);
assert(kick.length == 3, to!string(kick.length));
assert(kick.search("ababx",0) == 2);
assert(kick.search("abaacba",0) == 3);//expected inexact
foreach (i, Char; AliasSeq!(char, wchar, dchar))
{
alias String = immutable(Char)[];
auto kick = shiftOrLength(`abc[a-z]`.to!String, 4);
searches("abbabca".to!String, kick, 3);
kick = shiftOrLength(`(axx|bdx|cdy)`.to!String, 3);
searches("abdcdxabax".to!String, kick, 3);

}
shiftOrLength(`...`.to!String, 0);
kick = shiftOrLength(`a(b{1,2}|c{1,2})x`.to!String, 3);
searches("ababx".to!String, kick, 2);
searches("abaacba".to!String, kick, 3); //expected inexact
}
test_fixed!(ShiftOr)();
test_flex!(ShiftOr)();

}

alias Kickstart = ShiftOr;
584 changes: 8 additions & 576 deletions std/regex/internal/tests.d

Large diffs are not rendered by default.

268 changes: 268 additions & 0 deletions std/regex/internal/tests2.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
/*
Regular expressions package test suite part 2.
*/
module std.regex.internal.tests2;

package(std.regex):

import std.algorithm, std.conv, std.exception, std.meta, std.range,
std.typecons, std.regex;

import std.regex.internal.parser : Escapables; // characters that need escaping


// ctRegex + bmatch: nested counted repetitions, with a greedy outer
// repetition (char pattern) and a lazy outer repetition (wchar pattern)
unittest
{
auto cr5 = ctRegex!("(?:a{2,4}b{1,3}){1,2}");
assert(bmatch("aaabaaaabbb", cr5).hit == "aaabaaaabbb");
auto cr6 = ctRegex!("(?:a{2,4}b{1,3}){1,2}?"w);
assert(bmatch("aaabaaaabbb"w, cr6).hit == "aaab"w);
}

// ctRegex + bmatch: lazy `.*?` combined with the "s" and "m" flags,
// and lazy matching that must stop at the first closing tag
unittest
{
auto cr7 = ctRegex!(`\r.*?$`,"sm");
assert(bmatch("abc\r\nxy", cr7).hit == "\r\nxy");
// note: despite the variable name this pattern uses the lazy `.*?`
auto greed = ctRegex!("<packet.*?/packet>");
assert(bmatch("<packet>text</packet><packet>text</packet>", greed).hit
== "<packet>text</packet>");
}

// ctRegex captures: anchored pattern with an optional group,
// and a capture group repeated under `*` (the last iteration is reported)
unittest
{
auto cr8 = ctRegex!("^(a)(b)?(c*)");
auto m8 = bmatch("abcc",cr8);
assert(m8);
assert(m8.captures[1] == "a");
assert(m8.captures[2] == "b");
assert(m8.captures[3] == "cc");
auto cr9 = ctRegex!("q(a|b)*q");
auto m9 = match("xxqababqyy",cr9);
assert(m9);
assert(equal(bmatch("xxqababqyy",cr9).captures, ["qababq", "b"]));
}

// Two regex objects built from the same pattern must have identical bytecode.
// NOTE(review): the names (rtr/ctr, testRT/testCT) and the comment below
// suggest a run-time vs CTFE comparison, but a `const` local does not force
// CTFE, so both objects appear to be built at run time - confirm whether
// `enum`/`static` initialization was intended here.
unittest
{
auto rtr = regex("a|b|c");
const ctr = regex("a|b|c");
assert(equal(rtr.ir,ctr.ir));
//CTFE parser BUG is triggered by group
//in the middle of alternation (at least not first and not last)
const testCT = regex(`abc|(edf)|xyz`);
auto testRT = regex(`abc|(edf)|xyz`);
assert(equal(testCT.ir,testRT.ir));
}

// ctRegex objects stored as `immutable` must be usable for matching;
// also covers the "i" flag and global multi-line matching with `$`
unittest
{
// alternation inside a capture group
immutable cx = ctRegex!"(A|B|C)";
auto mx = match("B",cx);
assert(mx);
assert(equal(mx.captures, [ "B", "B"]));
immutable cx2 = ctRegex!"(A|B)*";
assert(match("BAAA",cx2));

// case-insensitive counted repetition
immutable cx3 = ctRegex!("a{3,4}","i");
auto mx3 = match("AaA",cx3);
assert(mx3);
assert(mx3.captures[0] == "AaA");
// lazy counted repetition followed by a character class
immutable cx4 = ctRegex!(`^a{3,4}?[a-zA-Z0-9~]{1,2}`,"i");
auto mx4 = match("aaaabc", cx4);
assert(mx4);
assert(mx4.captures[0] == "aaaab");
// optional group and possibly-empty trailing group
auto cr8 = ctRegex!("(a)(b)?(c*)");
auto m8 = bmatch("abcc",cr8);
assert(m8);
assert(m8.captures[1] == "a");
assert(m8.captures[2] == "b");
assert(m8.captures[3] == "cc");
// "gm": `$` matches before the \r, yielding an empty hit between the lines
auto cr9 = ctRegex!(".*$", "gm");
auto m9 = match("First\rSecond", cr9);
assert(m9);
assert(equal(map!"a.hit"(m9), ["First", "", "Second"]));
}

// Global ("g") matching, free-form ("x") patterns, lookbehind and multi-line
// ("m") anchors, run once with each matching front end (bmatch and match).
unittest
{
    //global matching
    void test_body(alias matchFn)()
    {
        // every word of the sentence must be reported, in order
        string s = "a quick brown fox jumps over a lazy dog";
        auto r1 = regex("\\b[a-z]+\\b","g");
        string[] test;
        foreach (m; matchFn(s, r1))
            test ~= m.hit;
        assert(equal(test, [ "a", "quick", "brown", "fox", "jumps", "over", "a", "lazy", "dog"]));
        // free-form pattern: whitespace inside the pattern is ignored with "x"
        auto free_reg = regex(`
abc
\s+
"
(
[^"]+
| \\ "
)+
"
z
`, "x");
        auto m = match(`abc "quoted string with \" inside"z`,free_reg);
        assert(m);
        // lookbehind in combination with global matching
        string mails = " hey@you.com no@spam.net ";
        auto rm = regex(`@(?<=\S+@)\S+`,"g");
        assert(equal(map!"a[0]"(matchFn(mails, rm)), ["@you.com", "@spam.net"]));
        // `$` in multi-line mode: `.*` also yields the empty match at end of line
        auto m2 = matchFn("First line\nSecond line",regex(".*$","gm"));
        assert(equal(map!"a[0]"(m2), ["First line", "", "Second line"]));
        auto m2a = matchFn("First line\nSecond line",regex(".+$","gm"));
        assert(equal(map!"a[0]"(m2a), ["First line", "Second line"]));
        auto m2b = matchFn("First line\nSecond line",regex(".+?$","gm"));
        assert(equal(map!"a[0]"(m2b), ["First line", "Second line"]));
        debug(std_regex_test)
        {
            // std.stdio is not among this module's imports, so bring writeln
            // into scope here; otherwise -debug=std_regex_test fails to compile
            import std.stdio : writeln;
            writeln("!!! FReD FLAGS test done "~matchFn.stringof~" !!!");
        }
    }
    test_body!bmatch();
    test_body!match();
}

//tests for accumulated std.regex issues and other regressions
// Each scenario runs once with each matching front end (bmatch and match).
unittest
{
    void test_body(alias matchFn)()
    {
        //issue 5857
        //matching goes out of control if ... in (...){x} has .*/.+
        auto c = matchFn("axxxzayyyyyzd",regex("(a.*z){2}d")).captures;
        assert(c[0] == "axxxzayyyyyzd");
        assert(c[1] == "ayyyyyz");
        auto c2 = matchFn("axxxayyyyyd",regex("(a.*){2}d")).captures;
        assert(c2[0] == "axxxayyyyyd");
        assert(c2[1] == "ayyyyy");
        //issue 2108
        //greedy vs non-greedy
        auto nogreed = regex("<packet.*?/packet>");
        assert(matchFn("<packet>text</packet><packet>text</packet>", nogreed).hit
               == "<packet>text</packet>");
        auto greed = regex("<packet.*/packet>");
        assert(matchFn("<packet>text</packet><packet>text</packet>", greed).hit
               == "<packet>text</packet><packet>text</packet>");
        //issue 4574
        //empty successful match still advances the input
        string[] pres, posts;
        foreach (m; matchFn("abcabc", regex("","g")))
        {
            pres ~= m.pre;
            posts ~= m.post;
            assert(m.hit.empty);
        }
        auto heads = [
            "abcabc",
            "abcab",
            "abca",
            "abc",
            "ab",
            "a",
            ""
        ];
        auto tails = [
            "abcabc",
            "bcabc",
            "cabc",
            "abc",
            "bc",
            "c",
            ""
        ];
        assert(pres == array(retro(heads)));
        assert(posts == tails);
        //issue 6076
        //regression on .*
        auto re = regex("c.*|d");
        auto m = matchFn("mm", re);
        assert(!m);
        debug(std_regex_test)
        {
            // std.stdio is not among this module's imports, so bring writeln
            // into scope here; otherwise -debug=std_regex_test fails to compile
            import std.stdio : writeln;
            writeln("!!! FReD REGRESSION test done "~matchFn.stringof~" !!!");
        }
        // nested counted repetitions over a long input
        auto rprealloc = regex(`((.){5}.{1,10}){5}`);
        auto arr = array(repeat('0',100));
        auto m2 = matchFn(arr, rprealloc);
        assert(m2);
        // this pattern must compile without throwing
        assert(collectException(
            regex(r"^(import|file|binary|config)\s+([^\(]+)\(?([^\)]*)\)?\s*$")
        ) is null);
        // every escapable character works escaped inside a character class,
        // a negated class and a single-character range
        foreach (ch; [Escapables])
        {
            assert(match(to!string(ch),regex(`[\`~ch~`]`)));
            assert(!match(to!string(ch),regex(`[^\`~ch~`]`)));
            assert(match(to!string(ch),regex(`[\`~ch~`-\`~ch~`]`)));
        }
        //bugzilla 7718
        string strcmd = "./myApp.rb -os OSX -path \"/GIT/Ruby Apps/sec\" -conf 'notimer'";
        auto reStrCmd = regex (`(".*")|('.*')`, "g");
        assert(equal(map!"a[0]"(matchFn(strcmd, reStrCmd)),
                     [`"/GIT/Ruby Apps/sec"`, `'notimer'`]));
    }
    test_body!bmatch();
    test_body!match();
}

// tests for replace
unittest
{
// Replace checks, instantiated once per matcher entry point (see end of block).
void test(alias matchFn)()
{
import std.uni : toUpper;

// repeat every check for narrow, wide and UTF-32 strings
foreach (i, v; AliasSeq!(string, wstring, dstring))
{
// callback for replace!: returns the upper-cased text of the whole match
auto baz(Cap)(Cap m)
if (is(Cap == Captures!(Cap.String)))
{
return toUpper(m.hit);
}
alias String = v;
// without the "g" flag only the first occurrence is replaced
assert(std.regex.replace!(matchFn)(to!String("ark rapacity"), regex(to!String("r")), to!String("c"))
== to!String("ack rapacity"));
// with the "g" flag every occurrence is replaced
assert(std.regex.replace!(matchFn)(to!String("ark rapacity"), regex(to!String("r"), "g"), to!String("c"))
== to!String("ack capacity"));
// "$&" in the format inserts the matched text
assert(std.regex.replace!(matchFn)(to!String("noon"), regex(to!String("^n")), to!String("[$&]"))
== to!String("[n]oon"));
// "$`" and "$'" insert the text before and after the match
assert(std.regex.replace!(matchFn)(to!String("test1 test2"), regex(to!String(`\w+`),"g"), to!String("$`:$'"))
== to!String(": test2 test1 :"));
// callback form of replace: each matched 'a' or 'r' is replaced by baz's result
auto s = std.regex.replace!(baz!(Captures!(String)))(to!String("Strap a rocket engine on a chicken."),
regex(to!String("[ar]"), "g"));
assert(s == "StRAp A Rocket engine on A chicken.", text(s));
}
debug(std_regex_test) writeln("!!! Replace test done "~matchFn.stringof~" !!!");
}
test!(bmatch)();
test!(match)();
}

// tests for splitter
unittest
{
    // separators at both ends: empty first and last fields are expected
    auto bothEnds = ", abc, de, fg, hi, ";
    auto bothEndsPieces = splitter(bothEnds, regex(", *"));
    auto bothEndsExpected = ["", "abc", "de", "fg", "hi", ""];
    assert(equal(bothEndsPieces, bothEndsExpected));

    // separator only at the front: a single empty leading field is expected
    auto leadingOnly = ", abc, de, fg, hi";
    auto leadingPieces = splitter(leadingOnly, regex(", *"));
    auto leadingExpected = ["", "abc", "de", "fg", "hi"];

    // element-by-element comparison through foreach ...
    uint position;
    foreach (piece; leadingPieces)
    {
        assert(leadingExpected[position] == piece);
        position++;
    }
    // ... and once more through equal
    assert(equal(leadingPieces, leadingExpected));
}

unittest
{
    // Only instantiates splitter over a mutable char[]; the result is not inspected.
    char[] mutableInput = ", abc, de, fg, hi, ".dup;
    auto pieces = splitter(mutableInput, regex(", *"));
}

unittest
{
    // eager split must produce the same fields, including the empty ones at both ends
    auto expected = ["", "abc", "de", "fg", "hi", ""];
    auto input = ", abc, de, fg, hi, ";
    auto fields = split(input, regex(", *"));
    assert(equal(fields, expected[]));
}
305 changes: 305 additions & 0 deletions std/regex/internal/tests3.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,305 @@
/*
Regular expressions package test suite part 3.
*/
module std.regex.internal.tests3;

package(std.regex):

import std.algorithm, std.conv, std.exception, std.meta, std.range,
std.typecons, std.regex;

unittest
{ // bugzilla 7141
    // character class containing an escaped '-'
    string escapedDash = `[a\--b]`;
    foreach (input; ["-", "b"])
        assert(match(input, escapedDash));
    // range starting at '&'
    string ampToZ = `[&-z]`;
    assert(match("b", ampToZ));
}
unittest
{//bugzilla 7111
    // '^' must match on an empty input
    auto startAnchor = regex("^");
    assert(match("", startAnchor));
}
unittest
{//bugzilla 7300
    // a pattern longer than the input must not match (dstring input and pattern)
    auto result = match("a"d, "aa"d);
    assert(!result);
}

// bugzilla 7551
unittest
{
    // ']' directly after '[' is taken as a member of the class
    auto withBracket = regex("[]abc]*");
    assert(matchFirst("]ab", withBracket).hit == "]ab");
    // "[]" on its own is rejected
    assertThrown(regex("[]"));
    // same class with "ab" subtracted
    auto withoutAb = regex("[]abc--ab]*");
    assert(matchFirst("]ac", withoutAb).hit == "]");
}

unittest
{//bugzilla 7674
    // "$$" in the format produces a single '$'
    assert(replace("1234", regex("^"), "$$") == "$1234");
    // backslashes in the format are copied through as written
    auto questionMark = regex(r"\?", "g");
    assert(replace("hello?", questionMark, r"\?") == r"hello\?");
    assert(replace("hello?", regex(r"\?", "g"), r"\\?") != r"hello\?");
}
unittest
{// bugzilla 7679
// repeat the checks for each string type
foreach (S; AliasSeq!(string, wstring, dstring))
(){ // avoid slow optimizations for large functions @@@BUG@@@ 2396
// the compiled pattern is held in a const variable on purpose:
// both splitter and split below must accept it
const re = ctRegex!(to!S(r"\."));
auto str = to!S("a.b");
assert(equal(std.regex.splitter(str, re), [to!S("a"), to!S("b")]));
assert(split(str, re) == [to!S("a"), to!S("b")]);
}();
}
unittest
{//bugzilla 8203
// multi-line input scanned with a "gm" (global, multiline) pattern
string data = "
NAME = XPAW01_STA:STATION
NAME = XPAW01_STA
";
auto uniFileOld = data;
auto r = regex(
r"^NAME = (?P<comp>[a-zA-Z0-9_]+):*(?P<blk>[a-zA-Z0-9_]*)","gm");
auto uniCapturesNew = match(uniFileOld, r);
// iterate the same match range 20 times; there are no assertions,
// the loops only have to run to completion
for (int i = 0; i < 20; i++)
foreach (matchNew; uniCapturesNew) {}
//a second issue with same symptoms
auto r2 = regex(`([а-яА-Я\-_]+\s*)+(?<=[\s\.,\^])`);
// result is discarded; the call only has to complete
match("аллея Театральная", r2);
}
unittest
{// bugzilla 8637 purity of enforce
    // a match result must be usable directly as the condition of enforce
    auto found = "hello world".match(regex("world"));
    enforce(found);
}

// bugzilla 8725
unittest
{
// static local: the pattern is its initializer, so it must be buildable at compile time.
// Compiled with "gx" flags and written across several lines.
static italic = regex( r"\*
(?!\s+)
(.*?)
(?!\s+)
\*", "gx" );
string input = "this * is* interesting, *very* interesting";
// only "*very*" is expected to be rewritten; "* is*" stays as it is
assert(replace(input, italic, "<i>$1</i>") ==
"this * is* interesting, <i>very</i> interesting");
}

// bugzilla 8349
unittest
{
    //note that the regex pattern itself is probably bogus
    const pattern = r"\>(wgEncode.*Tfbs.*\.(?:narrow)|(?:broad)Peak.gz)</a>";
    // the compile-time regex is stored in a const variable and used for matching
    const compiled = ctRegex!(pattern);
    auto result = match(r"\>wgEncode-blah-Tfbs.narrow</a>", compiled);
    assert(result);
}

// bugzilla 9211
unittest
{
    // a repeated group followed by a digit: the group must hold its last
    // iteration ("3") and the trailing digit must be "4", for both spellings
    foreach (pattern; [r"^(\w)*(\d)", r"^([0-9])*(\d)"])
    {
        auto result = match("1234", regex(pattern));
        assert(equal(result.front, ["1234", "3", "4"]));
    }
}

// bugzilla 9280
unittest
{
    // static local pattern with named groups
    static pattern = regex(r"^(?P<nick>.*?)!(?P<ident>.*?)@(?P<host>.*?)$");
    string line = "a!b@c";
    auto result = line.match(pattern);
    assert(result);
    // the first group must be reachable both by index and by name
    auto groups = result.captures;
    assert(groups[1] == "a");
    assert(groups["nick"] == "a");
}


// bugzilla 9579
unittest
{
    // mutable char[] input combined with an immutable string format
    char[] subject = ['a', 'b', 'c'];
    string fmt = "($1)";
    auto pattern = regex(`(a)`, "g");
    // used to give a compile error:
    auto replaced = replace(subject, pattern, fmt);
    assert(replaced == "(a)bc");
}

// bugzilla 9634
unittest
{
    // compile-time regex consisting only of a non-capturing group
    auto pattern = ctRegex!"(?:a+)";
    auto result = "aaaa".match(pattern);
    assert(result.hit == "aaaa");
}

//bugzilla 10798
unittest
{
    // compile-time regex with a set difference: 'c' is removed from [abcd]
    auto pattern = ctRegex!("[abcd--c]*");
    auto result = match("abc", pattern);
    assert(result);
    // matching stops in front of the excluded 'c'
    assert(result.hit == "ab");
}

// bugzilla 10913
unittest
{
    // the same duplicating helper, once marked @system and once @safe
    @system static string dupSystem(const(char)[] text)
    {
        return text.dup;
    }
    @safe static string dupSafe(const(char)[] text)
    {
        return text.dup;
    }
    // replace with a callback is called from a @system lambda
    // and from a @safe lambda, each using the matching helper
    () @system {
        replace!((cap) => dupSystem(cap.hit))("blah", regex(`a`));
    }();
    () @safe {
        replace!((cap) => dupSafe(cap.hit))("blah", regex(`a`));
    }();
}

// bugzilla 11262
unittest
{
    // a const compile-time regex must be accepted by replace
    const comma = ctRegex!(r",", "g");
    auto text = "This,List";
    text = replace(text, comma, "-");
    assert(text == "This-List");
}

// bugzilla 11775
unittest
{
    // a counted repetition whose minimum exceeds its maximum must be rejected
    auto failure = collectException(regex("a{1,0}"));
    assert(failure);
}

// bugzilla 11839
unittest
{
    // group names with letters, digits after a letter, underscores and
    // non-ASCII letters are accepted and reported by namedCaptures
    assert(equal(regex(`(?P<var1>\w+)`).namedCaptures, ["var1"]));
    // a name that starts with a digit is rejected
    auto digitName = collectException(regex(`(?P<1>\w+)`));
    assert(digitName);
    assert(equal(regex(`(?P<v1>\w+)`).namedCaptures, ["v1"]));
    assert(equal(regex(`(?P<__>\w+)`).namedCaptures, ["__"]));
    assert(equal(regex(`(?P<я>\w+)`).namedCaptures, ["я"]));
}

// bugzilla 12076
unittest
{
    // compile-time regex with a variable-length negative lookbehind;
    // there are no assertions, building the pattern and running match is the test
    auto pattern = ctRegex!(r"(?<!x[a-z]+)\s([a-z]+)");
    string input = "one two";
    auto result = match(input, pattern);
}

// bugzilla 12105
unittest
{
    // lazy repetition followed by a negative lookahead
    auto lazyPattern = ctRegex!`.*?(?!a)`;
    assert(matchFirst("aaab", lazyPattern).hit == "aaa");
    // greedy repetition followed by the same lookahead
    auto greedyPattern = ctRegex!`.*(?!a)`;
    assert(matchFirst("aaab", greedyPattern).hit == "aaab");
}

//bugzilla 11784
unittest
{
    // intersection of a range with a negated class: first hit must be "b"
    auto alphabet = "abcdefghijklmnopqrstuvwxyz";
    auto first = matchFirst(alphabet, "[a-z&&[^aeiuo]]");
    assert(first.hit == "b");
}

//bugzilla 12366
unittest
{
    // compile-time regex mixing lookaheads and backreferences
    auto pattern = ctRegex!(`^((?=(xx+?)\2+$)((?=\2+$)(?=(x+)(\4+$))\5){2})*x?$`);
    // eight x's must not match, four x's must match
    auto eight = match("xxxxxxxx", pattern);
    assert(eight.empty);
    auto four = match("xxxx", pattern);
    assert(!four.empty);
}

// bugzilla 12582
unittest
{
    // looking up a group name the pattern does not define must throw
    auto pattern = regex(`(?P<a>abc)`);
    auto failure = collectException(matchFirst("abc", pattern)["b"]);
    assert(failure);
}

// bugzilla 12691
unittest
{
    // both the run-time and the compile-time pattern must produce no match
    auto runtimeResult = bmatch("e@", "^([a-z]|)*$");
    assert(runtimeResult.empty);
    auto compileTimeResult = bmatch("e@", ctRegex!`^([a-z]|)*$`);
    assert(compileTimeResult.empty);
}

//bugzilla 12713
unittest
{
    // this malformed pattern must be rejected with an exception
    enum malformed = "[[a-z]([a-z]|(([[a-z])))";
    assertThrown(regex(malformed));
}

//bugzilla 12747
unittest
{
    // each pattern refers to group 1 from inside that same group and must be rejected
    foreach (selfReferencing; [`^x(\1)`, `^(x(\1))`, `^((x)(?=\1))`])
        assertThrown(regex(selfReferencing));
}

// bugzilla 14504
unittest
{
    // no assertions: instantiating this long compile-time pattern is the test
    enum optionalPart = "a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?";
    enum mandatoryPart = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
    auto compiled = ctRegex!(optionalPart ~ mandatoryPart);
}

// bugzilla 14529
unittest
{
    // with the "i" flag both cases of every letter in the class must match
    auto caseInsensitive = regex(r"^[CDF]$", "i");
    foreach (letter; ["C", "c", "D", "d", "F", "f"])
    {
        auto first = matchAll(letter, caseInsensitive).front;
        assert(first.hit == letter);
    }
}

// bugzilla 14615
unittest
{
    // The unused std.stdio import was dropped, and replaceAllInto is now named
    // in the selective import because this block calls it.
    import std.array : appender;
    import std.regex : regex, replaceAllInto, replaceFirst, replaceFirstInto;

    auto example = "Hello, world!";
    auto pattern = regex("^Hello, (bug)");  // won't find this one
    // with no match, replaceFirst must return the input unchanged
    auto result = replaceFirst(example, pattern, "$1 Sponge Bob");
    assert(result == "Hello, world!");  // Ok.

    // the sink-based variants must likewise copy the unmodified input into the sink
    auto sink = appender!string;
    replaceFirstInto(sink, example, pattern, "$1 Sponge Bob");
    assert(sink.data == "Hello, world!");
    // the second call appends to the same sink, so the input appears twice
    replaceAllInto(sink, example, pattern, "$1 Sponge Bob");
    assert(sink.data == "Hello, world!Hello, world!");
}

// bugzilla 15573
unittest
{
    // with the "x" flag, a space inside a character class must still match a space
    auto spacedClass = regex("[c d]", "x");
    auto first = matchFirst("a b", spacedClass);
    assert(first);
}

// bugzilla 15864
unittest
{
    // no assertions: the pattern only has to compile without throwing
    auto compiled = regex(`(<a (?:(?:\w+=\"[^"]*\")?\s*)*href="\.\.?)"`);
}

unittest
{
    // "(?# ...)" groups are skipped, so the pattern behaves like plain "abc"
    auto commented = regex("(?# comment)abc(?# comment2)");
    assert(matchFirst("abc", commented));
    // an unterminated comment group is rejected
    assertThrown(regex("(?#..."));
}
202 changes: 111 additions & 91 deletions std/regex/internal/thompson.d

Large diffs are not rendered by default.

210 changes: 102 additions & 108 deletions std/regex/package.d

Large diffs are not rendered by default.

13 changes: 0 additions & 13 deletions std/uni.d
Original file line number Diff line number Diff line change
Expand Up @@ -2120,19 +2120,6 @@ public:
assert(!gothic['$']);
}

// Linear scan for $(D ch). Useful only for small sets.
// TODO:
// used internally in std.regex
// should be properly exposed in a public API ?
// Scans data from the front and returns (i & 1) for the first index i
// at which ch < data[i]; returns 0 when no element is greater than ch.
// NOTE(review): presumably data holds sorted interval boundaries, so an odd
// index would mean ch lies inside an interval — confirm against the enclosing type.
package auto scanFor()(dchar ch) const
{
// length is read once, before the loop
immutable len = data.length;
for (size_t i = 0; i < len; i++)
if (ch < data[i])
return i & 1;
return 0;
}

/// Number of $(CODEPOINTS) in this set
@property size_t length()
{
Expand Down
29 changes: 21 additions & 8 deletions win32.mak
Original file line number Diff line number Diff line change
Expand Up @@ -220,12 +220,19 @@ SRC_STD_RANGE= \
SRC_STD_REGEX= \
std\regex\internal\ir.d \
std\regex\package.d \
std\regex\internal\parser.d \
std\regex\internal\tests.d \
std\regex\internal\generator.d

SRC_STD_REGEX_2 = \
std\regex\internal\parser.d \
std\regex\internal\backtracking.d \
std\regex\internal\thompson.d \
std\regex\internal\kickstart.d \
std\regex\internal\generator.d
std\regex\internal\tests2.d

SRC_STD_REGEX_3 = \
std\regex\internal\shiftor.d \
std\regex\internal\bitnfa.d \
std\regex\internal\tests3.d

SRC_STD_C= \
std\c\process.d \
Expand Down Expand Up @@ -352,6 +359,8 @@ SRC_TO_COMPILE= \
$(SRC_STD_NET) \
$(SRC_STD_RANGE) \
$(SRC_STD_REGEX) \
$(SRC_STD_REGEX_2) \
$(SRC_STD_REGEX_3) \
$(SRC_STD_C) \
$(SRC_STD_WIN) \
$(SRC_STD_C_WIN) \
Expand Down Expand Up @@ -571,6 +580,8 @@ UNITTEST_OBJS= \
unittest8d.obj \
unittest8e.obj \
unittest8f.obj \
unittest8g.obj \
unittest8h.obj \
unittest9a.obj

unittest : $(LIB)
Expand All @@ -585,11 +596,13 @@ unittest : $(LIB)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest6.obj $(SRC_STD_6) $(SRC_STD_CONTAINER) $(SRC_STD_EXP_ALLOC) $(SRC_STD_EXP_LOGGER)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest7.obj $(SRC_STD_7)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8a.obj $(SRC_STD_REGEX)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8b.obj $(SRC_STD_NET)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8c.obj $(SRC_STD_C) $(SRC_STD_WIN) $(SRC_STD_C_WIN)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8d.obj $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) $(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8e.obj $(SRC_ETC) $(SRC_ETC_C)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8f.obj $(SRC_STD_EXP)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8b.obj $(SRC_STD_REGEX_2)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8c.obj $(SRC_STD_REGEX_3)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8d.obj $(SRC_STD_NET)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8e.obj $(SRC_STD_C) $(SRC_STD_WIN) $(SRC_STD_C_WIN)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8f.obj $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) $(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8g.obj $(SRC_ETC) $(SRC_ETC_C)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8h.obj $(SRC_STD_EXP)
$(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest9a.obj $(SRC_STD_EXP_NDSLICE)
$(DMD) $(UDFLAGS) -L/co -unittest unittest.d $(UNITTEST_OBJS) \
$(ZLIB) $(DRUNTIMELIB)
Expand Down
27 changes: 19 additions & 8 deletions win64.mak
Original file line number Diff line number Diff line change
Expand Up @@ -239,12 +239,19 @@ SRC_STD_RANGE= \
SRC_STD_REGEX= \
std\regex\internal\ir.d \
std\regex\package.d \
std\regex\internal\parser.d \
std\regex\internal\tests.d \
std\regex\internal\generator.d

SRC_STD_REGEX_2 = \
std\regex\internal\parser.d \
std\regex\internal\backtracking.d \
std\regex\internal\thompson.d \
std\regex\internal\kickstart.d \
std\regex\internal\generator.d
std\regex\internal\tests2.d

SRC_STD_REGEX_3 = \
std\regex\internal\shiftor.d \
std\regex\internal\bitnfa.d \
std\regex\internal\tests3.d

SRC_STD_C= \
std\c\process.d \
Expand Down Expand Up @@ -371,6 +378,8 @@ SRC_TO_COMPILE= \
$(SRC_STD_NET) \
$(SRC_STD_RANGE) \
$(SRC_STD_REGEX) \
$(SRC_STD_REGEX_2) \
$(SRC_STD_REGEX_3) \
$(SRC_STD_C) \
$(SRC_STD_WIN) \
$(SRC_STD_C_WIN) \
Expand Down Expand Up @@ -621,11 +630,13 @@ unittest : $(LIB)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest6i.obj $(SRC_STD_6i)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest7.obj $(SRC_STD_7) $(SRC_STD_EXP_LOGGER)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest8a.obj $(SRC_STD_REGEX)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest8b.obj $(SRC_STD_NET)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest8c.obj $(SRC_STD_C) $(SRC_STD_WIN) $(SRC_STD_C_WIN)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest8d.obj $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) $(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest8e.obj $(SRC_ETC) $(SRC_ETC_C)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest8f.obj $(SRC_STD_EXP)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest8b.obj $(SRC_STD_REGEX_2)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest8c.obj $(SRC_STD_REGEX_3)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest8d.obj $(SRC_STD_NET)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest8e.obj $(SRC_STD_C) $(SRC_STD_WIN) $(SRC_STD_C_WIN)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest8f.obj $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) $(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest8g.obj $(SRC_ETC) $(SRC_ETC_C)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest8h.obj $(SRC_STD_EXP)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest9.obj $(SRC_STD_EXP_ALLOC)
$(DMD) $(UDFLAGS) -c -unittest -ofunittest9a.obj $(SRC_STD_EXP_NDSLICE)
$(DMD) $(UDFLAGS) -L/OPT:NOICF -unittest unittest.d $(UNITTEST_OBJS) \
Expand Down