Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release/6.0-preview7] Eliminate backtracking in the interpreter for patterns with .* #55960

Merged
merged 17 commits into from
Jul 20, 2021
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ internal abstract class RegexCompiler
private static readonly MethodInfo s_spanSliceIntIntMethod = typeof(ReadOnlySpan<char>).GetMethod("Slice", new Type[] { typeof(int), typeof(int) })!;
private static readonly MethodInfo s_spanStartsWith = typeof(MemoryExtensions).GetMethod("StartsWith", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_stringAsSpanMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string) })!;
private static readonly MethodInfo s_spanLastIndexOfMethod = typeof(MemoryExtensions).GetMethod("LastIndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_stringAsSpanIntIntMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!;
private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!;
private static readonly MethodInfo s_stringIndexOfCharInt = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int) })!;
Expand All @@ -90,6 +91,7 @@ internal abstract class RegexCompiler
private LocalBuilder? _runstackLocal;
private LocalBuilder? _textInfoLocal; // cached to avoid extraneous TLS hits from CurrentCulture and virtual calls to TextInfo
private LocalBuilder? _loopTimeoutCounterLocal; // timeout counter for setrep and setloop
private LocalBuilder? _maxBacktrackPositionLocal;

protected RegexOptions _options; // options
protected RegexCode? _code; // the RegexCode object
Expand Down Expand Up @@ -891,6 +893,8 @@ private void GenerateForwardSection()
Mvfldloc(s_runtrackposField, _runtrackposLocal!);
Mvfldloc(s_runstackField, _runstackLocal!);
Mvfldloc(s_runstackposField, _runstackposLocal!);
Ldc(-1);
Stloc(_maxBacktrackPositionLocal!);

_backpos = -1;

Expand Down Expand Up @@ -1705,7 +1709,7 @@ protected void GenerateFindFirstChar()
// if (!CharInClass(textSpan[i + 2], prefix[2], "...")) goto returnFalse;
// ...
Debug.Assert(charClassIndex == 0 || charClassIndex == 1);
for ( ; charClassIndex < _leadingCharClasses.Length; charClassIndex++)
for (; charClassIndex < _leadingCharClasses.Length; charClassIndex++)
{
Debug.Assert(needLoop);
Ldloca(textSpanLocal);
Expand Down Expand Up @@ -3310,6 +3314,7 @@ protected void GenerateGo()
}
_runtextbegLocal = DeclareInt32();
_runtextendLocal = DeclareInt32();
_maxBacktrackPositionLocal = DeclareInt32();

InitializeCultureForGoIfNecessary();

Expand Down Expand Up @@ -4258,7 +4263,61 @@ private void GenerateOneCode()
//: break Backward;
{
string str = _strings![Operand(0)];
Label multiCode = DefineLabel();
if (!IsRightToLeft())
{
// if (runtextend - runtextpos < c)
Ldloc(_runtextendLocal!);
Ldloc(_runtextposLocal!);
Sub();
Ldc(str.Length);
BgeFar(multiCode);
// if (!caseInsensitive && _maxBacktrackPosition != -1 && runtextpos > _maxBacktrackPosition)
if (!IsCaseInsensitive())
{
Ldloc(_maxBacktrackPositionLocal!);
Ldc(-1);
BeqFar(_backtrack);
Ldloc(_runtextposLocal!);
Ldloc(_maxBacktrackPositionLocal!);
BleFar(_backtrack);
// runtextpos = _maxBacktrackPosition;
Ldloc(_maxBacktrackPositionLocal!);
Stloc(_runtextposLocal!);
// ReadOnlySpan<char> runtextSpan = runtext.AsSpan(_maxBacktrackPosition, runtextend - _maxBacktrackPosition);
Ldloc(_runtextLocal!);
Ldloc(_maxBacktrackPositionLocal!);
Ldloc(_runtextendLocal!);
Ldloc(_maxBacktrackPositionLocal!);
Sub();
using (RentedLocalBuilder runtextSpanLocal = RentReadOnlySpanCharLocal())
{
Call(s_stringAsSpanIntIntMethod);
Stloc(runtextSpanLocal);
using (RentedLocalBuilder lastIndexOfLocal = RentInt32Local())
{
// int lastIndexOf = runtextSpan.LastIndexOf(str.AsSpan());
Ldloc(runtextSpanLocal);
Ldstr(str);
Call(s_stringAsSpanMethod);
Call(s_spanLastIndexOfMethod);
Stloc(lastIndexOfLocal);
// if (lastIndexOf > -1)
Ldloc(lastIndexOfLocal);
Ldc(-1);
BleFar(_backtrack);
// runtextpos = lastIndexOf + _maxBacktrackPosition;
Ldloc(lastIndexOfLocal);
Ldloc(_maxBacktrackPositionLocal!);
Add();
Stloc(_runtextposLocal!);
BrFar(_backtrack);
}
}
}
}

MarkLabel(multiCode);
Ldc(str.Length);
Ldloc(_runtextendLocal!);
Ldloc(_runtextposLocal!);
Expand Down Expand Up @@ -4598,6 +4657,9 @@ private void GenerateOneCode()

using RentedLocalBuilder lenLocal = RentInt32Local();
using RentedLocalBuilder iLocal = RentInt32Local();
using RentedLocalBuilder tempMaxBacktrackPositionLocal = RentInt32Local();
Ldloc(_runtextposLocal!);
Stloc(tempMaxBacktrackPositionLocal);

if (!IsRightToLeft())
{
Expand Down Expand Up @@ -4847,6 +4909,12 @@ private void GenerateOneCode()
DoPush();

Track();
// if (_operator == RegexCode.Notoneloop) maxBacktrackPosition = tempMaxBacktrackPosition
if (_regexopcode == RegexCode.Notoneloop)
{
Ldloc(tempMaxBacktrackPositionLocal);
Stloc(_maxBacktrackPositionLocal!);
}
}
break;
}
Expand All @@ -4870,28 +4938,66 @@ private void GenerateOneCode()
//: if (i > 0)
//: Track(i - 1, pos - 1);
//: Advance(2);
PopTrack();
Stloc(_runtextposLocal!);
Label noBacktrackPositionBranch = DefineLabel();
PopTrack();
using (RentedLocalBuilder posLocal = RentInt32Local())
{
Stloc(posLocal);
Ldloc(posLocal);
Ldc(0);
BleFar(AdvanceLabel());
PopTrack();
using (RentedLocalBuilder iBacktrackLocal = RentInt32Local())
{
Stloc(iBacktrackLocal);
// if (!caseInsensitive && maxBacktrackPosition != -1 && pos > maxBacktrackPosition && runtextpos < pos && _operator == (RegexCode.Notoneloop | RegexCode.Back) && !_rightToLeft)
if (!IsCaseInsensitive() && _regexopcode == (RegexCode.Notoneloop | RegexCode.Back) && !IsRightToLeft())
{
Ldloc(_maxBacktrackPositionLocal!);
Ldc(-1);
Beq(noBacktrackPositionBranch);
Ldloc(posLocal);
Ldloc(_maxBacktrackPositionLocal!);
Ble(noBacktrackPositionBranch);
Ldloc(_runtextposLocal!);
Ldloc(posLocal);
Bge(noBacktrackPositionBranch);
/*
int difference = pos - maxBacktrackPosition;
pos = runtextpos;
i -= difference;
maxBacktrackPosition = -1;
*/
// i -= pos - maxBacktrackPosition; (the difference is computed on the IL stack and subtracted from i directly, not stored in a local)
Ldloc(iBacktrackLocal);
Ldloc(posLocal);
Ldloc(_maxBacktrackPositionLocal!);
Sub();
Sub();
Stloc(iBacktrackLocal);
Ldloc(_runtextposLocal!);
Stloc(posLocal);
Ldc(-1);
Stloc(_maxBacktrackPositionLocal!);
}

MarkLabel(noBacktrackPositionBranch);
Ldloc(posLocal);
Stloc(_runtextposLocal!);
Ldloc(iBacktrackLocal);
Ldc(0);
BleFar(AdvanceLabel());
ReadyPushTrack();
Ldloc(iBacktrackLocal);
}
Ldc(1);
Sub();
DoPush();
ReadyPushTrack();
Ldloc(posLocal);
Ldloc(_runtextposLocal!);
Ldc(1);
Sub(IsRightToLeft());
DoPush();
Trackagain();
Advance();
}
Ldc(1);
Sub();
DoPush();
ReadyPushTrack();
Ldloc(_runtextposLocal!);
Ldc(1);
Sub(IsRightToLeft());
DoPush();
Trackagain();
Advance();
break;

case RegexCode.Onelazy:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ internal sealed class RegexInterpreter : RegexRunner
private int _codepos;
private bool _rightToLeft;
private bool _caseInsensitive;
private int _maxBacktrackPosition = -1;

public RegexInterpreter(RegexCode code, CultureInfo culture)
{
Expand Down Expand Up @@ -223,6 +224,20 @@ private bool MatchString(string str)
{
if (runtextend - runtextpos < c)
{
// If MatchString was called after a greedy op such as a .*, we would have zipped runtextpos to the end without really examining any characters. Reset to _maxBacktrackPosition here as an optimization.
if (!_caseInsensitive && _maxBacktrackPosition != -1 && runtextpos > _maxBacktrackPosition)
{
// If lastIndexOf is -1, we backtrack to the max extent possible.
runtextpos = _maxBacktrackPosition;
ReadOnlySpan<char> runtextSpan = runtext.AsSpan(_maxBacktrackPosition, runtextend - _maxBacktrackPosition);
int lastIndexOf = runtextSpan.LastIndexOf(str);
if (lastIndexOf > -1)
{
// Found the next position to match. Move runtextpos here
runtextpos = _maxBacktrackPosition + lastIndexOf;
}
}

return false;
}

Expand Down Expand Up @@ -1185,6 +1200,7 @@ protected override void Go()
int len = Math.Min(Operand(1), Forwardchars());
char ch = (char)Operand(0);
int i;
int tempMaxBacktrackPosition = runtextpos;

if (!_rightToLeft && !_caseInsensitive)
{
Expand Down Expand Up @@ -1217,6 +1233,7 @@ protected override void Go()
if (len > i && _operator == RegexCode.Notoneloop)
{
TrackPush(len - i - 1, runtextpos - Bump());
_maxBacktrackPosition = tempMaxBacktrackPosition;
}
}
advance = 2;
Expand Down Expand Up @@ -1261,6 +1278,16 @@ protected override void Go()
{
int i = TrackPeek();
int pos = TrackPeek(1);
if (!_caseInsensitive && _maxBacktrackPosition != -1 && pos > _maxBacktrackPosition && runtextpos < pos && _operator == (RegexCode.Notoneloop | RegexCode.Back) && !_rightToLeft)
{
// The Multi node has bumped us along already
int difference = pos - _maxBacktrackPosition;
Debug.Assert(difference > 0);
pos = runtextpos;
i -= difference;
// We shouldn't be backtracking anymore.
_maxBacktrackPosition = -1;
}
runtextpos = pos;
if (i > 0)
{
Expand Down
Loading