Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit 1a628dc

Browse files
authored
Improve throughput of String.Split(char / char[], ...) (#15322)
* Improve throughput of String.Split(char / char[], ...)
  Also reduces the amount of unsafe code used.
* Address PR feedback
1 parent 1a62b23 commit 1a628dc

File tree

3 files changed

+80
-38
lines changed

3 files changed

+80
-38
lines changed

src/mscorlib/shared/System/String.Searching.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ private unsafe int IndexOfCharArray(char[] anyOf, int startIndex, int count)
252252
// in each byte in the character is used to index into this map to get the
253253
// right block, the value of the remaining 5 msb are used as the bit position
254254
// inside this block.
255-
private static unsafe void InitializeProbabilisticMap(uint* charMap, char[] anyOf)
255+
private static unsafe void InitializeProbabilisticMap(uint* charMap, ReadOnlySpan<char> anyOf)
256256
{
257257
bool hasAscii = false;
258258
uint* charMapLocal = charMap; // https://github.com/dotnet/coreclr/issues/14264

src/mscorlib/shared/System/StringSpanHelpers.cs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,19 @@ public static int IndexOf(this ReadOnlySpan<char> source, char value, int startI
9090
return -1;
9191
}
9292

93+
public static bool Contains(this ReadOnlySpan<char> source, char value)
94+
{
95+
for (int i = 0; i < source.Length; i++)
96+
{
97+
if (source[i] == value)
98+
{
99+
return true;
100+
}
101+
}
102+
103+
return false;
104+
}
105+
93106
public static ReadOnlySpan<char> Remove(this ReadOnlySpan<char> source, int startIndex, int count)
94107
{
95108
if (startIndex < 0) throw new ArgumentOutOfRangeException(nameof(startIndex), SR.ArgumentOutOfRange_StartIndex);

src/mscorlib/src/System/String.Manipulation.cs

Lines changed: 66 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,14 +1089,14 @@ public String Replace(String oldValue, String newValue)
10891089
return ReplaceInternal(oldValue, newValue);
10901090
}
10911091

1092-
public unsafe String[] Split(char separator, StringSplitOptions options = StringSplitOptions.None)
1092+
public String[] Split(char separator, StringSplitOptions options = StringSplitOptions.None)
10931093
{
1094-
return SplitInternal(&separator, 1, int.MaxValue, options);
1094+
return SplitInternal(new ReadOnlySpan<char>(ref separator, 1), int.MaxValue, options);
10951095
}
10961096

1097-
public unsafe String[] Split(char separator, int count, StringSplitOptions options = StringSplitOptions.None)
1097+
public String[] Split(char separator, int count, StringSplitOptions options = StringSplitOptions.None)
10981098
{
1099-
return SplitInternal(&separator, 1, count, options);
1099+
return SplitInternal(new ReadOnlySpan<char>(ref separator, 1), count, options);
11001100
}
11011101

11021102
// Creates an array of strings by splitting this string at each
@@ -1139,16 +1139,7 @@ public String[] Split(char[] separator, int count, StringSplitOptions options)
11391139
return SplitInternal(separator, count, options);
11401140
}
11411141

1142-
private unsafe String[] SplitInternal(char[] separator, int count, StringSplitOptions options)
1143-
{
1144-
fixed (char* pSeparators = separator)
1145-
{
1146-
int separatorsLength = separator == null ? 0 : separator.Length;
1147-
return SplitInternal(pSeparators, separatorsLength, count, options);
1148-
}
1149-
}
1150-
1151-
private unsafe String[] SplitInternal(char* separators, int separatorsLength, int count, StringSplitOptions options)
1142+
private String[] SplitInternal(ReadOnlySpan<char> separators, int count, StringSplitOptions options)
11521143
{
11531144
if (count < 0)
11541145
throw new ArgumentOutOfRangeException(nameof(count),
@@ -1170,7 +1161,7 @@ private unsafe String[] SplitInternal(char* separators, int separatorsLength, in
11701161
}
11711162

11721163
int[] sepList = new int[Length];
1173-
int numReplaces = MakeSeparatorList(separators, separatorsLength, sepList);
1164+
int numReplaces = MakeSeparatorList(separators, sepList);
11741165

11751166
// Handle the special case of no replaces.
11761167
if (0 == numReplaces)
@@ -1377,45 +1368,83 @@ private String[] SplitOmitEmptyEntries(Int32[] sepList, Int32[] lengthList, Int3
13771368
// Args: separator -- A string containing all of the split characters.
13781369
// sepList -- an array of ints for split char indices.
13791370
//--------------------------------------------------------------------
1380-
private unsafe int MakeSeparatorList(char* separators, int separatorsLength, int[] sepList)
1371+
private int MakeSeparatorList(ReadOnlySpan<char> separators, int[] sepList)
13811372
{
1382-
Debug.Assert(separatorsLength >= 0, "separatorsLength >= 0");
13831373
int foundCount = 0;
1374+
char sep0, sep1, sep2;
13841375

1385-
if (separators == null || separatorsLength == 0)
1376+
switch (separators.Length)
13861377
{
1387-
fixed (char* pwzChars = &_firstChar)
1388-
{
1389-
//If they passed null or an empty string, look for whitespace.
1390-
for (int i = 0; i < Length && foundCount < sepList.Length; i++)
1378+
// Special-case no separators to mean any whitespace is a separator.
1379+
case 0:
1380+
for (int i = 0; i < Length; i++)
13911381
{
1392-
if (Char.IsWhiteSpace(pwzChars[i]))
1382+
if (char.IsWhiteSpace(this[i]))
13931383
{
13941384
sepList[foundCount++] = i;
13951385
}
13961386
}
1397-
}
1398-
}
1399-
else
1400-
{
1401-
int sepListCount = sepList.Length;
1402-
//If they passed in a string of chars, actually look for those chars.
1403-
fixed (char* pwzChars = &_firstChar)
1404-
{
1405-
for (int i = 0; i < Length && foundCount < sepListCount; i++)
1387+
break;
1388+
1389+
// Special-case the common cases of 1, 2, and 3 separators, with manual comparisons against each separator.
1390+
case 1:
1391+
sep0 = separators[0];
1392+
for (int i = 0; i < Length; i++)
1393+
{
1394+
if (this[i] == sep0)
1395+
{
1396+
sepList[foundCount++] = i;
1397+
}
1398+
}
1399+
break;
1400+
case 2:
1401+
sep0 = separators[0];
1402+
sep1 = separators[1];
1403+
for (int i = 0; i < Length; i++)
14061404
{
1407-
char* pSep = separators;
1408-
for (int j = 0; j < separatorsLength; j++, pSep++)
1405+
char c = this[i];
1406+
if (c == sep0 || c == sep1)
14091407
{
1410-
if (pwzChars[i] == *pSep)
1408+
sepList[foundCount++] = i;
1409+
}
1410+
}
1411+
break;
1412+
case 3:
1413+
sep0 = separators[0];
1414+
sep1 = separators[1];
1415+
sep2 = separators[2];
1416+
for (int i = 0; i < Length; i++)
1417+
{
1418+
char c = this[i];
1419+
if (c == sep0 || c == sep1 || c == sep2)
1420+
{
1421+
sepList[foundCount++] = i;
1422+
}
1423+
}
1424+
break;
1425+
1426+
// Handle > 3 separators with a probabilistic map, à la IndexOfAny.
1427+
// This optimizes for chars being unlikely to match a separator.
1428+
default:
1429+
unsafe
1430+
{
1431+
ProbabilisticMap map = default;
1432+
uint* charMap = (uint*)&map;
1433+
InitializeProbabilisticMap(charMap, separators);
1434+
1435+
for (int i = 0; i < Length; i++)
1436+
{
1437+
char c = this[i];
1438+
if (IsCharBitSet(charMap, (byte)c) && IsCharBitSet(charMap, (byte)(c >> 8)) &&
1439+
separators.Contains(c))
14111440
{
14121441
sepList[foundCount++] = i;
1413-
break;
14141442
}
14151443
}
14161444
}
1417-
}
1445+
break;
14181446
}
1447+
14191448
return foundCount;
14201449
}
14211450

0 commit comments

Comments
 (0)