Add ASCII fast path to stripLeft & avoid unnecessary decoding #6727

n8sh · 2018-10-10T17:39:59Z

Similar to #6361. Code for benchmarks in next post.

string benchmark, ldc2:

compiler	function	data	time
ldc2 -O2 -release	old_stripLeft	ascii	1 sec, 65 ms, 71 μs, and 8 hnsecs
ldc2 -O2 -release	new_stripLeft	ascii	492 ms, 570 μs, and 2 hnsecs
ldc2 -O2 -release	old_stripLeft	non_ascii	1 sec, 852 ms, and 350 μs
ldc2 -O2 -release	new_stripLeft	non_ascii	1 sec, 209 ms, 13 μs, and 6 hnsecs

wstring benchmark, ldc2:

compiler	function	data	time
ldc2 -O2 -release	old_stripLeft	w_ascii	1 sec, 165 ms, 298 μs, and 7 hnsecs
ldc2 -O2 -release	new_stripLeft	w_ascii	780 ms, 142 μs, and 9 hnsecs
ldc2 -O2 -release	old_stripLeft	w_non_ascii	864 ms, 82 μs, and 6 hnsecs
ldc2 -O2 -release	new_stripLeft	w_non_ascii	591 ms, 621 μs, and 9 hnsecs

string benchmark, dmd:

compiler	function	data	time
dmd -O -inline -release	old_stripLeft	ascii	1 sec, 287 ms, 162 μs, and 2 hnsecs
dmd -O -inline -release	new_stripLeft	ascii	650 ms, 753 μs, and 4 hnsecs
dmd -O -inline -release	old_stripLeft	non_ascii	2 secs, 139 ms, 37 μs, and 1 hnsec
dmd -O -inline -release	new_stripLeft	non_ascii	1 sec, 738 ms, 224 μs, and 4 hnsecs

wstring benchmark, dmd:

compiler	function	data	time
dmd -O -inline -release	old_stripLeft	w_ascii	1 sec, 649 ms, 661 μs, and 2 hnsecs
dmd -O -inline -release	new_stripLeft	w_ascii	1 sec, 231 ms, 832 μs, and 5 hnsecs
dmd -O -inline -release	old_stripLeft	w_non_ascii	1 sec, 674 ms, 585 μs, and 8 hnsecs
dmd -O -inline -release	new_stripLeft	w_non_ascii	967 ms, 158 μs, and 2 hnsecs

dlang-bot · 2018-10-10T17:40:01Z

Thanks for your pull request, @n8sh!

Bugzilla references

Auto-close	Bugzilla	Severity	Description
✓	19308	enhancement	Optimize std.string.stripLeft

Testing this PR locally

If you don't have a local development environment setup, you can use Digger to test this PR:

dub fetch digger
dub run digger -- build "master + phobos#6727"

n8sh · 2018-10-10T19:28:10Z

EDIT: new benchmark code:

stripBench.d

//1st module
import std.traits;
import std.range;

/**
 * Baseline `stripLeft` used as the "old" side of the benchmark.
 *
 * Walks `input` from the front and drops leading whitespace. ASCII code
 * units are classified with `std.ascii.isWhite`; anything else is decoded
 * with `decodeFront` and classified with `std.uni.isWhite`.
 *
 * Params:
 *     input = forward range of characters (finite, not merely convertible
 *             to a string)
 * Returns: `input` advanced past its leading whitespace.
 */
auto old_stripLeft(Range)(Range input)
if (isForwardRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isInfinite!Range && !isConvertibleToString!Range)
{
    import std.utf : decodeFront;
    static import std.ascii;
    static import std.uni;

    for (; !input.empty; )
    {
        auto unit = input.front;

        if (!std.ascii.isASCII(unit))
        {
            // decodeFront consumes the code point, so keep a copy of the
            // position in front of it in case it is not whitespace.
            auto beforeDecode = input.save;
            if (!std.uni.isWhite(decodeFront(input)))
                return beforeDecode;
            continue;
        }

        // First non-whitespace ASCII character: nothing more to strip.
        if (!std.ascii.isWhite(unit))
            return input;

        input.popFront();
    }
    return input;
}

/**
 * Candidate `stripLeft` used as the "new" side of the benchmark.
 *
 * Drops leading whitespace from `input`, choosing a strategy at compile
 * time from the range's code unit type:
 *
 * $(UL
 *   $(LI `wchar` / `dchar`: every element is tested directly with
 *        `std.uni.isWhite`; nothing is decoded.)
 *   $(LI `char`: sliceable inputs first run an index-based ASCII loop;
 *        the remainder (or the whole input otherwise) goes through a
 *        decoding loop.)
 * )
 *
 * Params:
 *     input = forward range of characters (finite, not merely convertible
 *             to a string)
 * Returns: `input` advanced past its leading whitespace.
 */
auto new_stripLeft(Range)(Range input)
if (isForwardRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isInfinite!Range && !isConvertibleToString!Range)
{
    static import std.ascii;
    static import std.uni;

    alias CodeUnit = Unqual!(ElementEncodingType!Range);

    static if (is(CodeUnit == wchar) || is(CodeUnit == dchar))
    {
        // dchar never needs decoding. wchar does not need it here either:
        // no whitespace character lies outside the basic multilingual
        // plane, so each one is a single wchar, and by the design of
        // UTF-16 such a wchar cannot appear inside a surrogate pair.
        static if (isDynamicArray!Range || (isRandomAccessRange!Range && hasLength!Range
            && __traits(compiles, { size_t s = input.length / 2; input = input[s .. $]; })))
        {
            // Index-based scan, then a single slice.
            const size_t len = input.length;
            size_t pos = 0;
            while (pos < len && std.uni.isWhite(input[pos]))
                ++pos;
            input = input[pos .. $];
            return input;
        }
        else
        {
            // Plain range primitives.
            for (; !input.empty; input.popFront())
            {
                if (!std.uni.isWhite(input.front))
                    break;
            }
            return input;
        }
    }
    else
    {
        static if (isDynamicArray!Range || (isRandomAccessRange!Range && hasLength!Range
            && __traits(compiles, { size_t s = input.length / 2; input = input[s .. $]; })))
        {{
            // ASCII fast path for dynamic arrays and similar ranges: scan
            // code units by index until a non-whitespace ASCII character
            // (done) or a non-ASCII code unit (hand over to the decoding
            // loop below) is found.
            size_t pos = 0;
            const size_t len = input.length;
            while (pos < len)
            {
                const unit = input[pos];
                if (unit >= 0x80) goto NonAsciiPath;
                if (!std.ascii.isWhite(unit)) break;
                ++pos;
            }
            input = input[pos .. $];
            return input;

        NonAsciiPath:
            // Drop the ASCII whitespace consumed so far and continue with
            // the general loop.
            input = input[pos .. $];
        }}

        static if (!is(ElementType!Range == ElementEncodingType!Range))
        {
            // The element type differs from the code unit type, i.e.
            // `front` already yields decoded characters.
            for (; !input.empty; input.popFront())
            {
                if (!std.uni.isWhite(input.front))
                    break;
            }
            return input;
        }
        else
        {
            // `front` yields raw code units, so decode explicitly.
            import std.utf : decodeFront, UseReplacementDchar;

            while (!input.empty)
            {
                auto unit = input.front;

                if (!std.ascii.isASCII(unit))
                {
                    // decodeFront consumes the code point, so keep a copy
                    // of the position in front of it in case it is not
                    // whitespace.
                    auto beforeDecode = input.save;
                    if (!std.uni.isWhite(decodeFront!(UseReplacementDchar.yes)(input)))
                        return beforeDecode;
                    continue;
                }

                if (!std.ascii.isWhite(unit))
                    break;
                input.popFront();
            }
            return input;
        }
    }
}

/**
 * Benchmark driver: times `old_stripLeft` against `new_stripLeft` on the
 * four inputs from `bench_data` and prints one line per measurement.
 */
void main(string[] args)
{
    import core.stdc.string;
    import std.datetime.stopwatch;
    import std.stdio;
    import bench_data;

    // How many times each closure below is run.
    enum iterations = 50_000_000;

    // One format string per closure, in the same order as the closures
    // handed to `benchmark`.
    static immutable string[8] reportFormats = [
        "old_stripLeft ascii %s",
        "new_stripLeft ascii %s",
        "old_stripLeft non_ascii %s",
        "new_stripLeft non_ascii %s",
        "old_stripLeft w_ascii %s",
        "new_stripLeft w_ascii %s",
        "old_stripLeft w_non_ascii %s",
        "new_stripLeft w_non_ascii %s",
    ];

    // Each closure adds the first element of the stripped result to
    // `blackhole`, so the call's result is actually used.
    auto timings = benchmark!(
        () { blackhole += old_stripLeft(ascii_1)[0]; },
        () { blackhole += new_stripLeft(ascii_1)[0]; },
        () { blackhole += old_stripLeft(non_ascii_1)[0]; },
        () { blackhole += new_stripLeft(non_ascii_1)[0]; },
        () { blackhole += old_stripLeft(w_ascii_1)[0]; },
        () { blackhole += new_stripLeft(w_ascii_1)[0]; },
        () { blackhole += old_stripLeft(w_non_ascii_1)[0]; },
        () { blackhole += new_stripLeft(w_non_ascii_1)[0]; }
    )(iterations);

    foreach (idx, fmt; reportFormats)
        writefln(fmt, timings[idx]);
}

bench_data.d

// 2nd module to hide data from optimizer
// NOTE(review): presumably this only works if the module is compiled as a
// separate object from stripBench.d — confirm against the build command.
module bench_data;

// char input whose leading whitespace is ASCII only ("\n\t\v\r").
immutable string ascii_1 = "\n\t\v\rhello world\n\t\v\r";
// char input whose leading whitespace is U+2028 (LINE SEPARATOR), a
// non-ASCII whitespace character.
immutable string non_ascii_1 = "\u2028\u2028hello world\u2028\u2028";

// wchar (UTF-16) counterparts of the two inputs above.
immutable wstring w_ascii_1 = "\n\t\v\rhello world\n\t\v\r"w;
immutable wstring w_non_ascii_1 = "\u2028\u2028hello world\u2028\u2028"w;

// Accumulator the benchmark closures in stripBench.d add their results to.
int blackhole;

burner · 2018-10-11T10:01:19Z

Could you add a test case that combines ASCII and non-ASCII whitespace, to verify that the break-out statements are correct?

n8sh · 2018-10-11T12:01:47Z

@burner Added.

ghost · 2018-10-14T17:42:31Z

std/string.d

+                size_t s = input.length / 2;
+                input = input[s .. $];
+            }))
+    {{


Enlighten me: why the double curly braces here?

To create a scope. static if curly braces don't do that so I need a second pair.

Sorry, I must be tired, but I don't see why a scope is needed here.

Here it's not needed, I just prefer it.

n8sh · 2018-10-15T22:50:34Z

Reworked. Added optimizations for wstring and dstring and avoided duplicate decoding for string.

schveiguy

There are two optimizations here. One thing that isn't crystal clear to me is whether the random access usage is needed for anything but arrays, as we don't know the mechanics of generic random-access ranges (and whether indexing is faster than front/popFront). This code would certainly look a lot cleaner without those extra optimizations, especially the char branch.

std/string.d

schveiguy · 2018-10-16T19:26:50Z

std/string.d

+        {{
+            // ASCII optimization for dynamic arrays & similar.
+            size_t i = 0;
+            for (const size_t end = input.length; i < end; ++i)


The suggested modification in the other branch makes this cleaner as well.

Hm... I was thinking this too could be rewritten:

foreach (i; 0 .. input.length) { auto c = input[i]; if (c >= 0x80) { input = input[i .. $]; goto NonAsciiPath; } if (!std.ascii.isWhite(c)) return input[i .. $]; } return input[$ .. $]; NonAsciiPath:

While it doesn't get rid of the goto, the loop looks cleaner to me.

The rewrite makes the ASCII loop slower while avoiding a call to _d_arraybounds. When processing "\n\t\v\rhello world\n\t\v\r", with ldc2 -O2 the function overall takes ~5% longer and with dmd -O -inline it takes ~30% longer. Obviously when -release is added and no bounds checks are performed the comparison gets worse.

This has to be a cache line thing, because the major work done in the loop (calling isWhite and checking if c is non-ascii) is identical in both cases. In any case, it's only marginally more readable, so the original is fine.

schveiguy · 2018-10-16T19:29:33Z

std/string.d

+            for (const size_t end = input.length; i < end; ++i)
+            {
+                auto c = input[i];
+                if (c >= 0x80) goto NonAsciiPath;


Can we avoid a goto here?

I don't think so.

schveiguy · 2018-10-16T19:35:01Z

std/string.d

+            // Fall through to standard case.
+        }}
+
+        static if (ElementType!Range.sizeof > 1)


How about is(ElementType!Range != ElementEncodingType!Range). The sizeof thing requires one to deduce that you only got here if the element encoding type is char.

std/string.d

PetarKirov · 2018-10-17T06:47:28Z

One thing that isn't crystal clear to me is whether the random access usage is needed for anything but arrays, as we don't know the mechanics of generic random-access ranges (and whether indexing is faster than front/popFront).

I haven't had the time to review this PR; I just wanted to mention that if you're iterating sequentially, indexing with a CPU-predictable index variable (e.g. i++) should be the same as using the range primitives for arrays. However, for other ranges like ndslice, it's faster to use popFront, as indexing will repeat a noticeable amount of work: summing the offsets for each dimension is O(N), while with popFront that's amortized O(1), where N is the number of dimensions.

n8sh · 2018-10-17T15:48:27Z

Changed the PR to only use an index variable for arrays.

Add ASCII fast path to stripLeft & avoid redundant decoding.

schveiguy · 2018-10-17T16:13:01Z

indexing with a CPU predictable index variable (e.g. i++) should be the same as using the range primitives for arrays

I would think indexing and incrementing would be less expensive than front and popFront for arrays, since popFront updates both the pointer and length. But maybe that is made up for by the fact that you are indexing with a constant?

nordlow · 2018-10-17T19:38:25Z

This is a great idea. There are more functions in Phobos that would benefit from ASCII-optimized paths. I managed to find these

find
canFind
startsWith
endsWith
findSplitBefore
findSplitAfter
findSplit

I bet there are more.

This will have a big impact on the performance of text processing, such as parsers, as a large percentage of the input is ASCII-clean.

schveiguy · 2018-10-18T14:46:00Z

ping @wilzbach I can't figure out what went wrong in the buildkite. Seems to be consistent, but there is no real error message except that some script failed:

[ERROR] /var/lib/buildkite-agent/builds/buildkite-agent-01-1/dlang/phobos/build/dlang-dub/test/interactive-remove.sh:10 command failed
[ERROR] Script failure.

Is this a false error?

wilzbach · 2018-10-18T14:48:51Z

Yes.

schveiguy · 2018-10-18T14:49:11Z

Auto-merge toggled on

schveiguy · 2018-10-18T14:49:30Z

OK, thanks. Just used the old mechanism instead.

n8sh requested review from burner and JackStouffer as code owners October 10, 2018 17:40

n8sh force-pushed the strip-ascii branch 2 times, most recently from 683b961 to 4fcb911 Compare October 10, 2018 19:15

n8sh changed the title ~~Add ASCII fast path to strip/stripLeft/stripRight~~ Add ASCII fast path to stripLeft Oct 10, 2018

n8sh force-pushed the strip-ascii branch from 4fcb911 to bddbb58 Compare October 11, 2018 12:01

n8sh force-pushed the strip-ascii branch from bddbb58 to e38e2af Compare October 11, 2018 12:32

ghost reviewed Oct 14, 2018

View reviewed changes

n8sh force-pushed the strip-ascii branch from e38e2af to 1c458f0 Compare October 15, 2018 22:31

n8sh changed the title ~~Add ASCII fast path to stripLeft~~ Add ASCII fast path to stripLeft & avoid unnecessary auto-decoding Oct 15, 2018

n8sh force-pushed the strip-ascii branch from 1c458f0 to 266603a Compare October 15, 2018 22:43

n8sh force-pushed the strip-ascii branch 6 times, most recently from 976b466 to d38016a Compare October 16, 2018 05:45

n8sh mentioned this pull request Oct 16, 2018

Allow no-arg splitter function to work on ranges that aren't standard strings #6700

Merged

schveiguy reviewed Oct 16, 2018

View reviewed changes

n8sh force-pushed the strip-ascii branch 5 times, most recently from 7094a4f to 82e0f2c Compare October 17, 2018 01:11

dlang-bot added the Enhancement label Oct 17, 2018

n8sh added the optimization label Oct 17, 2018

n8sh force-pushed the strip-ascii branch 2 times, most recently from f10c0b8 to 94a38a2 Compare October 17, 2018 01:25

n8sh changed the title ~~Add ASCII fast path to stripLeft & avoid unnecessary auto-decoding~~ Add ASCII fast path to stripLeft & avoid unnecessary decoding Oct 17, 2018

n8sh force-pushed the strip-ascii branch from 94a38a2 to 3262455 Compare October 17, 2018 15:46

Fix issue 19308 - optimize stripLeft

2e6c538

Add ASCII fast path to stripLeft & avoid redundant decoding.

n8sh force-pushed the strip-ascii branch from 3262455 to 2e6c538 Compare October 17, 2018 15:58

schveiguy approved these changes Oct 18, 2018

View reviewed changes

schveiguy added the auto-merge label Oct 18, 2018

schveiguy merged commit 4e6f70a into dlang:master Oct 18, 2018

n8sh mentioned this pull request Nov 16, 2018

Fix Issue 19404 - Optimize std.string.stripRight #6770

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add ASCII fast path to stripLeft & avoid unnecessary decoding #6727

Add ASCII fast path to stripLeft & avoid unnecessary decoding #6727

n8sh commented Oct 10, 2018 •

edited

dlang-bot commented Oct 10, 2018 •

edited

n8sh commented Oct 10, 2018 •

edited

burner commented Oct 11, 2018

n8sh commented Oct 11, 2018

ghost Oct 14, 2018

n8sh Oct 14, 2018

ghost Oct 14, 2018

n8sh Oct 14, 2018

n8sh commented Oct 15, 2018

schveiguy left a comment

schveiguy Oct 16, 2018

schveiguy Oct 17, 2018

n8sh Oct 18, 2018

schveiguy Oct 18, 2018

schveiguy Oct 16, 2018

n8sh Oct 16, 2018

schveiguy Oct 16, 2018

PetarKirov commented Oct 17, 2018

n8sh commented Oct 17, 2018

schveiguy commented Oct 17, 2018

nordlow commented Oct 17, 2018 •

edited

schveiguy commented Oct 18, 2018

wilzbach commented Oct 18, 2018

schveiguy commented Oct 18, 2018

schveiguy commented Oct 18, 2018

Add ASCII fast path to stripLeft & avoid unnecessary decoding #6727

Add ASCII fast path to stripLeft & avoid unnecessary decoding #6727

Conversation

n8sh commented Oct 10, 2018 • edited

dlang-bot commented Oct 10, 2018 • edited

Bugzilla references

Testing this PR locally

n8sh commented Oct 10, 2018 • edited

burner commented Oct 11, 2018

n8sh commented Oct 11, 2018

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

n8sh commented Oct 15, 2018

schveiguy left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

PetarKirov commented Oct 17, 2018

n8sh commented Oct 17, 2018

schveiguy commented Oct 17, 2018

nordlow commented Oct 17, 2018 • edited

schveiguy commented Oct 18, 2018

wilzbach commented Oct 18, 2018

schveiguy commented Oct 18, 2018

schveiguy commented Oct 18, 2018

n8sh commented Oct 10, 2018 •

edited

dlang-bot commented Oct 10, 2018 •

edited

n8sh commented Oct 10, 2018 •

edited

nordlow commented Oct 17, 2018 •

edited