From dd5eb5ff97736ecaec525488413382f94c0053ea Mon Sep 17 00:00:00 2001
From: jmdavis
Date: Wed, 24 Oct 2012 22:28:34 -0700
Subject: [PATCH] Fix for issue# 8890: commonPrefix does not handle unicode correctly.

It was returning partial code points if the first few code units in a
code point matched but not the entire code point.
---
 std/algorithm.d | 110 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 96 insertions(+), 14 deletions(-)

diff --git a/std/algorithm.d b/std/algorithm.d
index 269288e759c..9b4cfb293a8 100644
--- a/std/algorithm.d
+++ b/std/algorithm.d
@@ -317,7 +317,7 @@ module std.algorithm;
 import std.c.string, core.bitop;
 import std.array, std.ascii, std.container, std.conv, std.exception,
     std.functional, std.math, std.metastrings, std.range, std.string,
-    std.traits, std.typecons, std.typetuple, std.uni;
+    std.traits, std.typecons, std.typetuple, std.uni, std.utf; // std.utf: decode, stride and UTFException used by commonPrefix
 
 version(unittest)
 {
@@ -5226,15 +5226,19 @@ Returns the common prefix of two ranges. Example:
 assert(commonPrefix("hello, world", "hello, there") == "hello, ");
 ----
 
-The type of the result is the same as $(D takeExactly(r1, n)), where
-$(D n) is the number of elements that both ranges start with.
+For strings, the result is a slice of $(D r1) which contains the characters that
+both strings start with. For all other types, the type of the result is the
+same as the result of $(D takeExactly(r1, n)), where $(D n) is the number of
+elements that both ranges start with.
  */
 auto commonPrefix(alias pred = "a == b", R1, R2)(R1 r1, R2 r2)
-if (isForwardRange!R1 && isForwardRange!R2)
+if (isForwardRange!R1 && isForwardRange!R2 &&
+    !isNarrowString!R1 && !isNarrowString!R2 && // narrow strings are excluded from this generic overload
+    is(typeof(binaryFun!pred(r1.front, r2.front)))) // pred must be callable on the two front elements
 {
-    static if (isSomeString!R1 && isSomeString!R2
-            && ElementEncodingType!R1.sizeof == ElementEncodingType!R2.sizeof
-               || isRandomAccessRange!R1 && hasLength!R2)
+    static if (isRandomAccessRange!R1 && isRandomAccessRange!R2 && // index-based branch: both ranges must be random access
+               hasLength!R1 && hasLength!R2 && // both lengths are read to compute limit below
+               hasSlicing!R1) // presumably because this branch slices r1 for its result (rest of the branch is outside this hunk)
     {
         immutable limit = min(r1.length, r2.length);
         foreach (i; 0 .. limit)
@@ -5250,21 +5254,99 @@ if (isForwardRange!R1 && isForwardRange!R2)
     {
         auto result = r1.save;
         size_t i = 0;
-        for (; !r1.empty && !r2.empty && binaryFun!pred(r1.front, r2.front);
+        for (;
+             !r1.empty && !r2.empty && binaryFun!pred(r1.front, r2.front);
              ++i, r1.popFront(), r2.popFront())
+        {} // empty body: counting and advancing both happen in the loop header
+        return takeExactly(result, i); // first i elements of the copy of r1 saved before iterating
+    }
+}
+
+auto commonPrefix(alias pred, R1, R2)(R1 r1, R2 r2) // string overload used when pred is given explicitly
+if (isSomeString!R1 && isSomeString!R2 &&
+    !(!isNarrowString!R1 && !isNarrowString!R2) && // i.e. at least one of the two strings is narrow
+    is(typeof(binaryFun!pred(r1.front, r2.front))))
+{
+    auto result = r1.save;
+    immutable len = r1.length; // length of r1 in code units
+    size_t i = 0; // code-unit index in r1 just past the last code point that satisfied pred
+
+    for (size_t j = 0; i < len && !r2.empty; r2.popFront(), i = j) // i only catches up with j after a whole code point passed pred
+    {
+        immutable f = decode(r1, j); // decode the code point starting at j; decode moves j past it
+        if (!binaryFun!pred(f, r2.front))
+            break; // i still marks the start of the failing code point
+    }
+
+    return result[0 .. i]; // slice of r1; i is either 0 or a position decode stopped at
+}
+
+auto commonPrefix(R1, R2)(R1 r1, R2 r2) // string overload for the default "a == b" comparison
+if (isSomeString!R1 && isSomeString!R2 && !(!isNarrowString!R1 && !isNarrowString!R2)) // at least one narrow string
+{
+    static if (ElementEncodingType!R1.sizeof == ElementEncodingType!R2.sizeof) // same code unit size: compare code units without decoding
+    {
+        immutable limit = min(r1.length, r2.length);
+        for (size_t i = 0; i < limit;)
         {
+            immutable codeLen = std.utf.stride(r1, i); // number of code units in the code point starting at r1[i]
+            size_t j = 0; // code units of the current code point that have matched so far
+
+            for (; j < codeLen && i < limit; ++i, ++j)
+            {
+                if (r1[i] != r2[i])
+                    return r1[0 .. i - j]; // mismatch: cut at the start of the current code point, not in the middle of it
+            }
+
+            if (i == limit && j < codeLen) // limit was reached before the current code point of r1 was complete
+                throw new UTFException("Invalid UTF-8 sequence", i); // NOTE(review): message says UTF-8, but wchar/wchar pairs also take this branch - confirm wording
         }
-        return takeExactly(result, i);
+        return r1[0 .. limit]; // every code unit up to limit matched
     }
+    else
+        return commonPrefix!"a == b"(r1, r2); // different code unit sizes: go through the decoding overload
 }
 
 unittest
 {
-    assert(commonPrefix("hello, world", "hello, there") == "hello, ");
-    assert(commonPrefix("hello, ", "hello, world") == "hello, ");
-    assert(equal(commonPrefix("hello, world", "hello, there"w), "hello, "));
-    assert(equal(commonPrefix("hello, world"w, "hello, there"), "hello, "));
-    assert(equal(commonPrefix("hello, world", "hello, there"d), "hello, "));
+    assert(commonPrefix([1, 2, 3], [1, 2, 3, 4, 5]) == [1, 2, 3]); // non-string ranges (int arrays)
+    assert(commonPrefix([1, 2, 3, 4, 5], [1, 2, 3]) == [1, 2, 3]);
+    assert(commonPrefix([1, 2, 3, 4], [1, 2, 3, 4]) == [1, 2, 3, 4]);
+    assert(commonPrefix([1, 2, 3], [7, 2, 3, 4, 5]).empty);
+    assert(commonPrefix([7, 2, 3, 4, 5], [1, 2, 3]).empty);
+    assert(commonPrefix([1, 2, 3], cast(int[])null).empty); // null arrays behave as empty ranges
+    assert(commonPrefix(cast(int[])null, [1, 2, 3]).empty);
+    assert(commonPrefix(cast(int[])null, cast(int[])null).empty);
+
+    foreach (S; TypeTuple!(char[], const(char)[], string, // S: type of the first argument and of the expected result
+                           wchar[], const(wchar)[], wstring,
+                           dchar[], const(dchar)[], dstring))
+    {
+        foreach(T; TypeTuple!(string, wstring, dstring)) // T: type of the second argument
+        {
+            assert(commonPrefix(to!S(""), to!T("")).empty);
+            assert(commonPrefix(to!S(""), to!T("hello")).empty);
+            assert(commonPrefix(to!S("hello"), to!T("")).empty);
+            assert(commonPrefix(to!S("hello, world"), to!T("hello, there")) == to!S("hello, "));
+            assert(commonPrefix(to!S("hello, there"), to!T("hello, world")) == to!S("hello, "));
+            assert(commonPrefix(to!S("hello, "), to!T("hello, world")) == to!S("hello, "));
+            assert(commonPrefix(to!S("hello, world"), to!T("hello, ")) == to!S("hello, "));
+            assert(commonPrefix(to!S("hello, world"), to!T("hello, world")) == to!S("hello, world"));
+
+            // Bug# 8890: the result must never end in the middle of a code point
+            assert(commonPrefix(to!S("Пиво"), to!T("Пони"))== to!S("П"));
+            assert(commonPrefix(to!S("Пони"), to!T("Пиво"))== to!S("П"));
+            assert(commonPrefix(to!S("Пиво"), to!T("Пиво"))== to!S("Пиво"));
+            assert(commonPrefix(to!S("\U0010FFFF\U0010FFFB\U0010FFFE"),
+                                to!T("\U0010FFFF\U0010FFFB\U0010FFFC")) == to!S("\U0010FFFF\U0010FFFB"));
+            assert(commonPrefix(to!S("\U0010FFFF\U0010FFFB\U0010FFFC"),
+                                to!T("\U0010FFFF\U0010FFFB\U0010FFFE")) == to!S("\U0010FFFF\U0010FFFB"));
+            assert(commonPrefix!"a != b"(to!S("Пиво"), to!T("онво")) == to!S("Пи")); // custom predicate: prefix over which the code points differ
+            assert(commonPrefix!"a != b"(to!S("онво"), to!T("Пиво")) == to!S("он"));
+        }
+    }
+
+    assertThrown!UTFException(commonPrefix("\U0010FFFF\U0010FFFB", "\U0010FFFF\U0010FFFB"[0 .. $ - 1])); // second argument is cut off inside its last code point
 }
 
 // findAdjacent