Skip to content

Commit

Permalink
Merge pull request #6113 from jmdavis/issue15949
Browse files Browse the repository at this point in the history
Fix issue 15949: Make readText check BOMs.
merged-on-behalf-of: Vladimir Panteleev <github@thecybershadow.net>
  • Loading branch information
dlang-bot committed Feb 14, 2018
2 parents aebfc2d + 5d52a81 commit d43925e
Show file tree
Hide file tree
Showing 2 changed files with 235 additions and 27 deletions.
20 changes: 20 additions & 0 deletions changelog/std-file-readText.dd
@@ -0,0 +1,20 @@
readText now checks BOMs

$(REF readText, std, file) now checks for a
$(HTTP https://en.wikipedia.org/wiki/Byte_order_mark, BOM). If a BOM is present
and it is for UTF-8, UTF-16, or UTF-32, $(REF readText, std, file) verifies
that it matches the requested string type and the endianness of the machine,
and if there is a mismatch, a $(REF UTFException, std, utf) is thrown without
bothering to validate the string.

If there is no BOM, or if the BOM is not for UTF-8, UTF-16, or UTF-32, then the
behavior is what it's always been, and UTF validation continues as per normal,
so if the text isn't valid for the requested string type, a
$(REF UTFException, std, utf) will be thrown.

In addition, before the buffer is cast to the requested string type, the
alignment is checked (e.g. 5 bytes don't fit cleanly in an array of $(D wchar)
or $(D dchar)), and a $(REF UTFException, std, utf) is now thrown if the
number of bytes does not align with the requested string type. Previously, the
alignment was not checked before casting, so if there was an alignment mismatch,
the cast would throw an Error, killing the program.
242 changes: 215 additions & 27 deletions std/file.d
Expand Up @@ -465,56 +465,244 @@ version (linux) @safe unittest
assert(read(deleteme) == "abcd");
}

/********************************************
Read and validates (using $(REF validate, std,utf)) a text file. $(D S)
can be a type of array of characters of any width and constancy. No
width conversion is performed; if the width of the characters in file
$(D name) is different from the width of elements of $(D S),
validation will fail.
Params:
name = string or range of characters representing the file _name
/++
Reads and validates (using $(REF validate, std, utf)) a text file. S can be
an array of any character type. However, no width or endian conversions are
performed. So, if the width or endianness of the characters in the given
file differs from the width or endianness of the element type of S, then
validation will fail.
Returns: Array of characters read.
Params:
name = string or range of characters representing the file _name
Throws: $(D FileException) on file error, $(D UTFException) on UTF
decoding error.
*/
Returns: Array of characters read.
S readText(S = string, R)(R name)
if (isSomeString!S &&
(isInputRange!R && !isInfinite!R && isSomeChar!(ElementEncodingType!R) || isSomeString!R) &&
!isConvertibleToString!R)
Throws: $(LREF FileException) if there is an error reading the file,
$(REF UTFException, std, utf) on UTF decoding error.
+/
S readText(S = string, R)(auto ref R name)
if (isSomeString!S && (isInputRange!R && !isInfinite!R && isSomeChar!(ElementType!R) || is(StringTypeOf!R)))
{
    // Reads the whole file `name` and returns its bytes reinterpreted as S
    // (no width or endian conversion is performed).
    //
    // Steps:
    //   1. If the data starts with a UTF-8/16/32 BOM that does not match the
    //      code unit type of S (or, for UTF-16/32, the byte order of the
    //      machine), a UTFException naming the mismatch is thrown.
    //   2. If the byte count is not a multiple of the code unit size of S,
    //      a UTFException is thrown instead of letting the array cast fail.
    //   3. The data is cast to S and checked with std.utf.validate, which
    //      throws UTFException on invalid text.
    //
    // Any other BOM (or no BOM at all) falls through to plain validation.
    import std.encoding : getBOM, BOM;
    import std.format : format;
    import std.utf : UTFException, validate;

    // Types that merely convert to a string (alias this, enums, ...) are
    // converted up front; ranges of characters are handed to read() as is.
    static if (is(StringTypeOf!R))
        StringTypeOf!R filename = name;
    else
        auto filename = name;

    // The casts between void[], ubyte[] and S are confined to this helper so
    // that the rest of the function can have its safety inferred.
    static auto trustedCast(T)(void[] buf) @trusted { return cast(T) buf; }
    auto data = trustedCast!(ubyte[])(read(filename));

    immutable bomSeq = getBOM(data);
    immutable bom = bomSeq.schema;

    static if (is(Unqual!(ElementEncodingType!S) == char))
    {
        // UTF-8 requested: UTF-8 has no byte order, so only a BOM of a
        // different width is an error.
        with(BOM) switch (bom)
        {
            case utf16be:
            case utf16le: throw new UTFException("UTF-8 requested. BOM is for UTF-16");
            case utf32be:
            case utf32le: throw new UTFException("UTF-8 requested. BOM is for UTF-32");
            default: break;
        }
    }
    else static if (is(Unqual!(ElementEncodingType!S) == wchar))
    {
        // UTF-16 requested: the BOM must be UTF-16 in the machine's byte order.
        with(BOM) switch (bom)
        {
            case utf8: throw new UTFException("UTF-16 requested. BOM is for UTF-8");
            case utf16be:
            {
                version(BigEndian)
                    break;
                else
                    throw new UTFException("BOM is for UTF-16 BE on Little Endian machine");
            }
            case utf16le:
            {
                version(BigEndian)
                    throw new UTFException("BOM is for UTF-16 LE on Big Endian machine");
                else
                    break;
            }
            case utf32be:
            case utf32le: throw new UTFException("UTF-16 requested. BOM is for UTF-32");
            default: break;
        }
    }
    else
    {
        // UTF-32 requested: the BOM must be UTF-32 in the machine's byte order.
        with(BOM) switch (bom)
        {
            case utf8: throw new UTFException("UTF-32 requested. BOM is for UTF-8");
            case utf16be:
            case utf16le: throw new UTFException("UTF-32 requested. BOM is for UTF-16");
            case utf32be:
            {
                version(BigEndian)
                    break;
                else
                    throw new UTFException("BOM is for UTF-32 BE on Little Endian machine");
            }
            case utf32le:
            {
                version(BigEndian)
                    throw new UTFException("BOM is for UTF-32 LE on Big Endian machine");
                else
                    break;
            }
            default: break;
        }
    }

    // Casting a byte array whose length is not a multiple of the element size
    // fails at run time with an Error; report it as a UTFException instead.
    if (data.length % ElementEncodingType!S.sizeof != 0)
        throw new UTFException(format!"The content of %s is not UTF-%s"(filename, ElementEncodingType!S.sizeof * 8));

    auto result = trustedCast!S(data);
    validate(result);
    return result;
}

///
// Read file with UTF-8 text.
@safe unittest
{
// Round-trip check: write a short ASCII/UTF-8 file and read it back with the
// default string type (S = string).
import std.exception : enforce;
write(deleteme, "abc"); // deleteme is the name of a temporary file
scope(exit) remove(deleteme); // delete the temporary file when the test scope ends
string content = readText(deleteme);
// NOTE(review): the enforce and the assert below test the same condition; one
// of them is presumably a removed/added pair from the rendered diff — confirm
// which one is meant to stay.
enforce(content == "abc");
assert(content == "abc");
}

/// ditto
S readText(S = string, R)(auto ref R name)
if (isConvertibleToString!R)
// Read file with UTF-8 text but try to read it as UTF-16.
@safe unittest
{
    import std.exception : collectException;
    import std.utf : UTFException;

    // Three bytes of UTF-8 text on disk.
    write(deleteme, "abc");
    scope(exit) remove(deleteme);

    // Asking for 16-bit code units must be rejected with a UTFException,
    // because the file is not valid UTF-16.
    auto failure = collectException!UTFException(readText!wstring(deleteme));
    assert(failure !is null);
}

// Read file with UTF-16 text.
@safe unittest
{
// NOTE(review): the `return` below refers to S, R and name, none of which are
// declared in this unittest. It looks like a leftover line from the removed
// isConvertibleToString overload in the rendered diff — confirm and drop it.
return readText!(S, StringTypeOf!R)(name);
import std.algorithm.searching : skipOver;

// Write UTF-16 text that starts with the U+FEFF byte order mark.
write(deleteme, "\uFEFFabc"w); // With BOM
scope(exit) remove(deleteme); // delete the temporary file when the test scope ends
auto content = readText!wstring(deleteme);
// readText returns the BOM as part of the text; it is not stripped.
assert(content == "\uFEFFabc"w);
// The caller removes a leading BOM, if there is one, with skipOver.
content.skipOver('\uFEFF');
assert(content == "abc"w);
}

@safe unittest
{
// Compile-time only: readText must accept a TestAliasedString argument.
// NOTE(review): TestAliasedString is declared outside this view; presumably a
// test type that converts to string via alias this — confirm.
static assert(__traits(compiles, readText(TestAliasedString(null))));
}

@system unittest
{
    import std.array : appender;
    import std.bitmanip : append, Endian;
    import std.exception : assertThrown;
    import std.path : buildPath;
    import std.utf : UTFException;

    // Serialises each code unit of `text` into bytes using the given byte order.
    static ubyte[] toBytes(Endian order, C)(const(C)[] text)
    {
        auto sink = appender!(ubyte[])();
        foreach (C unit; text)
            append!(C, order)(sink, unit);
        return sink.data;
    }

    // Which of the two byte orders matches the machine running the test.
    version(BigEndian)
        enum hostIsBigEndian = true;
    else
        enum hostIsBigEndian = false;

    // All fixture files live in one scratch directory that is removed at the end.
    mkdir(deleteme);
    scope(exit) rmdirRecurse(deleteme);

    immutable none8 = buildPath(deleteme, "none8");
    immutable none16 = buildPath(deleteme, "none16");
    immutable utf8 = buildPath(deleteme, "utf8");
    immutable utf16be = buildPath(deleteme, "utf16be");
    immutable utf16le = buildPath(deleteme, "utf16le");
    immutable utf32be = buildPath(deleteme, "utf32be");
    immutable utf32le = buildPath(deleteme, "utf32le");
    immutable utf7 = buildPath(deleteme, "utf7");

    // Expected contents, including the BOM where the file has one.
    auto textWithBom8 = (cast(char[])[0xEF, 0xBB, 0xBF]) ~ "京都市";
    auto textWithBom7 = (cast(char[])[0x2B, 0x2F, 0x76, 0x38, 0x2D]) ~ "foobar";
    immutable textWithBom16 = "\uFEFF京都市"w;
    immutable textWithBom32 = "\U0000FEFF京都市"d;

    // Fixtures: two files without a BOM, then one file per BOM flavour.
    write(none8, "京都市");
    write(none16, "京都市"w);
    write(utf8, textWithBom8);
    write(utf16be, toBytes!(Endian.bigEndian)(textWithBom16));
    write(utf16le, toBytes!(Endian.littleEndian)(textWithBom16));
    write(utf32be, toBytes!(Endian.bigEndian)(textWithBom32));
    write(utf32le, toBytes!(Endian.littleEndian)(textWithBom32));
    write(utf7, textWithBom7);

    // The file whose byte order matches the host is readable, the other is not.
    immutable native16 = hostIsBigEndian ? utf16be : utf16le;
    immutable foreign16 = hostIsBigEndian ? utf16le : utf16be;
    immutable native32 = hostIsBigEndian ? utf32be : utf32le;
    immutable foreign32 = hostIsBigEndian ? utf32le : utf32be;

    // Requested as UTF-8 (the default).
    foreach (path; [none16, utf16be, utf16le, utf32be, utf32le])
        assertThrown!UTFException(readText(path));
    assert(readText(utf8) == textWithBom8);
    assert(readText(utf7) == textWithBom7);

    // Requested as UTF-16.
    foreach (path; [none8, utf8, foreign16, utf32be, utf32le, utf7])
        assertThrown!UTFException(readText!wstring(path));
    assert(readText!wstring(none16) == "京都市"w);
    assert(readText!wstring(native16) == textWithBom16);

    // Requested as UTF-32.
    foreach (path; [utf8, utf16be, utf16le, foreign32, utf7])
        assertThrown!UTFException(readText!dstring(path));
    assert(readText!dstring(native32) == textWithBom32);
}

/*********************************************
Write $(D buffer) to file $(D name).
Expand Down

0 comments on commit d43925e

Please sign in to comment.