Skip to content

Commit

Permalink
Merge pull request #3880 from burner/bom2
Browse files Browse the repository at this point in the history
Byte Order Mark (BOM) handling functions rewrite
  • Loading branch information
schveiguy committed May 2, 2016
2 parents aa8cf86 + 3d37aee commit 30587d4
Showing 1 changed file with 172 additions and 0 deletions.
172 changes: 172 additions & 0 deletions std/encoding.d
Original file line number Diff line number Diff line change
Expand Up @@ -3361,3 +3361,175 @@ version(unittest)
return "0123456789ABCDEF"[n & 0xF];
}
}

import std.typecons;

/** Definitions of common Byte Order Marks.
The elements of the $(D enum) can be used as indices into $(D bomTable) to get
the matching $(D BOMSeq).

$(D utf7) has several possible byte sequences. They occupy the indices 3 to 7
of $(D bomTable), which is why the numbering continues with 8 at $(D utf1).
Indexing $(D bomTable) with $(D BOM.utf7) yields only the first of those
entries.
*/
enum BOM
{
none = 0, /// no BOM was found
utf32be = 1, /// [0x00, 0x00, 0xFE, 0xFF]
utf32le = 2, /// [0xFF, 0xFE, 0x00, 0x00]
utf7 = 3, /** [0x2B, 0x2F, 0x76, 0x38],
[0x2B, 0x2F, 0x76, 0x39],
[0x2B, 0x2F, 0x76, 0x2B],
[0x2B, 0x2F, 0x76, 0x2F],
[0x2B, 0x2F, 0x76, 0x38, 0x2D]
*/
utf1 = 8, /// [0xF7, 0x64, 0x4C]
utfebcdic = 9, /// [0xDD, 0x73, 0x66, 0x73]
scsu = 10, /// [0x0E, 0xFE, 0xFF]
bocu1 = 11, /// [0xFB, 0xEE, 0x28]
gb18030 = 12, /// [0x84, 0x31, 0x95, 0x33]
utf8 = 13, /// [0xEF, 0xBB, 0xBF]
utf16be = 14, /// [0xFE, 0xFF]
utf16le = 15 /// [0xFF, 0xFE]
}

/** The type stored inside $(D bomTable): a $(D BOM) identifier, accessible as
$(D schema), paired with the bytes of the mark, accessible as $(D sequence).
*/
alias BOMSeq = Tuple!(BOM, "schema", ubyte[], "sequence");

/** Mapping of a byte sequence to $(B Byte Order Mark (BOM)).

The entry at index 0 stands for "no BOM" and carries no bytes.
The order of the remaining entries is significant: $(D getBOM) walks the table
from front to back and returns the first entry whose sequence is a prefix of
the input. A sequence that begins with another, shorter sequence therefore has
to be listed before that shorter one.
*/
immutable bomTable = [
BOMSeq(BOM.none, null),
BOMSeq(BOM.utf32be, cast(ubyte[])([0x00, 0x00, 0xFE, 0xFF])),
// utf32le starts with the utf16le mark, so it has to come before utf16le
BOMSeq(BOM.utf32le, cast(ubyte[])([0xFF, 0xFE, 0x00, 0x00])),
BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x39])),
BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2B])),
BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2F])),
// the five byte utf7 mark has to come before its four byte prefix
BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38, 0x2D])),
BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38])),
BOMSeq(BOM.utf1, cast(ubyte[])([0xF7, 0x64, 0x4C])),
BOMSeq(BOM.utfebcdic, cast(ubyte[])([0xDD, 0x73, 0x66, 0x73])),
BOMSeq(BOM.scsu, cast(ubyte[])([0x0E, 0xFE, 0xFF])),
BOMSeq(BOM.bocu1, cast(ubyte[])([0xFB, 0xEE, 0x28])),
BOMSeq(BOM.gb18030, cast(ubyte[])([0x84, 0x31, 0x95, 0x33])),
BOMSeq(BOM.utf8, cast(ubyte[])([0xEF, 0xBB, 0xBF])),
BOMSeq(BOM.utf16be, cast(ubyte[])([0xFE, 0xFF])),
BOMSeq(BOM.utf16le, cast(ubyte[])([0xFF, 0xFE]))
];

/** Looks up which $(D BOM), if any, the given $(D input) starts with.

The entries of $(D bomTable) are tried in table order and the first one whose
byte sequence is a prefix of $(D input) is returned. When none of them
matches, the entry for $(D BOM.none) is returned instead.

The check is done on $(D input.save), so the $(D BOM) bytes are not consumed
from the passed range. If you pass a range with reference semantics, make
sure that its $(D save) produces a deep copy.

Params:
    input = The sequence of $(D ubyte)s to inspect for a leading $(D BOM)

Returns:
    The $(D BOMSeq) matching the start of $(D input), or the $(D BOM.none)
    entry of $(D bomTable) if there is no match.
*/
immutable(BOMSeq) getBOM(Range)(Range input)
    if (isForwardRange!Range && is(Unqual!(ElementType!Range) == ubyte))
{
    import std.algorithm.searching : startsWith;

    // Slot 0 is the BOM.none placeholder; its empty sequence would be a
    // prefix of every input, so the scan begins at slot 1.
    for (size_t slot = 1; slot < bomTable.length; ++slot)
    {
        if (input.save.startsWith(bomTable[slot].sequence))
            return bomTable[slot];
    }

    return bomTable[0];
}

///
unittest
{
    import std.format : format;

    // U+FEFF stored as a single UTF-32 code unit; the order of its bytes in
    // memory follows the endianness of the machine running the test.
    auto text = dchar(0x0000FEFF) ~ "Hello World"d;
    auto found = getBOM(cast(ubyte[])text);

    version(BigEndian)
    {
        enum expected = BOM.utf32be;
    }
    else
    {
        enum expected = BOM.utf32le;
    }

    assert(found.schema == expected, format("%s", found.schema));
}

// Every table entry, followed by some payload, must be detected as its own
// schema.
unittest
{
    import std.format : format;

    foreach (pos, entry; bomTable)
    {
        auto payload = entry.sequence ~ cast(ubyte[])"hello world";
        auto detected = getBOM(payload);
        assert(detected.schema == bomTable[pos].schema);

        // Rows 4 to 7 hold the additional utf7 marks; their position in the
        // table does not equal the numeric value of BOM.utf7.
        const bool extraUtf7Row = pos >= 4 && pos <= 7;
        if (!extraUtf7Row)
        {
            assert(detected.schema == BOM.init + pos);
            assert(detected.sequence == entry.sequence);
        }
    }
}

// getBOM must also work on a user defined range that is not an array.
unittest
{
    // Thin range wrapper around a ubyte[]; save returns a copy of the struct.
    struct BOMInputRange
    {
        ubyte[] arr;

        @property ubyte front() { return this.arr.front; }

        @property bool empty() { return this.arr.empty; }

        void popFront() { this.arr = this.arr[1 .. $]; }

        @property typeof(this) save() { return this; }
    }

    static assert( isInputRange!BOMInputRange);
    static assert(!isArray!BOMInputRange);

    ubyte[] padding = [0,0,0,0];

    foreach (entry; bomTable[1 .. $])
    {
        // A range holding exactly the mark is recognised and left untouched.
        {
            auto probe = BOMInputRange(entry.sequence.dup);

            auto hit = getBOM(probe);
            assert(hit.schema == entry.schema);
            assert(probe.arr == entry.sequence);
        }

        // Only the first byte of the mark followed by zeros is not a BOM.
        {
            auto truncated = entry.sequence[0 .. 1].dup ~ padding;
            const size_t lengthBefore = truncated.length;
            assert(lengthBefore - 4 < entry.sequence.length);

            auto probe = BOMInputRange(truncated.dup);
            auto miss = getBOM(probe);
            assert(miss.schema == BOM.none);
            assert(truncated.length == lengthBefore);
        }
    }
}

/** Constant defining a fully decoded BOM, i.e. the code point U+FEFF. */
enum dchar utfBOM = 0xfeff;

0 comments on commit 30587d4

Please sign in to comment.