Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix issue 18789 - std.stdio messes up UTF conversions on output #6469

Merged
merged 5 commits into from Oct 6, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
142 changes: 116 additions & 26 deletions std/stdio.d
Expand Up @@ -2923,14 +2923,16 @@ is empty, throws an `Exception`. In case of an I/O error throws
// the file's orientation (byte- or wide-oriented)
int orientation_;

// A buffer for when we need to transcode.
// Buffers for when we need to transcode.
wchar highSurrogate = '\0'; // '\0' indicates empty
void highSurrogateShouldBeEmpty() @safe
{
import std.utf : UTFException;
if (highSurrogate != '\0')
throw new UTFException("unpaired surrogate UTF-16 value");
}
char[4] rbuf8;
size_t rbuf8Filled = 0;
public:

this(ref File f) @trusted
Expand Down Expand Up @@ -3018,6 +3020,7 @@ is empty, throws an `Exception`. In case of an I/O error throws
void put(C)(scope C c) @safe if (isSomeChar!C || is(C : const(ubyte)))
{
import std.traits : Parameters;
import std.utf : decodeFront, encode, stride;
static auto trustedFPUTC(int ch, _iobuf* h) @trusted
{
return FPUTC(ch, h);
Expand All @@ -3029,46 +3032,70 @@ is empty, throws an `Exception`. In case of an I/O error throws

static if (c.sizeof == 1)
{
// simple char
highSurrogateShouldBeEmpty();
if (orientation_ <= 0) trustedFPUTC(c, handle_);
else trustedFPUTWC(c, handle_);
else if (c <= 0x7F) trustedFPUTWC(c, handle_);
else if (c >= 0b1100_0000) // start byte of multibyte sequence
{
rbuf8[0] = c;
rbuf8Filled = 1;
}
else // continuation byte of multibyte sequence
{
rbuf8[rbuf8Filled] = c;
++rbuf8Filled;
if (stride(rbuf8[]) == rbuf8Filled) // sequence is complete
{
char[] str = rbuf8[0 .. rbuf8Filled];
immutable dchar d = decodeFront(str);
wchar_t[4 / wchar_t.sizeof] wbuf;
immutable size = encode(wbuf, d);
foreach (i; 0 .. size)
trustedFPUTWC(wbuf[i], handle_);
rbuf8Filled = 0;
}
}
}
else static if (c.sizeof == 2)
{
import std.utf : encode, decode;
import std.utf : decode;

if (orientation_ <= 0)
if (c <= 0x7F)
{
if (c <= 0x7F)
{
highSurrogateShouldBeEmpty();
trustedFPUTC(c, handle_);
}
else if (0xD800 <= c && c <= 0xDBFF) // high surrogate
highSurrogateShouldBeEmpty();
if (orientation_ <= 0) trustedFPUTC(c, handle_);
else trustedFPUTWC(c, handle_);
}
else if (0xD800 <= c && c <= 0xDBFF) // high surrogate
{
highSurrogateShouldBeEmpty();
highSurrogate = c;
}
else // standalone or low surrogate
{
dchar d = c;
if (highSurrogate != '\0')
{
highSurrogateShouldBeEmpty();
highSurrogate = c;
immutable wchar[2] rbuf = [highSurrogate, c];
size_t index = 0;
d = decode(rbuf[], index);
highSurrogate = 0;
}
else // standalone or low surrogate
if (orientation_ <= 0)
{
dchar d = c;
if (highSurrogate != '\0')
{
immutable wchar[2] rbuf = [highSurrogate, c];
size_t index = 0;
d = decode(rbuf[], index);
highSurrogate = 0;
}
char[4] wbuf;
immutable size = encode(wbuf, d);
foreach (i; 0 .. size)
trustedFPUTC(wbuf[i], handle_);
}
}
else
{
trustedFPUTWC(c, handle_);
else
{
wchar_t[4 / wchar_t.sizeof] wbuf;
immutable size = encode(wbuf, d);
foreach (i; 0 .. size)
trustedFPUTWC(wbuf[i], handle_);
}
rbuf8Filled = 0;
}
}
else // 32-bit characters
Expand Down Expand Up @@ -3640,6 +3667,69 @@ void main()
}
assert(std.file.readText!string(deleteme).stripLeft("\uFEFF") == "foobar");
}
@safe unittest // char/wchar -> wchar_t
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any reason for this unittest to not just be @system , what with all the @trusted lambdas?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The thing that's actually being tested is writer.put(s). And we might want to ensure that that's @safe. The @trusted parts are just setup for the test.

But truth be told, I just started with @safe and added @trusted as needed. I can change it to @system if you like that better.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fair enough.

{
import core.stdc.locale : LC_CTYPE, setlocale;
import core.stdc.wchar_ : fwide;
import std.algorithm.searching : any, endsWith;
import std.conv : text;
import std.meta : AliasSeq;
import std.string : fromStringz, stripLeft;
static import std.file;
auto deleteme = testFilename();
scope(exit) std.file.remove(deleteme);
const char* oldCt = () @trusted {
return setlocale(LC_CTYPE, null);
}();
const utf8 = ["en_US.UTF-8", "C.UTF-8", ".65001"].any!((loc) @trusted {
return setlocale(LC_CTYPE, loc.ptr).fromStringz.endsWith(loc);
});
scope(exit) () @trusted { setlocale(LC_CTYPE, oldCt); } ();
version (DIGITAL_MARS_STDIO) // DM can't handle Unicode above U+07FF.
{
alias strs = AliasSeq!("xä\u07FE", "yö\u07FF"w);
}
else
{
alias strs = AliasSeq!("xä\U0001F607", "yö\U0001F608"w);
}
{
auto f = File(deleteme, "w");
version (MICROSOFT_STDIO)
{
() @trusted { setmode(fileno(f.getFP()), _O_U8TEXT); } ();
}
else
{
assert(fwide(f.getFP(), 1) == 1);
}
auto writer = f.lockingTextWriter();
assert(writer.orientation_ == 1);
static foreach (s; strs) writer.put(s);
}
assert(std.file.readText!string(deleteme).stripLeft("\uFEFF") ==
text(strs));
}
@safe unittest // https://issues.dlang.org/show_bug.cgi?id=18789
{
static import std.file;
auto deleteme = testFilename();
scope(exit) std.file.remove(deleteme);
// converting to char
{
auto f = File(deleteme, "w");
f.writeln("\U0001F608"w); // UTFException
}
// converting to wchar_t
{
auto f = File(deleteme, "w,ccs=UTF-16LE");
// from char
f.writeln("ö"); // writes garbage
f.writeln("\U0001F608"); // ditto
// from wchar
f.writeln("\U0001F608"w); // leads to ErrnoException
}
}

@safe unittest
{
Expand Down