frostney · frostney · Apr 5, 2026
diff --git a/docs/testing.md b/docs/testing.md
@@ -253,6 +253,8 @@ python3 scripts/run_toml_test_suite.py --harness=./build/GocciaTOMLCheck --outpu
 
 Unlike the YAML script, this harness compares both parse/fail behavior and the official tagged JSON fixtures for valid cases. It uses a Pascal decoder built around `TGocciaTOMLParser.ParseDocument(...)` so TOML scalar kinds like `integer`, `float`, `datetime`, `datetime-local`, `date-local`, and `time-local` remain visible during compliance checks even though the normal runtime surface still maps date/time values to strings.
 
+The harness reads TOML suite files as raw UTF-8 bytes and preserves that UTF-8 code page when passing text into the Pascal parser. That extra step matters on Windows, where an implicit conversion through the local ANSI code page would otherwise corrupt Unicode keys and string values and produce false TOML compliance failures.
+
 The TOML runner exits non-zero when any case fails or times out, so it is safe to use directly in CI. When `--harness` is omitted it compiles `scripts/GocciaTOMLCheck.dpr` automatically; CI uses a prebuilt harness from the matrix build artifacts instead.
 
 ### Run Pascal Unit Tests

diff --git a/scripts/GocciaTOMLCheck.dpr b/scripts/GocciaTOMLCheck.dpr
@@ -12,15 +12,22 @@ uses
   Goccia.TOML,
   Goccia.Values.Primitives;
 
-function LoadUTF8File(const APath: string): UTF8String;
+function LoadUTF8File(const APath: string): string;
+const
+  UTF8_CODE_PAGE = 65001;
 var
   Stream: TFileStream;
+  UTF8Text: RawByteString;
 begin
   Stream := TFileStream.Create(APath, fmOpenRead or fmShareDenyWrite);
   try
-    SetLength(Result, Stream.Size);
-    if Length(Result) > 0 then
-      Stream.ReadBuffer(Pointer(Result)^, Length(Result));
+    SetLength(UTF8Text, Stream.Size);
+    if Length(UTF8Text) > 0 then
+    begin
+      Stream.ReadBuffer(Pointer(UTF8Text)^, Length(UTF8Text));
+      SetCodePage(UTF8Text, UTF8_CODE_PAGE, False);
+    end;
+    Result := UTF8Text;
   finally
     Stream.Free;
   end;
@@ -134,7 +141,7 @@ var
   ExitCode: Integer;
   Parser: TGocciaTOMLParser;
   Root: TGocciaTOMLNode;
-  SourceText: UTF8String;
+  SourceText: string;
 begin
   if ParamCount <> 1 then
     Halt(2);
@@ -146,7 +153,7 @@ begin
   try
     try
       SourceText := LoadUTF8File(ParamStr(1));
-      Root := Parser.ParseDocument(string(SourceText));
+      Root := Parser.ParseDocument(SourceText);
       try
         WriteLn(SerializeNode(Root));
         ExitCode := 0;

diff --git a/units/Goccia.TOML.Test.pas b/units/Goccia.TOML.Test.pas
@@ -16,6 +16,7 @@   TTOMLParserTests = class(TTestSuite)
     function GetChildOrFail(const AParent: TGocciaTOMLNode;
       const AKey: string): TGocciaTOMLNode;
     procedure TestParseDocumentNormalizesCRLFInMultilineStrings;
+    procedure TestParseDocumentPreservesUTF8BytesForUnicodeKeysAndValues;
     procedure TestParseDocumentTracksScalarKinds;
     procedure TestParseDocumentTracksArrayElementKinds;
   public
@@ -30,6 +31,8 @@ procedure TTOMLParserTests.SetupTests;
     TestParseDocumentTracksArrayElementKinds);
   Test('ParseDocument normalizes CRLF in multiline strings',
     TestParseDocumentNormalizesCRLFInMultilineStrings);
+  Test('ParseDocument preserves UTF-8 bytes for Unicode keys and values',
+    TestParseDocumentPreservesUTF8BytesForUnicodeKeysAndValues);
 end;
 
 function TTOMLParserTests.GetChildOrFail(const AParent: TGocciaTOMLNode;
@@ -150,6 +153,46 @@ procedure TTOMLParserTests.TestParseDocumentNormalizesCRLFInMultilineStrings;
   end;
 end;
 
+procedure TTOMLParserTests.TestParseDocumentPreservesUTF8BytesForUnicodeKeysAndValues; // Parses a one-line document with a non-ASCII quoted key and value, built from explicit UTF-8 bytes, and checks both come back unchanged.
+const
+  UTF8_CODE_PAGE = 65001; // Code page identifier for UTF-8.
+var
+  Parser: TGocciaTOMLParser;
+  RawSourceText: RawByteString;
+  RawUnicodeKey: RawByteString;
+  RawUnicodeValue: RawByteString;
+  Root: TGocciaTOMLNode;
+  SourceText: string;
+  UnicodeKey: string;
+  UnicodeNode: TGocciaTOMLNode;
+  UnicodeValue: string;
+begin
+  RawUnicodeKey := #$CE#$B4; // Bytes CE B4 are the UTF-8 encoding of U+03B4 (Greek small letter delta).
+  SetCodePage(RawUnicodeKey, UTF8_CODE_PAGE, False); // Convert = False: only tag the bytes as UTF-8, do not transcode them.
+  UnicodeKey := RawUnicodeKey; // Expected key, as the same string type that is passed to GetChildOrFail.
+
+  RawUnicodeValue := 'Jos' + #$C3#$A9; // "Jos" followed by C3 A9, the UTF-8 encoding of U+00E9 (e with acute).
+  SetCodePage(RawUnicodeValue, UTF8_CODE_PAGE, False); // Tag only, no transcoding.
+  UnicodeValue := RawUnicodeValue; // Expected value compared against CanonicalValue below.
+
+  RawSourceText := '"' + RawUnicodeKey + '" = "' + RawUnicodeValue + '"' + LineEnding; // Builds `"<key>" = "<value>"` plus a line ending. NOTE(review): mixing UTF-8-tagged operands with literals may transcode via the system code page before the re-tag below; confirm on a non-UTF-8 Windows locale.
+  SetCodePage(RawSourceText, UTF8_CODE_PAGE, False); // Re-tag the concatenated bytes as UTF-8 without transcoding.
+  SourceText := RawSourceText; // Hand the text to the parser as a plain `string`.
+
+  Parser := TGocciaTOMLParser.Create;
+  try
+    Root := Parser.ParseDocument(SourceText);
+    try
+      UnicodeNode := GetChildOrFail(Root, UnicodeKey); // Looks up the child under the Unicode key; presumably fails the test when it is missing (helper body not visible here).
+      Expect<string>(UnicodeNode.CanonicalValue).ToBe(UnicodeValue); // The value must match the original UTF-8 text exactly.
+    finally
+      Root.Free; // The test frees the returned document tree itself.
+    end;
+  finally
+    Parser.Free;
+  end;
+end;
+
 begin
   TestRunnerProgram.AddSuite(TTOMLParserTests.Create('TOML Parser'));
   TestRunnerProgram.Run;