From 242d40864011374eabc4323c71fca96bfe006587 Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Sun, 5 Apr 2026 10:36:45 +0100 Subject: [PATCH] Preserve UTF-8 in TOML compliance harness --- docs/testing.md | 2 ++ scripts/GocciaTOMLCheck.dpr | 19 ++++++++++------ units/Goccia.TOML.Test.pas | 43 +++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 6 deletions(-) diff --git a/docs/testing.md b/docs/testing.md index dee5d651..ca03294d 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -253,6 +253,8 @@ python3 scripts/run_toml_test_suite.py --harness=./build/GocciaTOMLCheck --outpu Unlike the YAML script, this harness compares both parse/fail behavior and the official tagged JSON fixtures for valid cases. It uses a Pascal decoder built around `TGocciaTOMLParser.ParseDocument(...)` so TOML scalar kinds like `integer`, `float`, `datetime`, `datetime-local`, `date-local`, and `time-local` remain visible during compliance checks even though the normal runtime surface still maps date/time values to strings. +The harness reads TOML suite files as raw UTF-8 bytes and preserves that UTF-8 code page when passing text into the Pascal parser. That extra step matters on Windows, where an implicit conversion through the local ANSI code page would otherwise corrupt Unicode keys and string values and produce false TOML compliance failures. + The TOML runner exits non-zero when any case fails or times out, so it is safe to use directly in CI. When `--harness` is omitted it compiles `scripts/GocciaTOMLCheck.dpr` automatically; CI uses a prebuilt harness from the matrix build artifacts instead. ### Run Pascal Unit Tests diff --git a/scripts/GocciaTOMLCheck.dpr b/scripts/GocciaTOMLCheck.dpr index b55e0cde..880875e5 100644 --- a/scripts/GocciaTOMLCheck.dpr +++ b/scripts/GocciaTOMLCheck.dpr @@ -12,15 +12,22 @@ uses Goccia.TOML, Goccia.Values.Primitives; -function LoadUTF8File(const APath: string): UTF8String; +function LoadUTF8File(const APath: string): string; +const + UTF8_CODE_PAGE = 65001; var Stream: TFileStream; + UTF8Text: RawByteString; begin Stream := TFileStream.Create(APath, fmOpenRead or fmShareDenyWrite); try - SetLength(Result, Stream.Size); - if Length(Result) > 0 then - Stream.ReadBuffer(Pointer(Result)^, Length(Result)); + SetLength(UTF8Text, Stream.Size); + if Length(UTF8Text) > 0 then + begin + Stream.ReadBuffer(Pointer(UTF8Text)^, Length(UTF8Text)); + SetCodePage(UTF8Text, UTF8_CODE_PAGE, False); + end; + Result := UTF8Text; finally Stream.Free; end; @@ -134,7 +141,7 @@ var ExitCode: Integer; Parser: TGocciaTOMLParser; Root: TGocciaTOMLNode; - SourceText: UTF8String; + SourceText: string; begin if ParamCount <> 1 then Halt(2); @@ -146,7 +153,7 @@ begin try try SourceText := LoadUTF8File(ParamStr(1)); - Root := Parser.ParseDocument(string(SourceText)); + Root := Parser.ParseDocument(SourceText); try WriteLn(SerializeNode(Root)); ExitCode := 0; diff --git a/units/Goccia.TOML.Test.pas b/units/Goccia.TOML.Test.pas index 093419ae..508024a0 100644 --- a/units/Goccia.TOML.Test.pas +++ b/units/Goccia.TOML.Test.pas @@ -16,6 +16,7 @@ TTOMLParserTests = class(TTestSuite) function GetChildOrFail(const AParent: TGocciaTOMLNode; const AKey: string): TGocciaTOMLNode; procedure TestParseDocumentNormalizesCRLFInMultilineStrings; + procedure TestParseDocumentPreservesUTF8BytesForUnicodeKeysAndValues; procedure TestParseDocumentTracksScalarKinds; procedure TestParseDocumentTracksArrayElementKinds; public @@ -30,6 +31,8 @@ procedure TTOMLParserTests.SetupTests; TestParseDocumentTracksArrayElementKinds); Test('ParseDocument normalizes CRLF in multiline strings', TestParseDocumentNormalizesCRLFInMultilineStrings); + Test('ParseDocument preserves UTF-8 bytes for Unicode keys and values', + TestParseDocumentPreservesUTF8BytesForUnicodeKeysAndValues); end; function TTOMLParserTests.GetChildOrFail(const AParent: TGocciaTOMLNode; @@ -150,6 +153,46 @@ procedure TTOMLParserTests.TestParseDocumentNormalizesCRLFInMultilineStrings; end; end; +procedure TTOMLParserTests.TestParseDocumentPreservesUTF8BytesForUnicodeKeysAndValues; +const + UTF8_CODE_PAGE = 65001; +var + Parser: TGocciaTOMLParser; + RawSourceText: RawByteString; + RawUnicodeKey: RawByteString; + RawUnicodeValue: RawByteString; + Root: TGocciaTOMLNode; + SourceText: string; + UnicodeKey: string; + UnicodeNode: TGocciaTOMLNode; + UnicodeValue: string; +begin + RawUnicodeKey := #$CE#$B4; + SetCodePage(RawUnicodeKey, UTF8_CODE_PAGE, False); + UnicodeKey := RawUnicodeKey; + + RawUnicodeValue := 'Jos' + #$C3#$A9; + SetCodePage(RawUnicodeValue, UTF8_CODE_PAGE, False); + UnicodeValue := RawUnicodeValue; + + RawSourceText := '"' + RawUnicodeKey + '" = "' + RawUnicodeValue + '"' + LineEnding; + SetCodePage(RawSourceText, UTF8_CODE_PAGE, False); + SourceText := RawSourceText; + + Parser := TGocciaTOMLParser.Create; + try + Root := Parser.ParseDocument(SourceText); + try + UnicodeNode := GetChildOrFail(Root, UnicodeKey); + Expect(UnicodeNode.CanonicalValue).ToBe(UnicodeValue); + finally + Root.Free; + end; + finally + Parser.Free; + end; +end; + begin TestRunnerProgram.AddSuite(TTOMLParserTests.Create('TOML Parser')); TestRunnerProgram.Run;