diff --git a/formatters/basic/README.md b/formatters/basic/README.md index 62db5a4..ec07961 100644 --- a/formatters/basic/README.md +++ b/formatters/basic/README.md @@ -9,3 +9,4 @@ The basic formatter is a barebones formatter that simply takes the data provided | `indent` | int | 2 | The indentation level in spaces to use for the formatted yaml| | `include_document_start` | bool | false | Include `---` at document start | | `line_ending` | `lf` or `crlf` | `crlf` on Windows, `lf` otherwise | Parse and write the file with "lf" or "crlf" line endings | +| `emoji_support` | bool | false | Write emojis as literal UTF-8 characters instead of `\U` escape sequences | diff --git a/formatters/basic/config.go b/formatters/basic/config.go index 5e267fd..906eae6 100644 --- a/formatters/basic/config.go +++ b/formatters/basic/config.go @@ -23,6 +23,7 @@ import ( type Config struct { Indent int `mapstructure:"indent"` IncludeDocumentStart bool `mapstructure:"include_document_start"` + EmojiSupport bool `mapstructure:"emoji_support"` LineEnding string `mapstructure:"line_ending"` } diff --git a/formatters/basic/formatter.go b/formatters/basic/formatter.go index 01944b8..3aeeaee 100644 --- a/formatters/basic/formatter.go +++ b/formatters/basic/formatter.go @@ -71,8 +71,11 @@ func (f *BasicFormatter) Format(yamlContent []byte) ([]byte, error) { if f.Config.IncludeDocumentStart { encodedContent = withDocumentStart(b.Bytes()) } + if f.Config.EmojiSupport { + encodedContent = hotfix.ParseUnicodePoints(encodedContent) + } if f.Config.LineEnding == yamlfmt.LineBreakStyleCRLF { - return hotfix.WriteCRLFBytes(encodedContent), nil + encodedContent = hotfix.WriteCRLFBytes(encodedContent) } return encodedContent, nil } diff --git a/formatters/basic/formatter_test.go b/formatters/basic/formatter_test.go index a2c1084..c33dc70 100644 --- a/formatters/basic/formatter_test.go +++ b/formatters/basic/formatter_test.go @@ -98,3 +98,18 @@ func TestCRLFLineEnding(t *testing.T) { t.Fatalf("didn't write CRLF properly in result: %v", result) } } + 
// longEscapeLen is the byte length of a 32-bit escape sequence: the two-byte
// `\U` prefix followed by eight hexadecimal digits.
const longEscapeLen = 10

// ParseUnicodePoints returns a copy of content in which every `\UXXXXXXXX`
// escape sequence (eight uppercase hexadecimal digits naming a valid Unicode
// code point) is replaced by the UTF-8 encoding of that code point, so that
// `\U0001F60A` becomes the literal emoji bytes.
//
// Every other byte is copied through unchanged. In particular:
//   - a backslash followed by any byte other than 'U' is copied together with
//     that byte, so other escapes such as `\n` keep their backslash and an
//     escaped backslash (`\\`) is never mistaken for the start of a `\U`
//     sequence;
//   - malformed, truncated, or out-of-range `\U` sequences are left exactly as
//     written instead of being dropped.
//
// The returned slice is always non-nil, even for nil or empty input, and never
// aliases content.
//
// NOTE(review): the scan is purely textual and does not track YAML quoting
// style, so a literal `\UXXXXXXXX` inside a plain or single-quoted scalar is
// converted as well — confirm this is acceptable for callers.
func ParseUnicodePoints(content []byte) []byte {
	// The output can only shrink (10 escape bytes become at most 4), so the
	// input length is a safe upper bound for the capacity.
	out := make([]byte, 0, len(content))

	for i := 0; i < len(content); {
		if content[i] != '\\' {
			out = append(out, content[i])
			i++
			continue
		}

		if r, ok := decodeLongEscape(content[i:]); ok {
			out = utf8.AppendRune(out, r)
			i += longEscapeLen
			continue
		}

		// Not a convertible escape. Copy the backslash together with the byte
		// it escapes (if there is one) so that byte is never re-examined as
		// the start of a new escape sequence.
		n := 2
		if i+1 == len(content) {
			n = 1
		}
		out = append(out, content[i:i+n]...)
		i += n
	}

	return out
}

// decodeLongEscape reports whether b begins with a well-formed `\UXXXXXXXX`
// escape sequence and, if so, returns the code point it names. Only uppercase
// hexadecimal digits are accepted, and the value must be a valid Unicode
// scalar value (at most U+10FFFF and not a surrogate).
func decodeLongEscape(b []byte) (rune, bool) {
	if len(b) < longEscapeLen || b[0] != '\\' || b[1] != 'U' {
		return 0, false
	}

	var v uint32
	for _, d := range b[2:longEscapeLen] {
		switch {
		case '0' <= d && d <= '9':
			v = v<<4 | uint32(d-'0')
		case 'A' <= d && d <= 'F':
			v = v<<4 | uint32(d-'A'+10)
		default:
			return 0, false
		}
	}

	// Bound the value before converting: eight hex digits can exceed both
	// the Unicode range and the positive range of rune (int32).
	if v > utf8.MaxRune {
		return 0, false
	}
	r := rune(v)
	if !utf8.ValidRune(r) {
		return 0, false
	}
	return r, true
}