Skip to content

Commit

Permalink
hotfix: add utf-8 parsing for emoji_support (#40)
Browse files Browse the repository at this point in the history
* hotfix: add utf-8 parsing for emoji_support

There is a major bug in yaml.v3 that makes it incapable of encoding
emojis. There has been an issue and PR open to fix it for a very long
time, but it has still gotten no attention. This PR adds a new hotfix
that will parse the literal `\U00000000` codepoint sequences that yaml.v3
produces and outputs the unicode encoding. It can be enabled with the
new `emoji_support` configuration value for the basic formatter.
Unfortunately, any scalar with an emoji will come out double quoted;
might be able to rectify this but it will be very nasty. Going to accept
that consequence for now to unblock this breakage.

* hotfix: add p.writeArbitrary

It was weird having p.write and a manual p.out append so I wrote a
method for that for consistency.
  • Loading branch information
braydonk committed Aug 30, 2022
1 parent abc4812 commit d9e8674
Show file tree
Hide file tree
Showing 8 changed files with 185 additions and 1 deletion.
1 change: 1 addition & 0 deletions formatters/basic/README.md
Expand Up @@ -9,3 +9,4 @@ The basic formatter is a barebones formatter that simply takes the data provided
| `indent` | int | 2 | The indentation level in spaces to use for the formatted yaml|
| `include_document_start` | bool | false | Include `---` at document start |
| `line_ending` | `lf` or `crlf` | `crlf` on Windows, `lf` otherwise | Parse and write the file with "lf" or "crlf" line endings |
| `emoji_support` | bool | false | Support encoding utf-8 emojis |
1 change: 1 addition & 0 deletions formatters/basic/config.go
Expand Up @@ -23,6 +23,7 @@ import (
// Config holds the options of the basic formatter. Each field is populated
// from the configuration key named in its mapstructure tag.
type Config struct {
// Indent is the indentation level, in spaces, used for the formatted yaml.
Indent int `mapstructure:"indent"`
// IncludeDocumentStart makes the formatter include `---` at document start.
IncludeDocumentStart bool `mapstructure:"include_document_start"`
// EmojiSupport enables the hotfix that turns escaped unicode codepoints in
// the encoded output back into UTF-8 so that emojis survive formatting.
EmojiSupport bool `mapstructure:"emoji_support"`
// LineEnding selects "lf" or "crlf" line endings for the written file.
LineEnding string `mapstructure:"line_ending"`
}

Expand Down
5 changes: 4 additions & 1 deletion formatters/basic/formatter.go
Expand Up @@ -71,8 +71,11 @@ func (f *BasicFormatter) Format(yamlContent []byte) ([]byte, error) {
if f.Config.IncludeDocumentStart {
encodedContent = withDocumentStart(b.Bytes())
}
if f.Config.EmojiSupport {
encodedContent = hotfix.ParseUnicodePoints(encodedContent)
}
if f.Config.LineEnding == yamlfmt.LineBreakStyleCRLF {
return hotfix.WriteCRLFBytes(encodedContent), nil
encodedContent = hotfix.WriteCRLFBytes(encodedContent)
}
return encodedContent, nil
}
Expand Down
15 changes: 15 additions & 0 deletions formatters/basic/formatter_test.go
Expand Up @@ -98,3 +98,18 @@ func TestCRLFLineEnding(t *testing.T) {
t.Fatalf("didn't write CRLF properly in result: %v", result)
}
}

// TestEmojiSupport checks that, with the emoji_support option switched on, an
// emoji present in the input is still present verbatim in the formatted output.
func TestEmojiSupport(t *testing.T) {
	const input = "a: 😊"

	formatter := &basic.BasicFormatter{Config: basic.DefaultConfig()}
	formatter.Config.EmojiSupport = true

	formatted, err := formatter.Format([]byte(input))
	if err != nil {
		t.Fatalf("expected formatting to pass, returned error: %v", err)
	}

	// Only the presence of the emoji is asserted; quoting of the scalar is
	// deliberately left unchecked here.
	if got := string(formatted); !strings.Contains(got, "😊") {
		t.Fatalf("expected string to contain 😊, got: %s", got)
	}
}
1 change: 1 addition & 0 deletions go.mod
Expand Up @@ -3,6 +3,7 @@ module github.com/google/yamlfmt
go 1.19

require (
github.com/RageCage64/go-utf8-codepoint-converter v0.1.0
github.com/bmatcuk/doublestar/v4 v4.2.0
github.com/google/go-cmp v0.5.8
github.com/mitchellh/mapstructure v1.5.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
@@ -1,3 +1,5 @@
github.com/RageCage64/go-utf8-codepoint-converter v0.1.0 h1:6GreQRSQApXW1sgeFXMBLDdxSC6DCzu4lKBS/LEyrSA=
github.com/RageCage64/go-utf8-codepoint-converter v0.1.0/go.mod h1:asNWDxR7n0QIQyZNYTlpNk6Dg7GkUnxtCXho987uen8=
github.com/bmatcuk/doublestar/v4 v4.2.0 h1:Qu+u9wR3Vd89LnlLMHvnZ5coJMWKQamqdz9/p5GNthA=
github.com/bmatcuk/doublestar/v4 v4.2.0/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc=
github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg=
Expand Down
119 changes: 119 additions & 0 deletions internal/hotfix/unicode.go
@@ -0,0 +1,119 @@
package hotfix

import (
"errors"
"regexp"

"github.com/RageCage64/go-utf8-codepoint-converter/codepoint"
)

// ParseUnicodePoints scans content byte by byte and returns a new slice in
// which escaped unicode codepoint sequences have been handed to
// parseUTF8CodePoint for conversion; every byte that is not a backslash is
// copied through unchanged.
//
// Empty input yields an empty, non-nil slice. Errors reported by the parser
// other than reaching the end of the buffer are not surfaced: scanning simply
// resumes at the parser's current position.
func ParseUnicodePoints(content []byte) []byte {
	if len(content) == 0 {
		return []byte{}
	}

	parser := unicodeParser{buf: content, out: []byte{}}
	for {
		var stepErr error
		if parser.peek() == '\\' {
			// A backslash may start an escape sequence; the parser advances
			// the cursor itself.
			stepErr = parser.parseUTF8CodePoint()
		} else {
			parser.write()
			stepErr = parser.next()
		}

		if stepErr == errEndOfBuffer {
			break
		}
	}

	return parser.out
}

// unicodeParser is a byte-wise cursor over an input buffer that accumulates
// its rewritten output as it goes.
type unicodeParser struct {
	buf []byte // input being scanned
	out []byte // output produced so far
	pos int    // index in buf of the current byte; never exceeds len(buf)
}

var (
	// errInvalidCodePoint reports that the bytes at the cursor do not form a
	// well-formed escaped codepoint sequence.
	errInvalidCodePoint = errors.New("invalid UTF-8 codepoint sequence")
	// errEndOfBuffer reports that the cursor has moved past the last byte.
	errEndOfBuffer = errors.New("end of buffer")
)

// peek returns the byte under the cursor without advancing. It must not be
// called once next has reported errEndOfBuffer, as there is no byte to return.
func (p *unicodeParser) peek() byte {
	return p.buf[p.pos]
}

// write appends the byte under the cursor to the output.
func (p *unicodeParser) write() {
	p.out = append(p.out, p.peek())
}

// writeArbitrary appends b to the output, independent of the cursor.
func (p *unicodeParser) writeArbitrary(b []byte) {
	p.out = append(p.out, b...)
}

// next advances the cursor by one byte and returns errEndOfBuffer when no
// byte is left to read. The cursor is clamped at len(buf), so calling next
// again after the end (or on an empty buffer) keeps returning errEndOfBuffer
// instead of running past the buffer and reporting success.
func (p *unicodeParser) next() error {
	if p.pos < len(p.buf) {
		p.pos++
	}
	if p.pos >= len(p.buf) {
		return errEndOfBuffer
	}
	return nil
}

// upperHexDigit matches a single upper-case hexadecimal digit. It is compiled
// once at package initialisation rather than on every escape sequence.
var upperHexDigit = regexp.MustCompile("[0-9A-F]")

// parseUTF8CodePoint handles the backslash under the cursor.
//
// If the backslash starts a `\UXXXXXXXX` escape (capital U followed by eight
// upper-case hex digits, e.g. `\U0001F60A`), the ten literal bytes are passed
// to the codepoint conversion library and the bytes it returns are written to
// the output in their place.
//
// Anything else is copied to the output unchanged so that no input is lost:
//   - a backslash followed by a byte other than 'U' is written together with
//     that byte and both are skipped, so an escaped backslash (`\\`) can never
//     be mistaken for the start of a following `\U` escape;
//   - a truncated or malformed `\U` sequence is written back literally, and
//     the offending byte is left under the cursor for the caller to handle.
//
// On return the cursor is on the first byte that has not been handled yet.
// The result is nil after a successful conversion, errEndOfBuffer when the
// input ran out (whatever was consumed has already been written), and
// errInvalidCodePoint or the conversion error when the bytes were passed
// through literally.
func (p *unicodeParser) parseUTF8CodePoint() error {
	if p.peek() != '\\' {
		return errInvalidCodePoint
	}

	// Literal bytes consumed so far: at most `\`, `U` and eight hex digits.
	escape := make([]byte, 0, 10)
	escape = append(escape, p.peek())

	if err := p.next(); err != nil {
		// Trailing backslash at the very end of the input: keep it.
		p.writeArbitrary(escape)
		return err
	}
	if p.peek() != 'U' {
		// Some other escape pair. Emit it untouched and step over both bytes.
		escape = append(escape, p.peek())
		p.writeArbitrary(escape)
		if err := p.next(); err != nil {
			return err
		}
		return errInvalidCodePoint
	}
	escape = append(escape, p.peek())

	// Collect the eight hex digits that spell out the codepoint.
	for i := 0; i < 8; i++ {
		if err := p.next(); err != nil {
			// Input ended mid-sequence: keep what was read.
			p.writeArbitrary(escape)
			return err
		}
		digit := p.peek()
		if !upperHexDigit.Match([]byte{digit}) {
			// Not an escape after all. The non-hex byte has not been consumed;
			// the caller resumes scanning from it.
			p.writeArbitrary(escape)
			return errInvalidCodePoint
		}
		escape = append(escape, digit)
	}

	// Hand the literal escape text to the conversion library.
	utf8Bytes, err := codepoint.Convert(string(escape))
	if err != nil {
		// The library rejected the value: keep the literal text and move on.
		p.writeArbitrary(escape)
		if nextErr := p.next(); nextErr != nil {
			return nextErr
		}
		return err
	}
	p.writeArbitrary(utf8Bytes)

	// Step past the last hex digit for the caller's convenience.
	return p.next()
}
42 changes: 42 additions & 0 deletions internal/hotfix/unicode_test.go
@@ -0,0 +1,42 @@
package hotfix_test

import (
"testing"

"github.com/google/yamlfmt/formatters/basic"
"github.com/google/yamlfmt/internal/hotfix"
)

// TestParseEmoji formats small yaml documents containing emojis with the
// default basic formatter configuration, runs the formatted bytes through
// hotfix.ParseUnicodePoints, and compares the result with the expected
// document. The expected output has the emoji scalar double quoted.
func TestParseEmoji(t *testing.T) {
	type emojiCase struct {
		label string
		input string
		want  string
	}
	cases := []emojiCase{
		{label: "parses emoji", input: "a: 😂\n", want: "a: \"😂\"\n"},
		{label: "parses multiple emoji", input: "a: 😼 👑\n", want: "a: \"😼 👑\"\n"},
	}

	formatter := &basic.BasicFormatter{Config: basic.DefaultConfig()}
	for i := range cases {
		c := cases[i]
		t.Run(c.label, func(t *testing.T) {
			encoded, err := formatter.Format([]byte(c.input))
			if err != nil {
				t.Fatalf("yaml failed to parse: %v", err)
			}

			got := string(hotfix.ParseUnicodePoints(encoded))
			if got != c.want {
				t.Fatalf("parsed string does not match: \nexpected: %s\ngot: %s", c.want, got)
			}
		})
	}
}

0 comments on commit d9e8674

Please sign in to comment.