hotfix: add utf-8 parsing for emoji_support (#40)

* hotfix: add utf-8 parsing for emoji_support. There is a major bug in yaml.v3 that makes it incapable of encoding emojis. An issue and a PR to fix it have been open for a very long time, but they have still gotten no attention. This PR adds a new hotfix that parses the literal `\U00000000` codepoint sequences that yaml.v3 produces and outputs the UTF-8 encoding instead. It can be enabled with the new `emoji_support` configuration value for the basic formatter. Unfortunately, any scalar with an emoji will come out double-quoted; it might be possible to rectify this, but it would be very nasty. Going to accept that consequence for now to unblock this breakage. * hotfix: add p.writeArbitrary. It was weird having p.write and a manual p.out append, so I wrote a method for that for consistency.
google · Aug 30, 2022 · d9e8674 · d9e8674
1 parent abc4812
commit d9e8674
Show file tree

Hide file tree

Showing 8 changed files with 185 additions and 1 deletion.
diff --git a/formatters/basic/README.md b/formatters/basic/README.md
@@ -9,3 +9,4 @@ The basic formatter is a barebones formatter that simply takes the data provided
 | `indent`                 | int            | 2       | The indentation level in spaces to use for the formatted yaml|
 | `include_document_start` | bool           | false   | Include `---` at document start |
 | `line_ending`            | `lf` or `crlf` | `crlf` on Windows, `lf` otherwise | Parse and write the file with "lf" or "crlf" line endings |
+| `emoji_support`          | bool           | false   | Support encoding utf-8 emojis |
diff --git a/formatters/basic/config.go b/formatters/basic/config.go
@@ -23,6 +23,7 @@ import (
 type Config struct {
 	Indent               int    `mapstructure:"indent"`
 	IncludeDocumentStart bool   `mapstructure:"include_document_start"`
+	EmojiSupport         bool   `mapstructure:"emoji_support"` // opt-in: Format passes its output through hotfix.ParseUnicodePoints
 	LineEnding           string `mapstructure:"line_ending"`
 }
 

diff --git a/formatters/basic/formatter.go b/formatters/basic/formatter.go
@@ -71,8 +71,11 @@ func (f *BasicFormatter) Format(yamlContent []byte) ([]byte, error) {
 	if f.Config.IncludeDocumentStart {
 		encodedContent = withDocumentStart(b.Bytes())
 	}
+	if f.Config.EmojiSupport {
+		encodedContent = hotfix.ParseUnicodePoints(encodedContent)
+	}
 	if f.Config.LineEnding == yamlfmt.LineBreakStyleCRLF {
-		return hotfix.WriteCRLFBytes(encodedContent), nil
+		encodedContent = hotfix.WriteCRLFBytes(encodedContent)
 	}
 	return encodedContent, nil
 }

diff --git a/formatters/basic/formatter_test.go b/formatters/basic/formatter_test.go
@@ -98,3 +98,18 @@ func TestCRLFLineEnding(t *testing.T) {
 		t.Fatalf("didn't write CRLF properly in result: %v", result)
 	}
 }
+
+func TestEmojiSupport(t *testing.T) {
+	f := &basic.BasicFormatter{Config: basic.DefaultConfig()}
+	f.Config.EmojiSupport = true
+
+	yaml := "a: 😊"
+	result, err := f.Format([]byte(yaml))
+	if err != nil {
+		t.Fatalf("expected formatting to pass, returned error: %v", err)
+	}
+	resultStr := string(result)
+	if !strings.Contains(resultStr, "😊") {
+		t.Fatalf("expected string to contain 😊, got: %s", resultStr)
+	}
+}
diff --git a/go.mod b/go.mod
@@ -3,6 +3,7 @@ module github.com/google/yamlfmt
 go 1.19
 
 require (
+	github.com/RageCage64/go-utf8-codepoint-converter v0.1.0
 	github.com/bmatcuk/doublestar/v4 v4.2.0
 	github.com/google/go-cmp v0.5.8
 	github.com/mitchellh/mapstructure v1.5.0

diff --git a/go.sum b/go.sum
@@ -1,3 +1,5 @@
+github.com/RageCage64/go-utf8-codepoint-converter v0.1.0 h1:6GreQRSQApXW1sgeFXMBLDdxSC6DCzu4lKBS/LEyrSA=
+github.com/RageCage64/go-utf8-codepoint-converter v0.1.0/go.mod h1:asNWDxR7n0QIQyZNYTlpNk6Dg7GkUnxtCXho987uen8=
 github.com/bmatcuk/doublestar/v4 v4.2.0 h1:Qu+u9wR3Vd89LnlLMHvnZ5coJMWKQamqdz9/p5GNthA=
 github.com/bmatcuk/doublestar/v4 v4.2.0/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc=
 github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg=

diff --git a/internal/hotfix/unicode.go b/internal/hotfix/unicode.go
@@ -0,0 +1,119 @@
+package hotfix
+
+import (
+	"errors"
+	"regexp"
+
+	"github.com/RageCage64/go-utf8-codepoint-converter/codepoint"
+)
+
+// ParseUnicodePoints walks content one byte at a time and returns a rewritten
+// copy. Bytes other than a backslash are copied to the result as they are;
+// every backslash is handed to unicodeParser.parseUTF8CodePoint, which decides
+// what is emitted for it and where scanning resumes.
+//
+// An empty (or nil) content yields an empty, non-nil slice.
+func ParseUnicodePoints(content []byte) []byte {
+	if len(content) == 0 {
+		return []byte{}
+	}
+
+	parser := unicodeParser{buf: content, out: []byte{}}
+	for {
+		var err error
+		if parser.peek() == '\\' {
+			err = parser.parseUTF8CodePoint()
+		} else {
+			parser.write()
+			err = parser.next()
+		}
+
+		// Any other error leaves the cursor on a byte that still has to be
+		// processed, so only running off the end stops the scan.
+		if err == errEndOfBuffer {
+			return parser.out
+		}
+	}
+}
+
+// unicodeParser is a single-pass cursor over buf that accumulates the
+// rewritten content in out.
+type unicodeParser struct {
+	buf []byte // input being scanned; the parser only reads from it
+	out []byte // output accumulated so far
+	pos int    // index into buf of the byte currently under the cursor
+}
+
+var (
+	errInvalidCodePoint = errors.New("invalid UTF-8 codepoint sequence")
+	errEndOfBuffer      = errors.New("end of buffer")
+
+	// upperHexDigit matches a single uppercase hexadecimal digit. It is
+	// compiled once here rather than on every escape sequence parsed.
+	upperHexDigit = regexp.MustCompile("[0-9A-F]")
+)
+
+// peek returns the byte under the cursor without advancing. It panics if the
+// cursor is past the end of buf, so callers must stop once next has reported
+// errEndOfBuffer.
+func (p *unicodeParser) peek() byte {
+	return p.buf[p.pos]
+}
+
+// write copies the byte under the cursor to the output.
+func (p *unicodeParser) write() {
+	p.out = append(p.out, p.peek())
+}
+
+// writeArbitrary appends b to the output.
+func (p *unicodeParser) writeArbitrary(b []byte) {
+	p.out = append(p.out, b...)
+}
+
+// next advances the cursor by one byte. It returns errEndOfBuffer when the
+// cursor has moved past the last byte of buf, and nil otherwise.
+func (p *unicodeParser) next() error {
+	p.pos++
+	if p.pos >= len(p.buf) {
+		return errEndOfBuffer
+	}
+	return nil
+}
+
+// parseUTF8CodePoint handles the backslash under the cursor.
+//
+// If the backslash starts a `\U` escape followed by exactly eight uppercase
+// hexadecimal digits (for example `\U0001F60A`), the ten bytes are passed to
+// codepoint.Convert and the bytes it returns are written in their place.
+//
+// In every other case the bytes that were consumed are written to the output
+// unchanged, so nothing from the input is lost:
+//   - a backslash followed by anything other than 'U' is emitted together with
+//     that following byte, so that an escaped backslash (`\\`) can never be
+//     mistaken for the start of a `\U` escape;
+//   - a `\U` escape that is cut short by a non-hex byte or by the end of the
+//     buffer is emitted as it was read.
+//
+// On return the cursor is on the first byte that has not been handled yet.
+// The result is nil after a successful conversion, errEndOfBuffer when the
+// input has been exhausted, errInvalidCodePoint when the backslash did not
+// introduce a complete `\U` escape, or the error from codepoint.Convert.
+func (p *unicodeParser) parseUTF8CodePoint() error {
+	if p.peek() != '\\' {
+		return errInvalidCodePoint
+	}
+
+	// escape holds the raw bytes consumed so far; it is either converted as a
+	// whole or flushed to the output untouched.
+	escape := make([]byte, 0, 10)
+	escape = append(escape, p.peek())
+	if err := p.next(); err != nil {
+		p.writeArbitrary(escape)
+		return err
+	}
+
+	if p.peek() != 'U' {
+		// Some other escape such as `\n` or `\\`: keep the pair together and
+		// step past it.
+		p.writeArbitrary(escape)
+		p.write()
+		if err := p.next(); err != nil {
+			return err
+		}
+		return errInvalidCodePoint
+	}
+	escape = append(escape, p.peek())
+
+	// Collect the eight hexadecimal digits that follow `\U`.
+	for i := 0; i < 8; i++ {
+		if err := p.next(); err != nil {
+			p.writeArbitrary(escape)
+			return err
+		}
+		digit := p.peek()
+		if !upperHexDigit.Match([]byte{digit}) {
+			// Leave the cursor on the offending byte so the caller processes
+			// it like any other input byte.
+			p.writeArbitrary(escape)
+			return errInvalidCodePoint
+		}
+		escape = append(escape, digit)
+	}
+
+	// The cursor is on the last hex digit. Hand the textual escape to the
+	// conversion library and emit its result, or the original text if the
+	// library rejects it.
+	utf8Bytes, err := codepoint.Convert(string(escape))
+	if err != nil {
+		p.writeArbitrary(escape)
+		if endErr := p.next(); endErr != nil {
+			return endErr
+		}
+		return err
+	}
+	p.writeArbitrary(utf8Bytes)
+
+	// Step past the last hex digit for the caller's convenience.
+	return p.next()
+}
diff --git a/internal/hotfix/unicode_test.go b/internal/hotfix/unicode_test.go
@@ -0,0 +1,42 @@
+package hotfix_test
+
+import (
+	"testing"
+
+	"github.com/google/yamlfmt/formatters/basic"
+	"github.com/google/yamlfmt/internal/hotfix"
+)
+
+// TestParseEmoji formats each input with the basic formatter and then runs
+// hotfix.ParseUnicodePoints over the result, expecting the original emoji to
+// come back inside a double-quoted scalar.
+func TestParseEmoji(t *testing.T) {
+	testCases := []struct {
+		name        string
+		yamlStr     string
+		expectedStr string
+	}{
+		{
+			name:        "parses emoji",
+			yamlStr:     "a: 😂\n",
+			expectedStr: "a: \"😂\"\n",
+		},
+		{
+			name:        "parses multiple emoji",
+			yamlStr:     "a: 😼 👑\n",
+			expectedStr: "a: \"😼 👑\"\n",
+		},
+	}
+
+	// EmojiSupport is left at its default so that Format does not call
+	// ParseUnicodePoints itself; the test applies it explicitly below.
+	f := &basic.BasicFormatter{Config: basic.DefaultConfig()}
+	// The documented default line ending is "crlf" on Windows, which could
+	// never match the "\n"-terminated expectations above, so pin it to "lf".
+	f.Config.LineEnding = "lf"
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			formattedBefore, err := f.Format([]byte(tc.yamlStr))
+			if err != nil {
+				t.Fatalf("yaml failed to parse: %v", err)
+			}
+			formattedStr := string(hotfix.ParseUnicodePoints(formattedBefore))
+			if formattedStr != tc.expectedStr {
+				t.Fatalf("parsed string does not match: \nexpected: %s\ngot: %s", tc.expectedStr, formattedStr)
+			}
+		})
+	}
+}