Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
hotfix: add utf-8 parsing for emoji_support (#40)
* hotfix: add utf-8 parsing for emoji_support There is a major bug in yaml.v3 that makes it incapable of encoding emojis. There has been an issue and PR open to fix it for a very long time, but it has still gotten no attention. This PR adds a new hotfix that will parse the literal `\U00000000` codepoint sequences that yaml.v3 produces and outputs the unicode encoding. It can be enabled with the new `emoji_support` configuration value for the basic formatter. Unfortunately, any scalar with an emoji will come out double quoted; might be able to rectify this but it will be very nasty. Going to accept that consequence for now to unblock this breakage. * hotfix: add p.writeArbitrary It was weird having p.write and a manual p.out append so I wrote a method for that for consistency.
- Loading branch information
Showing
8 changed files
with
185 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
package hotfix | ||
|
||
import ( | ||
"errors" | ||
"regexp" | ||
|
||
"github.com/RageCage64/go-utf8-codepoint-converter/codepoint" | ||
) | ||
|
||
func ParseUnicodePoints(content []byte) []byte { | ||
if len(content) == 0 { | ||
return []byte{} | ||
} | ||
|
||
p := unicodeParser{ | ||
buf: content, | ||
out: []byte{}, | ||
} | ||
|
||
var err error | ||
for err != errEndOfBuffer { | ||
if p.peek() == '\\' { | ||
err = p.parseUTF8CodePoint() | ||
continue | ||
} | ||
|
||
p.write() | ||
err = p.next() | ||
} | ||
|
||
return p.out | ||
} | ||
|
||
// unicodeParser is a cursor over an input buffer together with the output
// bytes accumulated so far.
type unicodeParser struct {
	// buf is the input being scanned; out collects the bytes produced.
	buf, out []byte
	// pos is the index into buf of the byte currently under the cursor.
	pos int
}
|
||
// errInvalidCodePoint reports that the bytes under the cursor do not form the
// escape sequence the parser was looking for.
var errInvalidCodePoint = errors.New("invalid UTF-8 codepoint sequence")

// errEndOfBuffer reports that the cursor has moved past the last input byte.
var errEndOfBuffer = errors.New("end of buffer")
|
||
func (p *unicodeParser) peek() byte { | ||
return p.buf[p.pos] | ||
} | ||
|
||
func (p *unicodeParser) write() { | ||
p.out = append(p.out, p.peek()) | ||
} | ||
|
||
func (p *unicodeParser) writeArbitrary(b []byte) { | ||
p.out = append(p.out, b...) | ||
} | ||
|
||
func (p *unicodeParser) next() error { | ||
p.pos++ | ||
if p.pos == len(p.buf) { | ||
return errEndOfBuffer | ||
} | ||
return nil | ||
} | ||
|
||
func (p *unicodeParser) parseUTF8CodePoint() error { | ||
codepointBytes := []byte{} | ||
|
||
// Parse literal escape tokens while checking if this is a valid UTF-16 sequence | ||
if p.peek() != '\\' { | ||
return errInvalidCodePoint | ||
} | ||
codepointBytes = append(codepointBytes, p.peek()) | ||
err := p.next() | ||
if err != nil { | ||
return err | ||
} | ||
if p.peek() != 'U' { | ||
return errInvalidCodePoint | ||
} | ||
codepointBytes = append(codepointBytes, p.peek()) | ||
|
||
// We've detected a UTF-8 codepoint sequence. The library writes the UTF-16 sequence | ||
// i.e. \U0001F60A as 10 individual bytes. Our goal is to combine the 8 | ||
// hexadecimal numbers we should see subsequently into the 4 byte values they | ||
// represent. | ||
isHex, err := regexp.Compile("[0-9A-F]") | ||
if err != nil { | ||
return err | ||
} | ||
|
||
for i := 0; i < 8; i++ { | ||
// Get a byte and confirm it is a hex digit. | ||
err = p.next() | ||
if err != nil { | ||
return err | ||
} | ||
hexDigit := p.peek() | ||
if !isHex.Match([]byte{hexDigit}) { | ||
return errInvalidCodePoint | ||
} | ||
codepointBytes = append(codepointBytes, hexDigit) | ||
} | ||
|
||
// Now that we have the codepoint, we'll represent it as a string | ||
// and pass it to the codepoint conversion library. | ||
utf8Bytes, err := codepoint.Convert(string(codepointBytes)) | ||
if err != nil { | ||
return err | ||
} | ||
p.writeArbitrary(utf8Bytes) | ||
|
||
// Continue to the next byte for convenience to the caller. | ||
err = p.next() | ||
if err != nil { | ||
return err | ||
} | ||
|
||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
package hotfix_test | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/google/yamlfmt/formatters/basic" | ||
"github.com/google/yamlfmt/internal/hotfix" | ||
) | ||
|
||
func TestParseEmoji(t *testing.T) { | ||
testCases := []struct { | ||
name string | ||
yamlStr string | ||
expectedStr string | ||
}{ | ||
{ | ||
name: "parses emoji", | ||
yamlStr: "a: 😂\n", | ||
expectedStr: "a: \"😂\"\n", | ||
}, | ||
{ | ||
name: "parses multiple emoji", | ||
yamlStr: "a: 😼 👑\n", | ||
expectedStr: "a: \"😼 👑\"\n", | ||
}, | ||
} | ||
|
||
f := &basic.BasicFormatter{Config: basic.DefaultConfig()} | ||
for _, tc := range testCases { | ||
t.Run(tc.name, func(t *testing.T) { | ||
formattedBefore, err := f.Format([]byte(tc.yamlStr)) | ||
if err != nil { | ||
t.Fatalf("yaml failed to parse: %v", err) | ||
} | ||
formattedAfter := hotfix.ParseUnicodePoints(formattedBefore) | ||
formattedStr := string(formattedAfter) | ||
if formattedStr != tc.expectedStr { | ||
t.Fatalf("parsed string does not match: \nexpected: %s\ngot: %s", tc.expectedStr, string(formattedStr)) | ||
} | ||
}) | ||
} | ||
} |