Skip to content

Commit

Permalink
Merge pull request #9 from tdakkota/feat/improve-fast-str-encoding
Browse files Browse the repository at this point in the history
feat: improve string encoding
  • Loading branch information
ernado committed Jan 12, 2022
2 parents d76a769 + 155802f commit e59755e
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 159 deletions.
202 changes: 57 additions & 145 deletions enc_str.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,111 +110,6 @@ var htmlSafeSet = [utf8.RuneSelf]bool{
'\u007f': true,
}

// safeSet holds the value true if the ASCII character with the given array
// position can be represented inside a JSON string without any further
// escaping.
//
// All values are true except for the ASCII control characters (0-31), the
// double quote ("), and the backslash character ("\").
var safeSet = [utf8.RuneSelf]bool{
' ': true,
'!': true,
'"': false,
'#': true,
'$': true,
'%': true,
'&': true,
'\'': true,
'(': true,
')': true,
'*': true,
'+': true,
',': true,
'-': true,
'.': true,
'/': true,
'0': true,
'1': true,
'2': true,
'3': true,
'4': true,
'5': true,
'6': true,
'7': true,
'8': true,
'9': true,
':': true,
';': true,
'<': true,
'=': true,
'>': true,
'?': true,
'@': true,
'A': true,
'B': true,
'C': true,
'D': true,
'E': true,
'F': true,
'G': true,
'H': true,
'I': true,
'J': true,
'K': true,
'L': true,
'M': true,
'N': true,
'O': true,
'P': true,
'Q': true,
'R': true,
'S': true,
'T': true,
'U': true,
'V': true,
'W': true,
'X': true,
'Y': true,
'Z': true,
'[': true,
'\\': false,
']': true,
'^': true,
'_': true,
'`': true,
'a': true,
'b': true,
'c': true,
'd': true,
'e': true,
'f': true,
'g': true,
'h': true,
'i': true,
'j': true,
'k': true,
'l': true,
'm': true,
'n': true,
'o': true,
'p': true,
'q': true,
'r': true,
's': true,
't': true,
'u': true,
'v': true,
'w': true,
'x': true,
'y': true,
'z': true,
'{': true,
'|': true,
'}': true,
'~': true,
'\u007f': true,
}

const hexChars = "0123456789abcdef"

// StrEscape encodes string with html special characters escaping.
Expand Down Expand Up @@ -308,66 +203,83 @@ func (e *Encoder) strEscape(i int, v string, valLen int) {
e.byte('"')
}

// safeSet holds the value true if the ASCII character with the given array
// position can be represented inside a JSON string without any further
// escaping.
//
// All values are true except for the ASCII control characters (0-31), the
// double quote ("), and the backslash character ("\").
var safeSet = [256]byte{
// First 31 characters.
1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1,
'"': 1,
'\\': 1,
}

// Str encodes string without html escaping.
//
// Use StrEscape to escape html, this is default for encoding/json and
// should be used by default for untrusted strings.
func (e *Encoder) Str(v string) {
e.comma()
length := len(v)
e.buf = append(e.buf, '"')

// Fast path, without utf8 and escape support.
i := 0
for ; i < length; i++ {
c := v[i]
if c > 31 && c != '"' && c != '\\' && c < utf8.RuneSelf {
e.buf = append(e.buf, c)
} else {
break
var (
i = 0
length = len(v)
c byte
)
for i, c = range []byte(v) {
if safeSet[c] != 0 {
goto slow
}
}
if i == length {
if i == length-1 {
e.buf = append(e.buf, v...)
e.buf = append(e.buf, '"')
return
}
e.strSlow(i, v, length)
slow:
e.buf = append(e.buf, v[:i]...)
e.strSlow(v[i:])
}

func (e *Encoder) strSlow(i int, v string, length int) {
start := i
func (e *Encoder) strSlow(v string) {
var i, start int
// for the remaining parts, we process them char by char
for i < length {
if b := v[i]; b < utf8.RuneSelf {
if safeSet[b] {
i++
continue
}
if start < i {
e.rawStr(v[start:i])
}
switch b {
case '\\', '"':
e.twoBytes('\\', b)
case '\n':
e.twoBytes('\\', 'n')
case '\r':
e.twoBytes('\\', 'r')
case '\t':
e.twoBytes('\\', 't')
default:
// This encodes bytes < 0x20 except for \t, \n and \r.
// If escapeHTML is set, it also escapes <, >, and &
// because they can lead to security holes when
// user-controlled strings are rendered into JSON
// and served to some browsers.
e.rawStr(`\u00`)
e.twoBytes(hexChars[b>>4], hexChars[b&0xF])
}
for i < len(v) {
b := v[i]
if safeSet[b] == 0 {
i++
start = i
continue
}
if start < i {
e.rawStr(v[start:i])
}
switch b {
case '\\', '"':
e.twoBytes('\\', b)
case '\n':
e.twoBytes('\\', 'n')
case '\r':
e.twoBytes('\\', 'r')
case '\t':
e.twoBytes('\\', 't')
default:
// This encodes bytes < 0x20 except for \t, \n and \r.
// If escapeHTML is set, it also escapes <, >, and &
// because they can lead to security holes when
// user-controlled strings are rendered into JSON
// and served to some browsers.
e.rawStr(`\u00`)
e.twoBytes(hexChars[b>>4], hexChars[b&0xF])
}
i++
start = i
continue
}
if start < len(v) {
Expand Down
39 changes: 25 additions & 14 deletions enc_str_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,32 @@ func TestEncoder_StringEscape(t *testing.T) {
}

func TestEncoder_String(t *testing.T) {
t.Run("Escape", func(t *testing.T) {
e := GetEncoder()
const data = `\nH\tel\tl\ro\\World\r` + "\n\rHello\r\tHi"
e.Str(data)
const expected = `"\\nH\\tel\\tl\\ro\\\\World\\r\n\rHello\r\tHi"`
require.Equal(t, expected, string(e.Bytes()))
requireCompat(t, e.Bytes(), data)
t.Run("Decode", func(t *testing.T) {
i := GetDecoder()
i.ResetBytes(e.Bytes())
s, err := i.Str()
require.NoError(t, err)
require.Equal(t, data, s)
testCases := []struct {
name, input, expect string
}{
{`Empty`, ``, `""`},
{`Simple`, `abcd`, `"abcd"`},
{
`Escape`,
`abcd\nH\tel\tl\ro\\World\r` + "\n\rHello\r\tHi",
`"abcd\\nH\\tel\\tl\\ro\\\\World\\r\n\rHello\r\tHi"`,
},
}
for _, tt := range testCases {
t.Run(tt.name, func(t *testing.T) {
e := GetEncoder()
e.Str(tt.input)
require.Equal(t, tt.expect, string(e.Bytes()))
requireCompat(t, e.Bytes(), tt.input)
t.Run("Decode", func(t *testing.T) {
i := GetDecoder()
i.ResetBytes(e.Bytes())
s, err := i.Str()
require.NoError(t, err)
require.Equal(t, tt.input, s)
})
})
})
}
t.Run("StrEscapeFast", func(t *testing.T) {
e := GetEncoder()
e.StrEscape("Foo")
Expand Down

0 comments on commit e59755e

Please sign in to comment.