Permalink
Cannot retrieve contributors at this time
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
547 lines (507 sloc)
14.6 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright 2009 The Go Authors. All rights reserved. | |
// Use of this source code is governed by a BSD-style | |
// license that can be found in the LICENSE file. | |
//go:generate go run makeisprint.go -output isprint.go | |
package strconv | |
import ( | |
"internal/bytealg" | |
"unicode/utf8" | |
) | |
const ( | |
lowerhex = "0123456789abcdef" | |
upperhex = "0123456789ABCDEF" | |
) | |
func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string { | |
return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly)) | |
} | |
func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string { | |
return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly)) | |
} | |
func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte { | |
// Often called with big strings, so preallocate. If there's quoting, | |
// this is conservative but still helps a lot. | |
if cap(buf)-len(buf) < len(s) { | |
nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1) | |
copy(nBuf, buf) | |
buf = nBuf | |
} | |
buf = append(buf, quote) | |
for width := 0; len(s) > 0; s = s[width:] { | |
r := rune(s[0]) | |
width = 1 | |
if r >= utf8.RuneSelf { | |
r, width = utf8.DecodeRuneInString(s) | |
} | |
if width == 1 && r == utf8.RuneError { | |
buf = append(buf, `\x`...) | |
buf = append(buf, lowerhex[s[0]>>4]) | |
buf = append(buf, lowerhex[s[0]&0xF]) | |
continue | |
} | |
buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) | |
} | |
buf = append(buf, quote) | |
return buf | |
} | |
func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { | |
buf = append(buf, quote) | |
if !utf8.ValidRune(r) { | |
r = utf8.RuneError | |
} | |
buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) | |
buf = append(buf, quote) | |
return buf | |
} | |
func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { | |
var runeTmp [utf8.UTFMax]byte | |
if r == rune(quote) || r == '\\' { // always backslashed | |
buf = append(buf, '\\') | |
buf = append(buf, byte(r)) | |
return buf | |
} | |
if ASCIIonly { | |
if r < utf8.RuneSelf && IsPrint(r) { | |
buf = append(buf, byte(r)) | |
return buf | |
} | |
} else if IsPrint(r) || graphicOnly && isInGraphicList(r) { | |
n := utf8.EncodeRune(runeTmp[:], r) | |
buf = append(buf, runeTmp[:n]...) | |
return buf | |
} | |
switch r { | |
case '\a': | |
buf = append(buf, `\a`...) | |
case '\b': | |
buf = append(buf, `\b`...) | |
case '\f': | |
buf = append(buf, `\f`...) | |
case '\n': | |
buf = append(buf, `\n`...) | |
case '\r': | |
buf = append(buf, `\r`...) | |
case '\t': | |
buf = append(buf, `\t`...) | |
case '\v': | |
buf = append(buf, `\v`...) | |
default: | |
switch { | |
case r < ' ': | |
buf = append(buf, `\x`...) | |
buf = append(buf, lowerhex[byte(r)>>4]) | |
buf = append(buf, lowerhex[byte(r)&0xF]) | |
case r > utf8.MaxRune: | |
r = 0xFFFD | |
fallthrough | |
case r < 0x10000: | |
buf = append(buf, `\u`...) | |
for s := 12; s >= 0; s -= 4 { | |
buf = append(buf, lowerhex[r>>uint(s)&0xF]) | |
} | |
default: | |
buf = append(buf, `\U`...) | |
for s := 28; s >= 0; s -= 4 { | |
buf = append(buf, lowerhex[r>>uint(s)&0xF]) | |
} | |
} | |
} | |
return buf | |
} | |
// Quote returns a double-quoted Go string literal representing s. The | |
// returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for | |
// control characters and non-printable characters as defined by | |
// IsPrint. | |
func Quote(s string) string { | |
return quoteWith(s, '"', false, false) | |
} | |
// AppendQuote appends a double-quoted Go string literal representing s, | |
// as generated by Quote, to dst and returns the extended buffer. | |
func AppendQuote(dst []byte, s string) []byte { | |
return appendQuotedWith(dst, s, '"', false, false) | |
} | |
// QuoteToASCII returns a double-quoted Go string literal representing s. | |
// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for | |
// non-ASCII characters and non-printable characters as defined by IsPrint. | |
func QuoteToASCII(s string) string { | |
return quoteWith(s, '"', true, false) | |
} | |
// AppendQuoteToASCII appends a double-quoted Go string literal representing s, | |
// as generated by QuoteToASCII, to dst and returns the extended buffer. | |
func AppendQuoteToASCII(dst []byte, s string) []byte { | |
return appendQuotedWith(dst, s, '"', true, false) | |
} | |
// QuoteToGraphic returns a double-quoted Go string literal representing s. | |
// The returned string leaves Unicode graphic characters, as defined by | |
// IsGraphic, unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100) | |
// for non-graphic characters. | |
func QuoteToGraphic(s string) string { | |
return quoteWith(s, '"', false, true) | |
} | |
// AppendQuoteToGraphic appends a double-quoted Go string literal representing s, | |
// as generated by QuoteToGraphic, to dst and returns the extended buffer. | |
func AppendQuoteToGraphic(dst []byte, s string) []byte { | |
return appendQuotedWith(dst, s, '"', false, true) | |
} | |
// QuoteRune returns a single-quoted Go character literal representing the | |
// rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) | |
// for control characters and non-printable characters as defined by IsPrint. | |
func QuoteRune(r rune) string { | |
return quoteRuneWith(r, '\'', false, false) | |
} | |
// AppendQuoteRune appends a single-quoted Go character literal representing the rune, | |
// as generated by QuoteRune, to dst and returns the extended buffer. | |
func AppendQuoteRune(dst []byte, r rune) []byte { | |
return appendQuotedRuneWith(dst, r, '\'', false, false) | |
} | |
// QuoteRuneToASCII returns a single-quoted Go character literal representing | |
// the rune. The returned string uses Go escape sequences (\t, \n, \xFF, | |
// \u0100) for non-ASCII characters and non-printable characters as defined | |
// by IsPrint. | |
func QuoteRuneToASCII(r rune) string { | |
return quoteRuneWith(r, '\'', true, false) | |
} | |
// AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune, | |
// as generated by QuoteRuneToASCII, to dst and returns the extended buffer. | |
func AppendQuoteRuneToASCII(dst []byte, r rune) []byte { | |
return appendQuotedRuneWith(dst, r, '\'', true, false) | |
} | |
// QuoteRuneToGraphic returns a single-quoted Go character literal representing | |
// the rune. If the rune is not a Unicode graphic character, | |
// as defined by IsGraphic, the returned string will use a Go escape sequence | |
// (\t, \n, \xFF, \u0100). | |
func QuoteRuneToGraphic(r rune) string { | |
return quoteRuneWith(r, '\'', false, true) | |
} | |
// AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune, | |
// as generated by QuoteRuneToGraphic, to dst and returns the extended buffer. | |
func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte { | |
return appendQuotedRuneWith(dst, r, '\'', false, true) | |
} | |
// CanBackquote reports whether the string s can be represented | |
// unchanged as a single-line backquoted string without control | |
// characters other than tab. | |
func CanBackquote(s string) bool { | |
for len(s) > 0 { | |
r, wid := utf8.DecodeRuneInString(s) | |
s = s[wid:] | |
if wid > 1 { | |
if r == '\ufeff' { | |
return false // BOMs are invisible and should not be quoted. | |
} | |
continue // All other multibyte runes are correctly encoded and assumed printable. | |
} | |
if r == utf8.RuneError { | |
return false | |
} | |
if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' { | |
return false | |
} | |
} | |
return true | |
} | |
func unhex(b byte) (v rune, ok bool) { | |
c := rune(b) | |
switch { | |
case '0' <= c && c <= '9': | |
return c - '0', true | |
case 'a' <= c && c <= 'f': | |
return c - 'a' + 10, true | |
case 'A' <= c && c <= 'F': | |
return c - 'A' + 10, true | |
} | |
return | |
} | |
// UnquoteChar decodes the first character or byte in the escaped string | |
// or character literal represented by the string s. | |
// It returns four values: | |
// | |
// 1) value, the decoded Unicode code point or byte value; | |
// 2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation; | |
// 3) tail, the remainder of the string after the character; and | |
// 4) an error that will be nil if the character is syntactically valid. | |
// | |
// The second argument, quote, specifies the type of literal being parsed | |
// and therefore which escaped quote character is permitted. | |
// If set to a single quote, it permits the sequence \' and disallows unescaped '. | |
// If set to a double quote, it permits \" and disallows unescaped ". | |
// If set to zero, it does not permit either escape and allows both quote characters to appear unescaped. | |
func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) { | |
// easy cases | |
if len(s) == 0 { | |
err = ErrSyntax | |
return | |
} | |
switch c := s[0]; { | |
case c == quote && (quote == '\'' || quote == '"'): | |
err = ErrSyntax | |
return | |
case c >= utf8.RuneSelf: | |
r, size := utf8.DecodeRuneInString(s) | |
return r, true, s[size:], nil | |
case c != '\\': | |
return rune(s[0]), false, s[1:], nil | |
} | |
// hard case: c is backslash | |
if len(s) <= 1 { | |
err = ErrSyntax | |
return | |
} | |
c := s[1] | |
s = s[2:] | |
switch c { | |
case 'a': | |
value = '\a' | |
case 'b': | |
value = '\b' | |
case 'f': | |
value = '\f' | |
case 'n': | |
value = '\n' | |
case 'r': | |
value = '\r' | |
case 't': | |
value = '\t' | |
case 'v': | |
value = '\v' | |
case 'x', 'u', 'U': | |
n := 0 | |
switch c { | |
case 'x': | |
n = 2 | |
case 'u': | |
n = 4 | |
case 'U': | |
n = 8 | |
} | |
var v rune | |
if len(s) < n { | |
err = ErrSyntax | |
return | |
} | |
for j := 0; j < n; j++ { | |
x, ok := unhex(s[j]) | |
if !ok { | |
err = ErrSyntax | |
return | |
} | |
v = v<<4 | x | |
} | |
s = s[n:] | |
if c == 'x' { | |
// single-byte string, possibly not UTF-8 | |
value = v | |
break | |
} | |
if v > utf8.MaxRune { | |
err = ErrSyntax | |
return | |
} | |
value = v | |
multibyte = true | |
case '0', '1', '2', '3', '4', '5', '6', '7': | |
v := rune(c) - '0' | |
if len(s) < 2 { | |
err = ErrSyntax | |
return | |
} | |
for j := 0; j < 2; j++ { // one digit already; two more | |
x := rune(s[j]) - '0' | |
if x < 0 || x > 7 { | |
err = ErrSyntax | |
return | |
} | |
v = (v << 3) | x | |
} | |
s = s[2:] | |
if v > 255 { | |
err = ErrSyntax | |
return | |
} | |
value = v | |
case '\\': | |
value = '\\' | |
case '\'', '"': | |
if c != quote { | |
err = ErrSyntax | |
return | |
} | |
value = rune(c) | |
default: | |
err = ErrSyntax | |
return | |
} | |
tail = s | |
return | |
} | |
// Unquote interprets s as a single-quoted, double-quoted, | |
// or backquoted Go string literal, returning the string value | |
// that s quotes. (If s is single-quoted, it would be a Go | |
// character literal; Unquote returns the corresponding | |
// one-character string.) | |
func Unquote(s string) (string, error) { | |
n := len(s) | |
if n < 2 { | |
return "", ErrSyntax | |
} | |
quote := s[0] | |
if quote != s[n-1] { | |
return "", ErrSyntax | |
} | |
s = s[1 : n-1] | |
if quote == '`' { | |
if contains(s, '`') { | |
return "", ErrSyntax | |
} | |
if contains(s, '\r') { | |
// -1 because we know there is at least one \r to remove. | |
buf := make([]byte, 0, len(s)-1) | |
for i := 0; i < len(s); i++ { | |
if s[i] != '\r' { | |
buf = append(buf, s[i]) | |
} | |
} | |
return string(buf), nil | |
} | |
return s, nil | |
} | |
if quote != '"' && quote != '\'' { | |
return "", ErrSyntax | |
} | |
if contains(s, '\n') { | |
return "", ErrSyntax | |
} | |
// Is it trivial? Avoid allocation. | |
if !contains(s, '\\') && !contains(s, quote) { | |
switch quote { | |
case '"': | |
if utf8.ValidString(s) { | |
return s, nil | |
} | |
case '\'': | |
r, size := utf8.DecodeRuneInString(s) | |
if size == len(s) && (r != utf8.RuneError || size != 1) { | |
return s, nil | |
} | |
} | |
} | |
var runeTmp [utf8.UTFMax]byte | |
buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations. | |
for len(s) > 0 { | |
c, multibyte, ss, err := UnquoteChar(s, quote) | |
if err != nil { | |
return "", err | |
} | |
s = ss | |
if c < utf8.RuneSelf || !multibyte { | |
buf = append(buf, byte(c)) | |
} else { | |
n := utf8.EncodeRune(runeTmp[:], c) | |
buf = append(buf, runeTmp[:n]...) | |
} | |
if quote == '\'' && len(s) != 0 { | |
// single-quoted must be single character | |
return "", ErrSyntax | |
} | |
} | |
return string(buf), nil | |
} | |
// contains reports whether the string contains the byte c. | |
func contains(s string, c byte) bool { | |
return bytealg.IndexByteString(s, c) != -1 | |
} | |
// bsearch16 returns the smallest i such that a[i] >= x. | |
// If there is no such i, bsearch16 returns len(a). | |
func bsearch16(a []uint16, x uint16) int { | |
i, j := 0, len(a) | |
for i < j { | |
h := i + (j-i)/2 | |
if a[h] < x { | |
i = h + 1 | |
} else { | |
j = h | |
} | |
} | |
return i | |
} | |
// bsearch32 returns the smallest i such that a[i] >= x. | |
// If there is no such i, bsearch32 returns len(a). | |
func bsearch32(a []uint32, x uint32) int { | |
i, j := 0, len(a) | |
for i < j { | |
h := i + (j-i)/2 | |
if a[h] < x { | |
i = h + 1 | |
} else { | |
j = h | |
} | |
} | |
return i | |
} | |
// TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests | |
// to give the same answer. It allows this package not to depend on unicode, | |
// and therefore not pull in all the Unicode tables. If the linker were better | |
// at tossing unused tables, we could get rid of this implementation. | |
// That would be nice. | |
// IsPrint reports whether the rune is defined as printable by Go, with | |
// the same definition as unicode.IsPrint: letters, numbers, punctuation, | |
// symbols and ASCII space. | |
func IsPrint(r rune) bool { | |
// Fast check for Latin-1 | |
if r <= 0xFF { | |
if 0x20 <= r && r <= 0x7E { | |
// All the ASCII is printable from space through DEL-1. | |
return true | |
} | |
if 0xA1 <= r && r <= 0xFF { | |
// Similarly for ¡ through ÿ... | |
return r != 0xAD // ...except for the bizarre soft hyphen. | |
} | |
return false | |
} | |
// Same algorithm, either on uint16 or uint32 value. | |
// First, find first i such that isPrint[i] >= x. | |
// This is the index of either the start or end of a pair that might span x. | |
// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]). | |
// If we find x in a range, make sure x is not in isNotPrint list. | |
if 0 <= r && r < 1<<16 { | |
rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16 | |
i := bsearch16(isPrint, rr) | |
if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { | |
return false | |
} | |
j := bsearch16(isNotPrint, rr) | |
return j >= len(isNotPrint) || isNotPrint[j] != rr | |
} | |
rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32 | |
i := bsearch32(isPrint, rr) | |
if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { | |
return false | |
} | |
if r >= 0x20000 { | |
return true | |
} | |
r -= 0x10000 | |
j := bsearch16(isNotPrint, uint16(r)) | |
return j >= len(isNotPrint) || isNotPrint[j] != uint16(r) | |
} | |
// IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such | |
// characters include letters, marks, numbers, punctuation, symbols, and | |
// spaces, from categories L, M, N, P, S, and Zs. | |
func IsGraphic(r rune) bool { | |
if IsPrint(r) { | |
return true | |
} | |
return isInGraphicList(r) | |
} | |
// isInGraphicList reports whether the rune is in the isGraphic list. This separation | |
// from IsGraphic allows quoteWith to avoid two calls to IsPrint. | |
// Should be called only if IsPrint fails. | |
func isInGraphicList(r rune) bool { | |
// We know r must fit in 16 bits - see makeisprint.go. | |
if r > 0xFFFF { | |
return false | |
} | |
rr := uint16(r) | |
i := bsearch16(isGraphic, rr) | |
return i < len(isGraphic) && rr == isGraphic[i] | |
} |