Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

unicode/utf16: some performance optimization #6957

Closed
gopherbot opened this issue Dec 15, 2013 · 9 comments
Closed

unicode/utf16: some performance optimization #6957

gopherbot opened this issue Dec 15, 2013 · 9 comments
Labels
FrozenDueToAge Performance Suggested
Milestone

Comments

@gopherbot
Copy link

@gopherbot gopherbot commented Dec 15, 2013

by chanxuehong:

for performance in normal case
@gopherbot
Copy link
Author

@gopherbot gopherbot commented Dec 15, 2013

Comment 1 by chanxuehong:

// Copyright 2010 The Go Authors.  All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package utf16 implements encoding and decoding of UTF-16 sequences.
package utf16
// The conditions replacementChar==unicode.ReplacementChar and
// maxRune==unicode.MaxRune are verified in the tests.
// Defining them locally avoids this package depending on package unicode.
const (
    replacementChar = '\uFFFD'     // Unicode replacement character
    maxRune         = '\U0010FFFF' // Maximum valid Unicode code point.
)
const (
    // 0xd800-0xdc00 encodes the high 10 bits of a pair.
    // 0xdc00-0xe000 encodes the low 10 bits of a pair.
    // the value is those 20 bits plus 0x10000.
    surr1 = 0xd800
    surr2 = 0xdc00
    surr3 = 0xe000
    surrSelf = 0x10000
)
// IsSurrogate returns true if the specified Unicode code point
// can appear in a surrogate pair.
func IsSurrogate(r rune) bool {
    return surr1 <= r && r < surr3
}
// DecodeRune returns the UTF-16 decoding of a surrogate pair.
// If the pair is not a valid UTF-16 surrogate pair, DecodeRune returns
// the Unicode replacement code point U+FFFD.
func DecodeRune(r1, r2 rune) rune {
    if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 {
        return (r1-surr1)<<10 | (r2 - surr2) + surrSelf
    }
    return replacementChar
}
// EncodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune.
// If the rune is not a valid Unicode code point or does not need encoding,
// EncodeRune returns U+FFFD, U+FFFD.
func EncodeRune(r rune) (r1, r2 rune) {
    if r < surrSelf || r > maxRune {
        return replacementChar, replacementChar
    }
    r -= surrSelf
    return surr1 + r>>10, surr2 + r&0x3ff
}
// Encode returns the UTF-16 encoding of the Unicode code point sequence s.
func Encode(s []rune) []uint16 {
    n := len(s)
    for _, v := range s {
        if v >= surrSelf {
            n++
        }
    }
    a := make([]uint16, n)
    n = 0
    var r1, r2 rune
    for _, v := range s {
        switch {
        case 0 <= v && v < surr1, surr3 <= v && v < surrSelf:
            a[n] = uint16(v)
            n++
        case surrSelf <= v && v <= maxRune:
            r1, r2 = EncodeRune(v)
            a[n] = uint16(r1)
            a[n+1] = uint16(r2)
            n += 2
        default:
            a[n] = uint16(replacementChar)
            n++
        }
    }
    return a[:n]
}
// Decode returns the Unicode code point sequence represented
// by the UTF-16 encoding s.
func Decode(s []uint16) []rune {
    sLen := len(s)
    a := make([]rune, sLen)
    n := 0
    var r uint16
    for i := 0; i < sLen; i++ {
        switch r = s[i]; {
        case r < surr1, surr3 <= r:
            // normal rune
            a[n] = rune(r)
            n++
        case surr1 <= r && r < surr2 && i+1 < sLen &&
            surr2 <= s[i+1] && s[i+1] < surr3:
            // valid surrogate sequence
            a[n] = DecodeRune(rune(r), rune(s[i+1]))
            i++
            n++
        default:
            // invalid surrogate sequence
            a[n] = replacementChar
            n++
        }
    }
    return a[:n]
}

Attachments:

  1. utf16.go (2832 bytes)

@minux
Copy link
Member

@minux minux commented Dec 15, 2013

Comment 2:

for the interested, the diff is:
--- pkg/unicode/utf16/utf16.go  2013-08-18 14:53:21.000000000 -0400
+++ ~/Downloads/utf16.go    2013-12-15 16:57:24.000000000 -0500
@@ -36,7 +36,7 @@
 // the Unicode replacement code point U+FFFD.
 func DecodeRune(r1, r2 rune) rune {
    if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 {
-       return (rune(r1)-surr1)<<10 | (rune(r2) - surr2) + 0x10000
+       return (r1-surr1)<<10 | (r2 - surr2) + surrSelf
    }
    return replacementChar
 }
@@ -45,11 +45,11 @@
 // If the rune is not a valid Unicode code point or does not need encoding,
 // EncodeRune returns U+FFFD, U+FFFD.
 func EncodeRune(r rune) (r1, r2 rune) {
-   if r < surrSelf || r > maxRune || IsSurrogate(r) {
+   if r < surrSelf || r > maxRune {
        return replacementChar, replacementChar
    }
    r -= surrSelf
-   return surr1 + (r>>10)&0x3ff, surr2 + r&0x3ff
+   return surr1 + r>>10, surr2 + r&0x3ff
 }
 
 // Encode returns the UTF-16 encoding of the Unicode code point sequence s.
@@ -63,46 +63,49 @@
 
    a := make([]uint16, n)
    n = 0
+   var r1, r2 rune
    for _, v := range s {
        switch {
-       case v < 0, surr1 <= v && v < surr3, v > maxRune:
-           v = replacementChar
-           fallthrough
-       case v < surrSelf:
+       case 0 <= v && v < surr1, surr3 <= v && v < surrSelf:
            a[n] = uint16(v)
            n++
-       default:
-           r1, r2 := EncodeRune(v)
+       case surrSelf <= v && v <= maxRune:
+           r1, r2 = EncodeRune(v)
            a[n] = uint16(r1)
            a[n+1] = uint16(r2)
            n += 2
+       default:
+           a[n] = uint16(replacementChar)
+           n++
        }
    }
-   return a[0:n]
+   return a[:n]
 }
 
 // Decode returns the Unicode code point sequence represented
 // by the UTF-16 encoding s.
 func Decode(s []uint16) []rune {
-   a := make([]rune, len(s))
+   sLen := len(s)
+   a := make([]rune, sLen)
    n := 0
-   for i := 0; i < len(s); i++ {
-       switch r := s[i]; {
-       case surr1 <= r && r < surr2 && i+1 < len(s) &&
+   var r uint16
+   for i := 0; i < sLen; i++ {
+       switch r = s[i]; {
+       case r < surr1, surr3 <= r:
+           // normal rune
+           a[n] = rune(r)
+           n++
+       case surr1 <= r && r < surr2 && i+1 < sLen &&
            surr2 <= s[i+1] && s[i+1] < surr3:
            // valid surrogate sequence
            a[n] = DecodeRune(rune(r), rune(s[i+1]))
            i++
            n++
-       case surr1 <= r && r < surr3:
+       default:
            // invalid surrogate sequence
            a[n] = replacementChar
            n++
-       default:
-           // normal rune
-           a[n] = rune(r)
-           n++
        }
    }
-   return a[0:n]
+   return a[:n]
 }

@minux
Copy link
Member

@minux minux commented Dec 15, 2013

Comment 3:

If you want to suggest performance optimization to unicode/utf16, please
follow the steps here: http://golang.org/doc/contribute.html and send CL
to golang-dev.
I suggest you also add benchmarks to unicode/utf16 and demonstrate the
performance differences in your CL description (use $GOROOT/misc/benchcmp
to compare output of "go test -bench=. unicode/utf16")

Labels changed: added release-none, repo-main, performance.

@alexbrainman
Copy link
Member

@alexbrainman alexbrainman commented Dec 15, 2013

Comment 4:

This is very old code. I wouldn't be surprised.
Alex

@gopherbot
Copy link
Author

@gopherbot gopherbot commented Dec 16, 2013

Comment 5 by chanxuehong:

thanks all of you, my English sucks, for here the process is not very
understanding, can only do so much of that is,
If you think I submitted a bit useful, please help me to submit and
communication, thank you. We all want golang better.
Also, my submit not modify the bug,  but remove unnecessary statement,
modify the judgment order of switch case.
// Copyright 2010 The Go Authors.  All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package utf16 implements encoding and decoding of UTF-16 sequences.
package utf16
// The conditions replacementChar==unicode.ReplacementChar and
// maxRune==unicode.MaxRune are verified in the tests.
// Defining them locally avoids this package depending on package unicode.
const (
    replacementChar = '\uFFFD'     // Unicode replacement character
    maxRune         = '\U0010FFFF' // Maximum valid Unicode code point.
)
const (
    // 0xd800-0xdc00 encodes the high 10 bits of a pair.
    // 0xdc00-0xe000 encodes the low 10 bits of a pair.
    // the value is those 20 bits plus 0x10000.
    surr1 = 0xd800
    surr2 = 0xdc00
    surr3 = 0xe000
    surrSelf = 0x10000
)
// IsSurrogate returns true if the specified Unicode code point
// can appear in a surrogate pair.
func IsSurrogate(r rune) bool {
    return surr1 <= r && r < surr3
}
// decodeRune returns the UTF-16 decoding of a surrogate pair.
// the pair r1, r2 is a valid UTF-16 surrogate pair.
func decodeRune(r1, r2 rune) rune {
    return (r1-surr1)<<10 | (r2 - surr2) + surrSelf
}
// DecodeRune returns the UTF-16 decoding of a surrogate pair.
// If the pair is not a valid UTF-16 surrogate pair, DecodeRune returns
// the Unicode replacement code point U+FFFD.
func DecodeRune(r1, r2 rune) rune {
    if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 {
        return decodeRune(r1, r2)
    }
    return replacementChar
}
// encodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune.
// surrSelf <= r && r <= maxRune
func encodeRune(r rune) (r1, r2 rune) {
    r -= surrSelf
    return surr1 + r>>10, surr2 + r&0x3ff
}
// EncodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune.
// If the rune is not a valid Unicode code point or does not need encoding,
// EncodeRune returns U+FFFD, U+FFFD.
func EncodeRune(r rune) (r1, r2 rune) {
    if r < surrSelf || r > maxRune {
        return replacementChar, replacementChar
    }
    return encodeRune(r)
}
// Encode returns the UTF-16 encoding of the Unicode code point sequence s.
func Encode(s []rune) []uint16 {
    n := len(s)
    for _, v := range s {
        if v >= surrSelf {
            n++
        }
    }
    a := make([]uint16, n)
    n = 0
    var r1, r2 rune
    for _, v := range s {
        switch {
        case 0 <= v && v < surr1, surr3 <= v && v < surrSelf:
            a[n] = uint16(v)
            n++
        case surrSelf <= v && v <= maxRune:
            r1, r2 = encodeRune(v)
            a[n] = uint16(r1)
            a[n+1] = uint16(r2)
            n += 2
        default:
            a[n] = uint16(replacementChar)
            n++
        }
    }
    return a[:n]
}
// Decode returns the Unicode code point sequence represented
// by the UTF-16 encoding s.
func Decode(s []uint16) []rune {
    sLen := len(s)
    a := make([]rune, sLen)
    n := 0
    var r uint16
    for i := 0; i < sLen; i++ {
        switch r = s[i]; {
        case r < surr1, surr3 <= r:
            // normal rune
            a[n] = rune(r)
            n++
        case surr1 <= r && r < surr2 && i+1 < sLen &&
            surr2 <= s[i+1] && s[i+1] < surr3:
            // valid surrogate sequence
            a[n] = decodeRune(rune(r), rune(s[i+1]))
            i++
            n++
        default:
            // invalid surrogate sequence
            a[n] = replacementChar
            n++
        }
    }
    return a[:n]
}

@davecheney
Copy link
Contributor

@davecheney davecheney commented Feb 4, 2014

Comment 6:

Marking as accepted, but please note this has not been assigned to a release and may not
be worked on for 1.3 or later.
@chanxuehong please consider following the contribution guidelines and submitting a
proposal. I think this improvement is valid, but we cannot for reasons of policy accept
patches except via the Rietveld system.

Labels changed: added suggested.

Status changed to Accepted.

@gopherbot gopherbot added accepted Performance Suggested labels Feb 4, 2014
@rsc rsc added this to the Unplanned milestone Apr 10, 2015
@gopherbot
Copy link
Author

@gopherbot gopherbot commented Feb 22, 2016

CL https://golang.org/cl/19820 mentions this issue.

gopherbot pushed a commit that referenced this issue Feb 23, 2016
For #6957

Change-Id: Ic497c12f33efc933e9fe81f6cd1b2a0a01abbabf
Reviewed-on: https://go-review.googlesource.com/19820
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
@gopherbot
Copy link
Author

@gopherbot gopherbot commented Feb 23, 2016

CL https://golang.org/cl/19826 mentions this issue.

gopherbot pushed a commit that referenced this issue Feb 23, 2016
name                        old time/op  new time/op  delta
DecodeValidASCII-4          94.7ns ± 1%  87.4ns ± 1%  -7.71%  (p=0.000 n=10+9)
DecodeValidJapaneseChars-4  91.0ns ± 2%  84.8ns ± 0%  -6.77%  (p=0.000 n=9+10)
DecodeRune-4                16.5ns ± 0%  16.6ns ± 2%    ~     (p=0.108 n=9+10)

For #6957

Change-Id: I618c15c2a42ef7ec6a5cd163b7c3f1a65ca4ad01
Reviewed-on: https://go-review.googlesource.com/19826
Reviewed-by: Rob Pike <r@golang.org>
@gopherbot
Copy link
Author

@gopherbot gopherbot commented Feb 24, 2016

CL https://golang.org/cl/19891 mentions this issue.

@golang golang locked and limited conversation to collaborators Feb 28, 2017
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
FrozenDueToAge Performance Suggested
Projects
None yet
Development

No branches or pull requests

5 participants