Permalink
Browse files

compress/flate: eliminate most common bounds checks

This uses the SSA compiler to eliminate various unneeded bounds
checks in loops and various lookups.

This fixes the low hanging fruit, without any major code changes.

name                       old time/op    new time/op    delta
EncodeDigitsHuffman1e4-8     49.9µs ± 1%    48.1µs ± 1%  -3.74%   (p=0.000 n=10+9)
EncodeDigitsHuffman1e5-8      476µs ± 1%     458µs ± 1%  -3.58%  (p=0.000 n=10+10)
EncodeDigitsHuffman1e6-8     4.80ms ± 2%    4.56ms ± 1%  -5.07%   (p=0.000 n=10+9)
EncodeDigitsSpeed1e4-8        305µs ± 3%     290µs ± 2%  -5.03%   (p=0.000 n=10+9)
EncodeDigitsSpeed1e5-8       3.67ms ± 2%    3.49ms ± 2%  -4.78%   (p=0.000 n=9+10)
EncodeDigitsSpeed1e6-8       38.3ms ± 2%    35.8ms ± 1%  -6.58%   (p=0.000 n=9+10)
EncodeDigitsDefault1e4-8      361µs ± 2%     346µs ± 3%  -4.12%   (p=0.000 n=10+9)
EncodeDigitsDefault1e5-8     5.24ms ± 2%    4.96ms ± 3%  -5.38%  (p=0.000 n=10+10)
EncodeDigitsDefault1e6-8     56.5ms ± 3%    52.2ms ± 2%  -7.68%  (p=0.000 n=10+10)
EncodeDigitsCompress1e4-8     362µs ± 2%     343µs ± 1%  -5.20%   (p=0.000 n=10+9)
EncodeDigitsCompress1e5-8    5.26ms ± 3%    4.98ms ± 2%  -5.48%  (p=0.000 n=10+10)
EncodeDigitsCompress1e6-8    56.0ms ± 4%    52.1ms ± 1%  -7.01%  (p=0.000 n=10+10)
EncodeTwainHuffman1e4-8      70.9µs ± 3%    64.7µs ± 1%  -8.68%   (p=0.000 n=10+9)
EncodeTwainHuffman1e5-8       556µs ± 2%     524µs ± 2%  -5.84%  (p=0.000 n=10+10)
EncodeTwainHuffman1e6-8      5.54ms ± 3%    5.22ms ± 2%  -5.70%  (p=0.000 n=10+10)
EncodeTwainSpeed1e4-8         294µs ± 3%     284µs ± 1%  -3.71%  (p=0.000 n=10+10)
EncodeTwainSpeed1e5-8        2.59ms ± 2%    2.48ms ± 1%  -4.14%   (p=0.000 n=10+9)
EncodeTwainSpeed1e6-8        25.6ms ± 1%    24.3ms ± 1%  -5.28%   (p=0.000 n=9+10)
EncodeTwainDefault1e4-8       419µs ± 2%     396µs ± 1%  -5.59%   (p=0.000 n=10+9)
EncodeTwainDefault1e5-8      6.23ms ± 4%    5.75ms ± 1%  -7.83%   (p=0.000 n=10+9)
EncodeTwainDefault1e6-8      66.2ms ± 2%    61.4ms ± 1%  -7.22%  (p=0.000 n=10+10)
EncodeTwainCompress1e4-8      426µs ± 1%     405µs ± 1%  -4.97%   (p=0.000 n=9+10)
EncodeTwainCompress1e5-8     6.80ms ± 1%    6.32ms ± 1%  -6.97%   (p=0.000 n=9+10)
EncodeTwainCompress1e6-8     74.6ms ± 3%    68.7ms ± 1%  -7.90%   (p=0.000 n=10+9)

name                       old speed      new speed      delta
EncodeDigitsHuffman1e4-8    200MB/s ± 1%   208MB/s ± 1%  +3.88%   (p=0.000 n=10+9)
EncodeDigitsHuffman1e5-8    210MB/s ± 1%   218MB/s ± 1%  +3.71%  (p=0.000 n=10+10)
EncodeDigitsHuffman1e6-8    208MB/s ± 2%   219MB/s ± 1%  +5.32%   (p=0.000 n=10+9)
EncodeDigitsSpeed1e4-8     32.8MB/s ± 3%  34.5MB/s ± 2%  +5.29%   (p=0.000 n=10+9)
EncodeDigitsSpeed1e5-8     27.2MB/s ± 2%  28.6MB/s ± 2%  +5.29%  (p=0.000 n=10+10)
EncodeDigitsSpeed1e6-8     26.1MB/s ± 2%  27.9MB/s ± 1%  +7.02%   (p=0.000 n=9+10)
EncodeDigitsDefault1e4-8   27.7MB/s ± 2%  28.9MB/s ± 3%  +4.30%   (p=0.000 n=10+9)
EncodeDigitsDefault1e5-8   19.1MB/s ± 2%  20.2MB/s ± 3%  +5.69%  (p=0.000 n=10+10)
EncodeDigitsDefault1e6-8   17.7MB/s ± 3%  19.2MB/s ± 2%  +8.31%  (p=0.000 n=10+10)
EncodeDigitsCompress1e4-8  27.6MB/s ± 2%  29.1MB/s ± 1%  +5.47%   (p=0.000 n=10+9)
EncodeDigitsCompress1e5-8  19.0MB/s ± 3%  20.1MB/s ± 2%  +5.78%  (p=0.000 n=10+10)
EncodeDigitsCompress1e6-8  17.9MB/s ± 4%  19.2MB/s ± 1%  +7.50%  (p=0.000 n=10+10)
EncodeTwainHuffman1e4-8     141MB/s ± 3%   154MB/s ± 1%  +9.46%   (p=0.000 n=10+9)
EncodeTwainHuffman1e5-8     180MB/s ± 2%   191MB/s ± 2%  +6.19%  (p=0.000 n=10+10)
EncodeTwainHuffman1e6-8     181MB/s ± 3%   192MB/s ± 2%  +6.02%  (p=0.000 n=10+10)
EncodeTwainSpeed1e4-8      34.0MB/s ± 3%  35.3MB/s ± 1%  +3.84%  (p=0.000 n=10+10)
EncodeTwainSpeed1e5-8      38.7MB/s ± 2%  40.3MB/s ± 1%  +4.30%   (p=0.000 n=10+9)
EncodeTwainSpeed1e6-8      39.1MB/s ± 1%  41.2MB/s ± 1%  +5.57%   (p=0.000 n=9+10)
EncodeTwainDefault1e4-8    23.9MB/s ± 2%  25.3MB/s ± 1%  +5.91%   (p=0.000 n=10+9)
EncodeTwainDefault1e5-8    16.0MB/s ± 4%  17.4MB/s ± 1%  +8.47%   (p=0.000 n=10+9)
EncodeTwainDefault1e6-8    15.1MB/s ± 2%  16.3MB/s ± 1%  +7.76%  (p=0.000 n=10+10)
EncodeTwainCompress1e4-8   23.5MB/s ± 1%  24.7MB/s ± 1%  +5.24%   (p=0.000 n=9+10)
EncodeTwainCompress1e5-8   14.7MB/s ± 1%  15.8MB/s ± 1%  +7.50%   (p=0.000 n=9+10)
EncodeTwainCompress1e6-8   13.4MB/s ± 3%  14.6MB/s ± 1%  +8.57%   (p=0.000 n=10+9)

Change-Id: I5c7e84c2f9ea4d38a2115995705eebb93387e22f
Reviewed-on: https://go-review.googlesource.com/21759
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
  • Loading branch information...
klauspost authored and bradfitz committed Apr 10, 2016
1 parent 012557b commit f20b1809f213c662932106a68c76ea3545eab1ee
Showing with 48 additions and 48 deletions.
  1. +17 −20 src/compress/flate/deflate.go
  2. +29 −26 src/compress/flate/huffman_bit_writer.go
  3. +2 −2 src/compress/flate/huffman_code.go
@@ -73,8 +73,8 @@ type compressor struct {
// hashPrev[hashHead[hashValue] & windowMask] contains the previous index
// with the same hash value.
chainHead int
hashHead []uint32
hashPrev []uint32
hashHead [hashSize]uint32
hashPrev [windowSize]uint32
hashOffset int
// input window: unprocessed data is window[index:windowEnd]
@@ -188,12 +188,13 @@ func (d *compressor) fillWindow(b []byte) {
var newH uint32
for i, val := range dst {
di := i + index
newH = val & hashMask
newH = val
hh := &d.hashHead[newH&hashMask]
// Get previous value with the same hash.
// Our chain should point to the previous value.
d.hashPrev[di&windowMask] = d.hashHead[newH]
d.hashPrev[di&windowMask] = *hh
// Set the head of the hash chain to us.
d.hashHead[newH] = uint32(di + d.hashOffset)
*hh = uint32(di + d.hashOffset)
}
d.hash = newH
}
@@ -293,6 +294,7 @@ func bulkHash4(b []byte, dst []uint32) {
// bytes in size.
func matchLen(a, b []byte, max int) int {
a = a[:max]
b = b[:len(a)]
for i, av := range a {
if b[i] != av {
return i
@@ -302,8 +304,6 @@ func matchLen(a, b []byte, max int) int {
}
func (d *compressor) initDeflate() {
d.hashHead = make([]uint32, hashSize)
d.hashPrev = make([]uint32, windowSize)
d.window = make([]byte, 2*windowSize)
d.hashOffset = 1
d.tokens = make([]token, 0, maxFlateBlockTokens+1)
@@ -358,9 +358,10 @@ Loop:
if d.index < d.maxInsertIndex {
// Update the hash
d.hash = hash4(d.window[d.index : d.index+minMatchLength])
d.chainHead = int(d.hashHead[d.hash])
hh := &d.hashHead[d.hash&hashMask]
d.chainHead = int(*hh)
d.hashPrev[d.index&windowMask] = uint32(d.chainHead)
d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
*hh = uint32(d.index + d.hashOffset)
}
prevLength := d.length
prevOffset := d.offset
@@ -404,9 +405,10 @@ Loop:
d.hash = hash4(d.window[d.index : d.index+minMatchLength])
// Get previous value with the same hash.
// Our chain should point to the previous value.
d.hashPrev[d.index&windowMask] = d.hashHead[d.hash]
hh := &d.hashHead[d.hash&hashMask]
d.hashPrev[d.index&windowMask] = *hh
// Set the head of the hash chain to us.
d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
*hh = uint32(d.index + d.hashOffset)
}
}
if d.fastSkipHashing == skipNever {
@@ -531,9 +533,6 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
return nil
}
// hzeroes is used for zeroing the hash slice.
var hzeroes [256]uint32
func (d *compressor) reset(w io.Writer) {
d.w.reset(w)
d.sync = false
@@ -543,15 +542,13 @@ func (d *compressor) reset(w io.Writer) {
d.windowEnd = 0
default:
d.chainHead = -1
for s := d.hashHead; len(s) > 0; {
n := copy(s, hzeroes[:])
s = s[n:]
for i := range d.hashHead {
d.hashHead[i] = 0
}
for s := d.hashPrev; len(s) > 0; s = s[len(hzeroes):] {
copy(s, hzeroes[:])
for i := range d.hashPrev {
d.hashPrev[i] = 0
}
d.hashOffset = 1
d.index, d.windowEnd = 0, 0
d.blockStart, d.byteAvailable = 0, false
d.tokens = d.tokens[:0]
@@ -84,11 +84,11 @@ type huffmanBitWriter struct {
bits uint64
nbits uint
bytes [bufferSize]byte
codegenFreq [codegenCodeCount]int32
nbytes int
literalFreq []int32
offsetFreq []int32
codegen []uint8
codegenFreq []int32
literalEncoding *huffmanEncoder
offsetEncoding *huffmanEncoder
codegenEncoding *huffmanEncoder
@@ -101,7 +101,6 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
literalFreq: make([]int32, maxNumLit),
offsetFreq: make([]int32, offsetCodeCount),
codegen: make([]uint8, maxNumLit+offsetCodeCount+1),
codegenFreq: make([]int32, codegenCodeCount),
literalEncoding: newHuffmanEncoder(maxNumLit),
codegenEncoding: newHuffmanEncoder(codegenCodeCount),
offsetEncoding: newHuffmanEncoder(offsetCodeCount),
@@ -143,12 +142,13 @@ func (w *huffmanBitWriter) writeBits(b int32, nb uint) {
w.bits >>= 48
w.nbits -= 48
n := w.nbytes
w.bytes[n+0] = byte(bits)
w.bytes[n+1] = byte(bits >> 8)
w.bytes[n+2] = byte(bits >> 16)
w.bytes[n+3] = byte(bits >> 24)
w.bytes[n+4] = byte(bits >> 32)
w.bytes[n+5] = byte(bits >> 40)
bytes := w.bytes[n : n+6]
bytes[0] = byte(bits)
bytes[1] = byte(bits >> 8)
bytes[2] = byte(bits >> 16)
bytes[3] = byte(bits >> 24)
bytes[4] = byte(bits >> 32)
bytes[5] = byte(bits >> 40)
n += 6
if n >= bufferFlushSize {
_, w.err = w.w.Write(w.bytes[:n])
@@ -293,12 +293,13 @@ func (w *huffmanBitWriter) writeCode(c hcode) {
w.bits >>= 48
w.nbits -= 48
n := w.nbytes
w.bytes[n+0] = byte(bits)
w.bytes[n+1] = byte(bits >> 8)
w.bytes[n+2] = byte(bits >> 16)
w.bytes[n+3] = byte(bits >> 24)
w.bytes[n+4] = byte(bits >> 32)
w.bytes[n+5] = byte(bits >> 40)
bytes := w.bytes[n : n+6]
bytes[0] = byte(bits)
bytes[1] = byte(bits >> 8)
bytes[2] = byte(bits >> 16)
bytes[3] = byte(bits >> 24)
bytes[4] = byte(bits >> 32)
bytes[5] = byte(bits >> 40)
n += 6
if n >= bufferFlushSize {
_, w.err = w.w.Write(w.bytes[:n])
@@ -428,13 +429,13 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
// Generate codegen and codegenFrequencies, which indicates how to encode
// the literalEncoding and the offsetEncoding.
w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
w.codegenEncoding.generate(w.codegenFreq, 7)
w.codegenEncoding.generate(w.codegenFreq[:], 7)
numCodegens = len(w.codegenFreq)
for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
numCodegens--
}
dynamicHeader := int64(3+5+5+4+(3*numCodegens)) +
w.codegenEncoding.bitLength(w.codegenFreq) +
w.codegenEncoding.bitLength(w.codegenFreq[:]) +
int64(extraBits) +
int64(w.codegenFreq[16]*2) +
int64(w.codegenFreq[17]*3) +
@@ -482,7 +483,7 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []b
// Generate codegen and codegenFrequencies, which indicates how to encode
// the literalEncoding and the offsetEncoding.
w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
w.codegenEncoding.generate(w.codegenFreq, 7)
w.codegenEncoding.generate(w.codegenFreq[:], 7)
numCodegens := len(w.codegenFreq)
for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
numCodegens--
@@ -609,13 +610,13 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
// Generate codegen and codegenFrequencies, which indicates how to encode
// the literalEncoding and the offsetEncoding.
w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
w.codegenEncoding.generate(w.codegenFreq, 7)
w.codegenEncoding.generate(w.codegenFreq[:], 7)
numCodegens = len(w.codegenFreq)
for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
numCodegens--
}
headerSize := int64(3+5+5+4+(3*numCodegens)) +
w.codegenEncoding.bitLength(w.codegenFreq) +
w.codegenEncoding.bitLength(w.codegenFreq[:]) +
int64(w.codegenFreq[16]*2) +
int64(w.codegenFreq[17]*3) +
int64(w.codegenFreq[18]*7)
@@ -639,7 +640,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
// Huffman.
w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
encoding := w.literalEncoding.codes
encoding := w.literalEncoding.codes[:257]
n := w.nbytes
for _, t := range input {
// Bitwriting inlined, ~30% speedup
@@ -653,12 +654,13 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
bits := w.bits
w.bits >>= 48
w.nbits -= 48
w.bytes[n+0] = byte(bits)
w.bytes[n+1] = byte(bits >> 8)
w.bytes[n+2] = byte(bits >> 16)
w.bytes[n+3] = byte(bits >> 24)
w.bytes[n+4] = byte(bits >> 32)
w.bytes[n+5] = byte(bits >> 40)
bytes := w.bytes[n : n+6]
bytes[0] = byte(bits)
bytes[1] = byte(bits >> 8)
bytes[2] = byte(bits >> 16)
bytes[3] = byte(bits >> 24)
bytes[4] = byte(bits >> 32)
bytes[5] = byte(bits >> 40)
n += 6
if n < bufferFlushSize {
continue
@@ -677,6 +679,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
//
// len(h) must be >= 256, and h's elements must be all zeroes.
func histogram(b []byte, h []int32) {
h = h[:256]
for _, t := range b {
h[t]++
}
@@ -96,8 +96,8 @@ func generateFixedLiteralEncoding() *huffmanEncoder {
func generateFixedOffsetEncoding() *huffmanEncoder {
h := newHuffmanEncoder(30)
codes := h.codes
for ch := uint16(0); ch < 30; ch++ {
codes[ch] = hcode{code: reverseBits(ch, 5), len: 5}
for ch := range codes {
codes[ch] = hcode{code: reverseBits(uint16(ch), 5), len: 5}
}
return h
}

0 comments on commit f20b180

Please sign in to comment.