bzip2: improve performance

Improve performance of move-to-front by using cache-friendly
copies instead of a doubly-linked list. Simplify so that the
underlying slice is the object. Remove the n=0 special case,
which was actually slower with the copy approach.

benchmark                 old ns/op     new ns/op     delta
BenchmarkDecodeDigits     26429714      23859699      -9.72%
BenchmarkDecodeTwain      76684510      67591946      -11.86%

benchmark                 old MB/s     new MB/s     speedup
BenchmarkDecodeDigits     1.63         1.81         1.11x
BenchmarkDecodeTwain      1.63         1.85         1.13x

Updates #6754.

LGTM=adg, agl, josharian
R=adg, agl, josharian
CC=golang-codereviews
https://golang.org/cl/131840043
golang · Aug 18, 2014 · 6d248ce · 6d248ce
1 parent 523aa93
commit 6d248ce
Show file tree

Hide file tree

Showing 2 changed files with 55 additions and 62 deletions.
diff --git a/src/pkg/compress/bzip2/bzip2_test.go b/src/pkg/compress/bzip2/bzip2_test.go
@@ -216,6 +216,44 @@ func TestOutOfRangeSelector(t *testing.T) {
 	ioutil.ReadAll(decompressor)
 }
 
+func TestMTF(t *testing.T) {
+	mtf := newMTFDecoderWithRange(5)
+
+	// 0 1 2 3 4
+	expect := byte(1)
+	x := mtf.Decode(1)
+	if x != expect {
+		t.Errorf("expected %v, got %v", expect, x)
+	}
+
+	// 1 0 2 3 4
+	x = mtf.Decode(0)
+	if x != expect {
+		t.Errorf("expected %v, got %v", expect, x)
+	}
+
+	// 1 0 2 3 4
+	expect = byte(0)
+	x = mtf.Decode(1)
+	if x != expect {
+		t.Errorf("expected %v, got %v", expect, x)
+	}
+
+	// 0 1 2 3 4
+	expect = byte(4)
+	x = mtf.Decode(4)
+	if x != expect {
+		t.Errorf("expected %v, got %v", expect, x)
+	}
+
+	// 4 0 1 2 3
+	expect = byte(0)
+	x = mtf.Decode(1)
+	if x != expect {
+		t.Errorf("expected %v, got %v", expect, x)
+	}
+}
+
 var bufferOverrunBase64 string = `
 QlpoNTFBWSZTWTzyiGcACMP/////////////////////////////////3/7f3///
 ////4N/fCZODak2Xo44GIHZgkGzDRbFAuwAAKoFV7T6AO6qwA6APb6s2rOoAkAAD

diff --git a/src/pkg/compress/bzip2/move_to_front.go b/src/pkg/compress/bzip2/move_to_front.go
@@ -11,88 +11,43 @@ package bzip2
 // index into that list. When a symbol is referenced, it's moved to the front
 // of the list. Thus, a repeated symbol ends up being encoded with many zeros,
 // as the symbol will be at the front of the list after the first access.
-type moveToFrontDecoder struct {
+type moveToFrontDecoder []byte
-	// Rather than actually keep the list in memory, the symbols are stored
-	// as a circular, double linked list with the symbol indexed by head
-	// at the front of the list.
-	symbols [256]byte
-	next    [256]uint8
-	prev    [256]uint8
-	head    uint8
-	len     int
-}
 
 // newMTFDecoder creates a move-to-front decoder with an explicit initial list
 // of symbols.
-func newMTFDecoder(symbols []byte) *moveToFrontDecoder {
+func newMTFDecoder(symbols []byte) moveToFrontDecoder {
 	if len(symbols) > 256 {
 		panic("too many symbols")
 	}
-
+	return moveToFrontDecoder(symbols)
-	m := new(moveToFrontDecoder)
-	copy(m.symbols[:], symbols)
-	m.len = len(symbols)
-	m.threadLinkedList()
-	return m
 }
 
 // newMTFDecoderWithRange creates a move-to-front decoder with an initial
 // symbol list of 0...n-1.
-func newMTFDecoderWithRange(n int) *moveToFrontDecoder {
+func newMTFDecoderWithRange(n int) moveToFrontDecoder {
 	if n > 256 {
 		panic("newMTFDecoderWithRange: cannot have > 256 symbols")
 	}
 
-	m := new(moveToFrontDecoder)
+	m := make([]byte, n)
 	for i := 0; i < n; i++ {
-		m.symbols[byte(i)] = byte(i)
+		m[i] = byte(i)
-	}
-	m.len = n
-	m.threadLinkedList()
-	return m
-}
-
-// threadLinkedList creates the initial linked-list pointers.
-func (m *moveToFrontDecoder) threadLinkedList() {
-	if m.len == 0 {
-		return
-	}
-
-	m.prev[0] = uint8(m.len - 1)
-
-	for i := byte(0); int(i) < m.len-1; i++ {
-		m.next[i] = uint8(i + 1)
-		m.prev[i+1] = uint8(i)
 	}
-
+	return moveToFrontDecoder(m)
-	m.next[m.len-1] = 0
 }
 
-func (m *moveToFrontDecoder) Decode(n int) (b byte) {
+func (m moveToFrontDecoder) Decode(n int) (b byte) {
-	// Most of the time, n will be zero so it's worth dealing with this
+	// Implement move-to-front with a simple copy. This approach
-	// simple case.
+	// beats more sophisticated approaches in benchmarking, probably
-	if n == 0 {
+	// because it has high locality of reference inside of a
-		return m.symbols[m.head]
+	// single cache line (most move-to-front operations have n < 64).
-	}
+	b = m[n]
-
+	copy(m[1:], m[:n])
-	i := m.head
+	m[0] = b
-	for j := 0; j < n; j++ {
-		i = m.next[i]
-	}
-	b = m.symbols[i]
-
-	m.next[m.prev[i]] = m.next[i]
-	m.prev[m.next[i]] = m.prev[i]
-	m.next[i] = m.head
-	m.prev[i] = m.prev[m.head]
-	m.next[m.prev[m.head]] = i
-	m.prev[m.head] = i
-	m.head = i
-
 	return
 }
 
 // First returns the symbol at the front of the list.
-func (m *moveToFrontDecoder) First() byte {
+func (m moveToFrontDecoder) First() byte {
-	return m.symbols[m.head]
+	return m[0]
 }