diff --git a/mat64/dense_arithmetic.go b/mat64/dense_arithmetic.go
index 725e89c..aefa7c1 100644
--- a/mat64/dense_arithmetic.go
+++ b/mat64/dense_arithmetic.go
@@ -351,7 +351,6 @@ func (m *Dense) Dot(b Matrix) float64 {
 }
 
 // Mul takes the matrix product of a and b, placing the result in the receiver.
-// If a or b are also the receiver, temporary workspace memory will be allocated.
 //
 // See the Muler interface for more information.
 func (m *Dense) Mul(a, b Matrix) {
@@ -362,22 +361,22 @@ func (m *Dense) Mul(a, b Matrix) {
 		panic(ErrShape)
 	}
 
+	m.reuseAs(ar, bc)
 	var w *Dense
 	if m != a && m != b {
 		w = m
 	} else {
-		m.reuseAs(ar, bc)
-		w = &Dense{}
+		w = getWorkspace(ar, bc, false)
+		defer func() {
+			m.Copy(w)
+			putWorkspace(w)
+		}()
 	}
-	w.reuseAs(ar, bc)
 
 	if a, ok := a.(RawMatrixer); ok {
 		if b, ok := b.(RawMatrixer); ok {
 			amat, bmat := a.RawMatrix(), b.RawMatrix()
 			blas64.Gemm(blas.NoTrans, blas.NoTrans, 1, amat, bmat, 0, w.mat)
-			if w != m {
-				m.Copy(w)
-			}
 			return
 		}
 	}
@@ -395,9 +394,6 @@ func (m *Dense) Mul(a, b Matrix) {
 					)
 				}
 			}
-			if w != m {
-				m.Copy(w)
-			}
 			return
 		}
 	}
@@ -415,14 +411,10 @@ func (m *Dense) Mul(a, b Matrix) {
 			w.mat.Data[r*w.mat.Stride+c] = v
 		}
 	}
-	if w != m {
-		m.Copy(w)
-	}
 }
 
 // MulTrans takes the matrix product of a and b, optionally transposing each,
-// and placing the result in the receiver. If a or b are also the receiver,
-// temporary workspace memory will be allocated.
+// and placing the result in the receiver.
 //
 // See the MulTranser interface for more information.
 func (m *Dense) MulTrans(a Matrix, aTrans bool, b Matrix, bTrans bool) {
@@ -440,14 +432,17 @@ func (m *Dense) MulTrans(a Matrix, aTrans bool, b Matrix, bTrans bool) {
 		panic(ErrShape)
 	}
 
+	m.reuseAs(ar, bc)
 	var w *Dense
 	if m != a && m != b {
 		w = m
 	} else {
-		m.reuseAs(ar, bc)
-		w = &Dense{}
+		w = getWorkspace(ar, bc, false)
+		defer func() {
+			m.Copy(w)
+			putWorkspace(w)
+		}()
 	}
-	w.reuseAs(ar, bc)
 
 	if a, ok := a.(RawMatrixer); ok {
 		if b, ok := b.(RawMatrixer); ok {
@@ -483,10 +478,6 @@ func (m *Dense) MulTrans(a Matrix, aTrans bool, b Matrix, bTrans bool) {
 				bmat := b.RawMatrix()
 				blas64.Gemm(aOp, bOp, 1, amat, bmat, 0, w.mat)
 			}
-
-			if w != m {
-				m.Copy(w)
-			}
 			return
 		}
 	}
@@ -506,9 +497,6 @@ func (m *Dense) MulTrans(a Matrix, aTrans bool, b Matrix, bTrans bool) {
 							)
 						}
 					}
-					if w != m {
-						m.Copy(w)
-					}
 					return
 				}
 				// TODO(jonlawlor): determine if (b*a)' is more efficient
@@ -521,9 +509,6 @@ func (m *Dense) MulTrans(a Matrix, aTrans bool, b Matrix, bTrans bool) {
 						)
 					}
 				}
-				if w != m {
-					m.Copy(w)
-				}
 				return
 			}
 			if bTrans {
@@ -536,9 +521,6 @@ func (m *Dense) MulTrans(a Matrix, aTrans bool, b Matrix, bTrans bool) {
 						)
 					}
 				}
-				if w != m {
-					m.Copy(w)
-				}
 				return
 			}
 			for r := 0; r < ar; r++ {
@@ -550,9 +532,6 @@ func (m *Dense) MulTrans(a Matrix, aTrans bool, b Matrix, bTrans bool) {
 					)
 				}
 			}
-			if w != m {
-				m.Copy(w)
-			}
 			return
 		}
 	}
@@ -573,9 +552,6 @@ func (m *Dense) MulTrans(a Matrix, aTrans bool, b Matrix, bTrans bool) {
 					dataTmp[c] = v
 				}
 			}
-			if w != m {
-				m.Copy(w)
-			}
 			return
 		}
 
@@ -592,9 +568,6 @@ func (m *Dense) MulTrans(a Matrix, aTrans bool, b Matrix, bTrans bool) {
 				dataTmp[c] = v
 			}
 		}
-		if w != m {
-			m.Copy(w)
-		}
 		return
 	}
 	if bTrans {
@@ -611,9 +584,6 @@ func (m *Dense) MulTrans(a Matrix, aTrans bool, b Matrix, bTrans bool) {
 				dataTmp[c] = v
 			}
 		}
-		if w != m {
-			m.Copy(w)
-		}
 		return
 	}
 	for r := 0; r < ar; r++ {
@@ -629,9 +599,6 @@ func (m *Dense) MulTrans(a Matrix, aTrans bool, b Matrix, bTrans bool) {
 			dataTmp[c] = v
 		}
 	}
-	if w != m {
-		m.Copy(w)
-	}
 }
 
 // Exp calculates the exponential of the matrix a, e^a, placing the result
@@ -734,20 +701,25 @@ func (m *Dense) Pow(a Matrix, n int) {
 	}
 
 	// Perform iterative exponentiation by squaring in work space.
-	var w, s, x Dense
-	w.Clone(a)
-	s.Clone(a)
+	w := getWorkspace(r, r, false)
+	w.Copy(a)
+	s := getWorkspace(r, r, false)
+	s.Copy(a)
+	x := getWorkspace(r, r, false)
 	for n--; n > 0; n >>= 1 {
 		if n&1 != 0 {
-			x.Mul(&w, &s)
+			x.Mul(w, s)
 			w, x = x, w
 		}
 		if n != 1 {
-			x.Mul(&s, &s)
+			x.Mul(s, s)
 			s, x = x, s
 		}
 	}
-	m.Copy(&w)
+	m.Copy(w)
+	putWorkspace(w)
+	putWorkspace(s)
+	putWorkspace(x)
 }
 
 // Scale multiplies the elements of a by f, placing the result in the receiver.
diff --git a/mat64/pool.go b/mat64/pool.go
new file mode 100644
index 0000000..62faf6a
--- /dev/null
+++ b/mat64/pool.go
@@ -0,0 +1,80 @@
+// Copyright ©2014 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat64
+
+import (
+	"sync"
+
+	"github.com/gonum/blas/blas64"
+)
+
+var tab64 = [64]byte{
+	0x3f, 0x00, 0x3a, 0x01, 0x3b, 0x2f, 0x35, 0x02,
+	0x3c, 0x27, 0x30, 0x1b, 0x36, 0x21, 0x2a, 0x03,
+	0x3d, 0x33, 0x25, 0x28, 0x31, 0x12, 0x1c, 0x14,
+	0x37, 0x1e, 0x22, 0x0b, 0x2b, 0x0e, 0x16, 0x04,
+	0x3e, 0x39, 0x2e, 0x34, 0x26, 0x1a, 0x20, 0x29,
+	0x32, 0x24, 0x11, 0x13, 0x1d, 0x0a, 0x0d, 0x15,
+	0x38, 0x2d, 0x19, 0x1f, 0x23, 0x10, 0x09, 0x0c,
+	0x2c, 0x18, 0x0f, 0x08, 0x17, 0x07, 0x06, 0x05,
+}
+
+// bits returns the ceiling of base 2 log of v.
+// Approach based on http://stackoverflow.com/a/11398748.
+func bits(v uint64) byte {
+	if v == 0 {
+		return 0
+	}
+	v <<= 2
+	v--
+	v |= v >> 1
+	v |= v >> 2
+	v |= v >> 4
+	v |= v >> 8
+	v |= v >> 16
+	v |= v >> 32
+	return tab64[((v-(v>>1))*0x07EDD5E59A4E28C2)>>58] - 1
+}
+
+// pool contains size stratified workspace Dense pools.
+// Each pool element i returns sized matrices with a data
+// slice capped at 1<<i.
+var pool [63]sync.Pool
+
+func init() {
+	for i := range pool {
+		l := 1 << uint(i)
+		pool[i].New = func() interface{} {
+			return &Dense{mat: blas64.General{
+				Data: make([]float64, l),
+			}}
+		}
+	}
+}
+
+// getWorkspace returns a *Dense of size r×c and a data slice
+// with a cap that is less than 2*r*c. If clear is true, the
+// data slice visible through the Matrix interface is zeroed.
+func getWorkspace(r, c int, clear bool) *Dense {
+	l := uint64(r * c)
+	w := pool[bits(l)].Get().(*Dense)
+	w.mat.Data = w.mat.Data[:l]
+	if clear {
+		zero(w.mat.Data)
+	}
+	w.mat.Rows = r
+	w.mat.Cols = c
+	w.mat.Stride = c
+	w.capRows = r
+	w.capCols = c
+	return w
+}
+
+// putWorkspace replaces a used *Dense into the appropriate size
+// workspace pool. putWorkspace must not be called with a matrix
+// where references to the underlying data slice has been kept.
+func putWorkspace(w *Dense) {
+	pool[bits(uint64(cap(w.mat.Data)))].Put(w)
+}
diff --git a/mat64/pool_test.go b/mat64/pool_test.go
new file mode 100644
index 0000000..aa0e83d
--- /dev/null
+++ b/mat64/pool_test.go
@@ -0,0 +1,79 @@
+// Copyright ©2014 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package mat64
+
+import (
+	"math"
+	"math/rand"
+	"testing"
+
+	"gopkg.in/check.v1"
+)
+
+func (s *S) TestPool(c *check.C) {
+	for i := 1; i < 10; i++ {
+		for j := 1; j < 10; j++ {
+			var w, m *Dense
+			for k := 0; k < 5; k++ {
+				w = getWorkspace(i, j, true)
+				m = NewDense(i, j, nil)
+				c.Check(w.mat, check.DeepEquals, m.mat)
+				c.Check(w.capRows, check.DeepEquals, m.capRows)
+				c.Check(w.capCols, check.DeepEquals, m.capCols)
+				c.Check(cap(w.mat.Data) < 2*len(w.mat.Data), check.Equals, true, check.Commentf("r: %d c: %d -> len: %d cap: %d", i, j, len(w.mat.Data), cap(w.mat.Data)))
+			}
+			w.Set(0, 0, math.NaN())
+			for k := 0; k < 5; k++ {
+				putWorkspace(w)
+			}
+			for k := 0; k < 5; k++ {
+				w = getWorkspace(i, j, true)
+				m = NewDense(i, j, nil)
+				c.Check(w.mat, check.DeepEquals, m.mat)
+				c.Check(w.capRows, check.DeepEquals, m.capRows)
+				c.Check(w.capCols, check.DeepEquals, m.capCols)
+				c.Check(cap(w.mat.Data) < 2*len(w.mat.Data), check.Equals, true, check.Commentf("r: %d c: %d -> len: %d cap: %d", i, j, len(w.mat.Data), cap(w.mat.Data)))
+			}
+		}
+	}
+}
+
+var benchmat *Dense
+
+func poolBenchmark(n, r, c int, clear bool) {
+	for i := 0; i < n; i++ {
+		benchmat = getWorkspace(r, c, clear)
+		putWorkspace(benchmat)
+	}
+}
+
+func newBenchmark(n, r, c int) {
+	for i := 0; i < n; i++ {
+		benchmat = NewDense(r, c, nil)
+	}
+}
+
+func BenchmarkPool10by10Uncleared(b *testing.B)   { poolBenchmark(b.N, 10, 10, false) }
+func BenchmarkPool10by10Cleared(b *testing.B)     { poolBenchmark(b.N, 10, 10, true) }
+func BenchmarkNew10by10(b *testing.B)             { newBenchmark(b.N, 10, 10) }
+func BenchmarkPool100by100Uncleared(b *testing.B) { poolBenchmark(b.N, 100, 100, false) }
+func BenchmarkPool100by100Cleared(b *testing.B)   { poolBenchmark(b.N, 100, 100, true) }
+func BenchmarkNew100by100(b *testing.B)           { newBenchmark(b.N, 100, 100) }
+
+func BenchmarkMulWorkspaceDense100Half(b *testing.B)        { denseMulWorkspaceBench(b, 100, 0.5) }
+func BenchmarkMulWorkspaceDense100Tenth(b *testing.B)       { denseMulWorkspaceBench(b, 100, 0.1) }
+func BenchmarkMulWorkspaceDense1000Half(b *testing.B)       { denseMulWorkspaceBench(b, 1000, 0.5) }
+func BenchmarkMulWorkspaceDense1000Tenth(b *testing.B)      { denseMulWorkspaceBench(b, 1000, 0.1) }
+func BenchmarkMulWorkspaceDense1000Hundredth(b *testing.B)  { denseMulWorkspaceBench(b, 1000, 0.01) }
+func BenchmarkMulWorkspaceDense1000Thousandth(b *testing.B) { denseMulWorkspaceBench(b, 1000, 0.001) }
+func denseMulWorkspaceBench(b *testing.B, size int, rho float64) {
+	b.StopTimer()
+	a, _ := randDense(size, rho, rand.NormFloat64)
+	d, _ := randDense(size, rho, rand.NormFloat64)
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		a.Mul(a, d)
+	}
+}