diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json index 702fa399012..f8265075dff 100644 --- a/Godeps/Godeps.json +++ b/Godeps/Godeps.json @@ -24,8 +24,8 @@ }, { "ImportPath": "github.com/boltdb/bolt", - "Comment": "v1.0-119-g90fef38", - "Rev": "90fef389f98027ca55594edd7dbd6e7f3926fdad" + "Comment": "v1.0-158-g81db894", + "Rev": "81db89446cb805bc352f803151f47fea849241e2" }, { "ImportPath": "github.com/bradfitz/http2", diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/README.md b/Godeps/_workspace/src/github.com/boltdb/bolt/README.md index 00fad6afb8b..80353ab555f 100644 --- a/Godeps/_workspace/src/github.com/boltdb/bolt/README.md +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/README.md @@ -1,8 +1,8 @@ Bolt [![Build Status](https://drone.io/github.com/boltdb/bolt/status.png)](https://drone.io/github.com/boltdb/bolt/latest) [![Coverage Status](https://coveralls.io/repos/boltdb/bolt/badge.png?branch=master)](https://coveralls.io/r/boltdb/bolt?branch=master) [![GoDoc](https://godoc.org/github.com/boltdb/bolt?status.png)](https://godoc.org/github.com/boltdb/bolt) ![Version](http://img.shields.io/badge/version-1.0-green.png) ==== -Bolt is a pure Go key/value store inspired by [Howard Chu's][hyc_symas] and -the [LMDB project][lmdb]. The goal of the project is to provide a simple, +Bolt is a pure Go key/value store inspired by [Howard Chu's][hyc_symas] +[LMDB project][lmdb]. The goal of the project is to provide a simple, fast, and reliable database for projects that don't require a full database server such as Postgres or MySQL. @@ -180,8 +180,8 @@ and then safely close your transaction if an error is returned. This is the recommended way to use Bolt transactions. However, sometimes you may want to manually start and end your transactions. -You can use the `Tx.Begin()` function directly but _please_ be sure to close the -transaction. +You can use the `Tx.Begin()` function directly but **please** be sure to close +the transaction. ```go // Start a writable transaction. @@ -256,7 +256,7 @@ db.View(func(tx *bolt.Tx) error { ``` The `Get()` function does not return an error because its operation is -guarenteed to work (unless there is some kind of system failure). If the key +guaranteed to work (unless there is some kind of system failure). If the key exists then it will return its byte slice value. If it doesn't exist then it will return `nil`. It's important to note that you can have a zero-length value set to a key which is different than the key not existing. @@ -268,6 +268,50 @@ transaction is open. If you need to use a value outside of the transaction then you must use `copy()` to copy it to another byte slice. +### Autoincrementing integer for the bucket +By using the NextSequence() function, you can let Bolt determine a sequence +which can be used as the unique identifier for your key/value pairs. See the +example below. + +```go +// CreateUser saves u to the store. The new user ID is set on u once the data is persisted. +func (s *Store) CreateUser(u *User) error { + return s.db.Update(func(tx *bolt.Tx) error { + // Retrieve the users bucket. + // This should be created when the DB is first opened. + b := tx.Bucket([]byte("users")) + + // Generate ID for the user. + // This returns an error only if the Tx is closed or not writeable. + // That can't happen in an Update() call so I ignore the error check. + id, _ = b.NextSequence() + u.ID = int(id) + + // Marshal user data into bytes. + buf, err := json.Marshal(u) + if err != nil { + return err + } + + // Persist bytes to users bucket. + return b.Put(itob(u.ID), buf) + }) +} + +// itob returns an 8-byte big endian representation of v. +func itob(v int) []byte { + b := make([]byte, 8) + binary.BigEndian.PutUint64(b, uint64(v)) + return b +} + +type User struct { + ID int + ... +} + +``` + ### Iterating over keys Bolt stores its keys in byte-sorted order within a bucket. This makes sequential @@ -382,8 +426,11 @@ func (*Bucket) DeleteBucket(key []byte) error Bolt is a single file so it's easy to backup. You can use the `Tx.WriteTo()` function to write a consistent view of the database to a writer. If you call this from a read-only transaction, it will perform a hot backup and not block -your other database reads and writes. It will also use `O_DIRECT` when available -to prevent page cache trashing. +your other database reads and writes. + +By default, it will use a regular file handle which will utilize the operating +system's page cache. See the [`Tx`](https://godoc.org/github.com/boltdb/bolt#Tx) +documentation for information about optimizing for larger-than-RAM datasets. One common use case is to backup over HTTP so you can use tools like `cURL` to do database backups: @@ -500,7 +547,7 @@ they are libraries bundled into the application, however, their underlying structure is a log-structured merge-tree (LSM tree). An LSM tree optimizes random writes by using a write ahead log and multi-tiered, sorted files called SSTables. Bolt uses a B+tree internally and only a single file. Both approaches -have trade offs. +have trade-offs. If you require a high random write throughput (>10,000 w/sec) or you need to use spinning disks then LevelDB could be a good choice. If your application is @@ -568,7 +615,9 @@ Here are a few things to note when evaluating and using Bolt: can in memory and will release memory as needed to other processes. This means that Bolt can show very high memory usage when working with large databases. However, this is expected and the OS will release memory as needed. Bolt can - handle databases much larger than the available physical RAM. + handle databases much larger than the available physical RAM, provided its + memory-map fits in the process virtual address space. It may be problematic + on 32-bits systems. * The data structures in the Bolt database are memory mapped so the data file will be endian specific. This means that you cannot copy a Bolt file from a @@ -587,6 +636,56 @@ Here are a few things to note when evaluating and using Bolt: [page-allocation]: https://github.com/boltdb/bolt/issues/308#issuecomment-74811638 +## Reading the Source + +Bolt is a relatively small code base (<3KLOC) for an embedded, serializable, +transactional key/value database so it can be a good starting point for people +interested in how databases work. + +The best places to start are the main entry points into Bolt: + +- `Open()` - Initializes the reference to the database. It's responsible for + creating the database if it doesn't exist, obtaining an exclusive lock on the + file, reading the meta pages, & memory-mapping the file. + +- `DB.Begin()` - Starts a read-only or read-write transaction depending on the + value of the `writable` argument. This requires briefly obtaining the "meta" + lock to keep track of open transactions. Only one read-write transaction can + exist at a time so the "rwlock" is acquired during the life of a read-write + transaction. + +- `Bucket.Put()` - Writes a key/value pair into a bucket. After validating the + arguments, a cursor is used to traverse the B+tree to the page and position + where they key & value will be written. Once the position is found, the bucket + materializes the underlying page and the page's parent pages into memory as + "nodes". These nodes are where mutations occur during read-write transactions. + These changes get flushed to disk during commit. + +- `Bucket.Get()` - Retrieves a key/value pair from a bucket. This uses a cursor + to move to the page & position of a key/value pair. During a read-only + transaction, the key and value data is returned as a direct reference to the + underlying mmap file so there's no allocation overhead. For read-write + transactions, this data may reference the mmap file or one of the in-memory + node values. + +- `Cursor` - This object is simply for traversing the B+tree of on-disk pages + or in-memory nodes. It can seek to a specific key, move to the first or last + value, or it can move forward or backward. The cursor handles the movement up + and down the B+tree transparently to the end user. + +- `Tx.Commit()` - Converts the in-memory dirty nodes and the list of free pages + into pages to be written to disk. Writing to disk then occurs in two phases. + First, the dirty pages are written to disk and an `fsync()` occurs. Second, a + new meta page with an incremented transaction ID is written and another + `fsync()` occurs. This two phase write ensures that partially written data + pages are ignored in the event of a crash since the meta page pointing to them + is never written. Partially written meta pages are invalidated because they + are written with a checksum. + +If you have additional notes that could be helpful for others, please submit +them via pull request. + + ## Other Projects Using Bolt Below is a list of public, open source projects that use Bolt: @@ -615,7 +714,11 @@ Below is a list of public, open source projects that use Bolt: * [Freehold](http://tshannon.bitbucket.org/freehold/) - An open, secure, and lightweight platform for your files and data. * [Prometheus Annotation Server](https://github.com/oliver006/prom_annotation_server) - Annotation server for PromDash & Prometheus service monitoring system. * [Consul](https://github.com/hashicorp/consul) - Consul is service discovery and configuration made easy. Distributed, highly available, and datacenter-aware. -* [Kala](https://github.com/ajvb/kala) - Kala is a modern job scheduler optimized to run on a single node. It is persistant, JSON over HTTP API, ISO 8601 duration notation, and dependent jobs. +* [Kala](https://github.com/ajvb/kala) - Kala is a modern job scheduler optimized to run on a single node. It is persistent, JSON over HTTP API, ISO 8601 duration notation, and dependent jobs. * [drive](https://github.com/odeke-em/drive) - drive is an unofficial Google Drive command line client for \*NIX operating systems. +* [stow](https://github.com/djherbis/stow) - a persistence manager for objects + backed by boltdb. +* [buckets](https://github.com/joyrexus/buckets) - a bolt wrapper streamlining + simple tx and key scans. If you are using Bolt in a project please send a pull request to add it to the list. diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_arm64.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_arm64.go new file mode 100644 index 00000000000..6d2309352e0 --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_arm64.go @@ -0,0 +1,9 @@ +// +build arm64 + +package bolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0xFFFFFFFFFFFF // 256TB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0x7FFFFFFF diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_linux.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_linux.go index e9d1c907b63..2b676661409 100644 --- a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_linux.go +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_linux.go @@ -4,8 +4,6 @@ import ( "syscall" ) -var odirect = syscall.O_DIRECT - // fdatasync flushes written data to a file descriptor. func fdatasync(db *DB) error { return syscall.Fdatasync(int(db.file.Fd())) diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_openbsd.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_openbsd.go index 7c1bef1a4f4..7058c3d734e 100644 --- a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_openbsd.go +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_openbsd.go @@ -11,8 +11,6 @@ const ( msInvalidate // invalidate cached data ) -var odirect int - func msync(db *DB) error { _, _, errno := syscall.Syscall(syscall.SYS_MSYNC, uintptr(unsafe.Pointer(db.data)), uintptr(db.datasz), msInvalidate) if errno != 0 { diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_ppc64le.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_ppc64le.go new file mode 100644 index 00000000000..8351e129f6a --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_ppc64le.go @@ -0,0 +1,9 @@ +// +build ppc64le + +package bolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0xFFFFFFFFFFFF // 256TB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0x7FFFFFFF diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_s390x.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_s390x.go new file mode 100644 index 00000000000..f4dd26bbba7 --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_s390x.go @@ -0,0 +1,9 @@ +// +build s390x + +package bolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0xFFFFFFFFFFFF // 256TB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0x7FFFFFFF diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_windows.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_windows.go index 8b782be5f9e..91c4968f6a1 100644 --- a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_windows.go +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_windows.go @@ -8,7 +8,37 @@ import ( "unsafe" ) -var odirect int +// LockFileEx code derived from golang build filemutex_windows.go @ v1.5.1 +var ( + modkernel32 = syscall.NewLazyDLL("kernel32.dll") + procLockFileEx = modkernel32.NewProc("LockFileEx") + procUnlockFileEx = modkernel32.NewProc("UnlockFileEx") +) + +const ( + // see https://msdn.microsoft.com/en-us/library/windows/desktop/aa365203(v=vs.85).aspx + flagLockExclusive = 2 + flagLockFailImmediately = 1 + + // see https://msdn.microsoft.com/en-us/library/windows/desktop/ms681382(v=vs.85).aspx + errLockViolation syscall.Errno = 0x21 +) + +func lockFileEx(h syscall.Handle, flags, reserved, locklow, lockhigh uint32, ol *syscall.Overlapped) (err error) { + r, _, err := procLockFileEx.Call(uintptr(h), uintptr(flags), uintptr(reserved), uintptr(locklow), uintptr(lockhigh), uintptr(unsafe.Pointer(ol))) + if r == 0 { + return err + } + return nil +} + +func unlockFileEx(h syscall.Handle, reserved, locklow, lockhigh uint32, ol *syscall.Overlapped) (err error) { + r, _, err := procUnlockFileEx.Call(uintptr(h), uintptr(reserved), uintptr(locklow), uintptr(lockhigh), uintptr(unsafe.Pointer(ol)), 0) + if r == 0 { + return err + } + return nil +} // fdatasync flushes written data to a file descriptor. func fdatasync(db *DB) error { @@ -16,13 +46,37 @@ func fdatasync(db *DB) error { } // flock acquires an advisory lock on a file descriptor. -func flock(f *os.File, _ bool, _ time.Duration) error { - return nil +func flock(f *os.File, exclusive bool, timeout time.Duration) error { + var t time.Time + for { + // If we're beyond our timeout then return an error. + // This can only occur after we've attempted a flock once. + if t.IsZero() { + t = time.Now() + } else if timeout > 0 && time.Since(t) > timeout { + return ErrTimeout + } + + var flag uint32 = flagLockFailImmediately + if exclusive { + flag |= flagLockExclusive + } + + err := lockFileEx(syscall.Handle(f.Fd()), flag, 0, 1, 0, &syscall.Overlapped{}) + if err == nil { + return nil + } else if err != errLockViolation { + return err + } + + // Wait for a bit and try again. + time.Sleep(50 * time.Millisecond) + } } // funlock releases an advisory lock on a file descriptor. func funlock(f *os.File) error { - return nil + return unlockFileEx(syscall.Handle(f.Fd()), 0, 1, 0, &syscall.Overlapped{}) } // mmap memory maps a DB's data file. diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/boltsync_unix.go b/Godeps/_workspace/src/github.com/boltdb/bolt/boltsync_unix.go index 8db89776fe6..f50442523c3 100644 --- a/Godeps/_workspace/src/github.com/boltdb/bolt/boltsync_unix.go +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/boltsync_unix.go @@ -2,8 +2,6 @@ package bolt -var odirect int - // fdatasync flushes written data to a file descriptor. func fdatasync(db *DB) error { return db.file.Sync() diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bucket.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bucket.go index 6766992100f..d2f8c524e42 100644 --- a/Godeps/_workspace/src/github.com/boltdb/bolt/bucket.go +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bucket.go @@ -11,7 +11,7 @@ const ( MaxKeySize = 32768 // MaxValueSize is the maximum length of a value, in bytes. - MaxValueSize = 4294967295 + MaxValueSize = (1 << 31) - 2 ) const ( @@ -99,6 +99,7 @@ func (b *Bucket) Cursor() *Cursor { // Bucket retrieves a nested bucket by name. // Returns nil if the bucket does not exist. +// The bucket instance is only valid for the lifetime of the transaction. func (b *Bucket) Bucket(name []byte) *Bucket { if b.buckets != nil { if child := b.buckets[string(name)]; child != nil { @@ -148,6 +149,7 @@ func (b *Bucket) openBucket(value []byte) *Bucket { // CreateBucket creates a new bucket at the given key and returns the new bucket. // Returns an error if the key already exists, if the bucket name is blank, or if the bucket name is too long. +// The bucket instance is only valid for the lifetime of the transaction. func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) { if b.tx.db == nil { return nil, ErrTxClosed @@ -192,6 +194,7 @@ func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) { // CreateBucketIfNotExists creates a new bucket if it doesn't already exist and returns a reference to it. // Returns an error if the bucket name is blank, or if the bucket name is too long. +// The bucket instance is only valid for the lifetime of the transaction. func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) { child, err := b.CreateBucket(key) if err == ErrBucketExists { @@ -270,6 +273,7 @@ func (b *Bucket) Get(key []byte) []byte { // Put sets the value for a key in the bucket. // If the key exist then its previous value will be overwritten. +// Supplied value must remain valid for the life of the transaction. // Returns an error if the bucket was created from a read-only transaction, if the key is blank, if the key is too large, or if the value is too large. func (b *Bucket) Put(key []byte, value []byte) error { if b.tx.db == nil { @@ -346,7 +350,8 @@ func (b *Bucket) NextSequence() (uint64, error) { // ForEach executes a function for each key/value pair in a bucket. // If the provided function returns an error then the iteration is stopped and -// the error is returned to the caller. +// the error is returned to the caller. The provided function must not modify +// the bucket; this will result in undefined behavior. func (b *Bucket) ForEach(fn func(k, v []byte) error) error { if b.tx.db == nil { return ErrTxClosed diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bucket_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bucket_test.go index 99292b45885..a68a4d615ba 100644 --- a/Godeps/_workspace/src/github.com/boltdb/bolt/bucket_test.go +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bucket_test.go @@ -253,7 +253,7 @@ func TestBucket_Delete_FreelistOverflow(t *testing.T) { b := tx.Bucket([]byte("0")) c := b.Cursor() for k, _ := c.First(); k != nil; k, _ = c.Next() { - b.Delete(k) + c.Delete() } return nil }) diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/db_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/db_test.go index 2b1566a4ebe..16f935ec2e5 100644 --- a/Godeps/_workspace/src/github.com/boltdb/bolt/db_test.go +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/db_test.go @@ -39,9 +39,6 @@ func TestOpen(t *testing.T) { // Ensure that opening an already open database file will timeout. func TestOpen_Timeout(t *testing.T) { - if runtime.GOOS == "windows" { - t.Skip("timeout not supported on windows") - } if runtime.GOOS == "solaris" { t.Skip("solaris fcntl locks don't support intra-process locking") } @@ -66,9 +63,6 @@ func TestOpen_Timeout(t *testing.T) { // Ensure that opening an already open database file will wait until its closed. func TestOpen_Wait(t *testing.T) { - if runtime.GOOS == "windows" { - t.Skip("timeout not supported on windows") - } if runtime.GOOS == "solaris" { t.Skip("solaris fcntl locks don't support intra-process locking") } @@ -622,7 +616,7 @@ func TestDB_Consistency(t *testing.T) { }) } -// Ensure that DB stats can be substracted from one another. +// Ensure that DB stats can be subtracted from one another. func TestDBStats_Sub(t *testing.T) { var a, b bolt.Stats a.TxStats.PageCount = 3 diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/tx.go b/Godeps/_workspace/src/github.com/boltdb/bolt/tx.go index 6b52b2c8964..3273106e1c5 100644 --- a/Godeps/_workspace/src/github.com/boltdb/bolt/tx.go +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/tx.go @@ -29,6 +29,14 @@ type Tx struct { pages map[pgid]*page stats TxStats commitHandlers []func() + + // WriteFlag specifies the flag for write-related methods like WriteTo(). + // Tx opens the database file with the specified flag to copy the data. + // + // By default, the flag is unset, which works well for mostly in-memory + // workloads. For databases that are much larger than available RAM, + // set the flag to syscall.O_DIRECT to avoid trashing the page cache. + WriteFlag int } // init initializes the transaction. @@ -87,18 +95,21 @@ func (tx *Tx) Stats() TxStats { // Bucket retrieves a bucket by name. // Returns nil if the bucket does not exist. +// The bucket instance is only valid for the lifetime of the transaction. func (tx *Tx) Bucket(name []byte) *Bucket { return tx.root.Bucket(name) } // CreateBucket creates a new bucket. // Returns an error if the bucket already exists, if the bucket name is blank, or if the bucket name is too long. +// The bucket instance is only valid for the lifetime of the transaction. func (tx *Tx) CreateBucket(name []byte) (*Bucket, error) { return tx.root.CreateBucket(name) } // CreateBucketIfNotExists creates a new bucket if it doesn't already exist. // Returns an error if the bucket name is blank, or if the bucket name is too long. +// The bucket instance is only valid for the lifetime of the transaction. func (tx *Tx) CreateBucketIfNotExists(name []byte) (*Bucket, error) { return tx.root.CreateBucketIfNotExists(name) } @@ -236,7 +247,8 @@ func (tx *Tx) close() { var freelistPendingN = tx.db.freelist.pending_count() var freelistAlloc = tx.db.freelist.size() - // Remove writer lock. + // Remove transaction ref & writer lock. + tx.db.rwtx = nil tx.db.rwlock.Unlock() // Merge statistics. @@ -250,7 +262,12 @@ func (tx *Tx) close() { } else { tx.db.removeTx(tx) } + + // Clear all references. tx.db = nil + tx.meta = nil + tx.root = Bucket{tx: tx} + tx.pages = nil } // Copy writes the entire database to a writer. @@ -263,21 +280,18 @@ func (tx *Tx) Copy(w io.Writer) error { // WriteTo writes the entire database to a writer. // If err == nil then exactly tx.Size() bytes will be written into the writer. func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) { - // Attempt to open reader directly. - var f *os.File - if f, err = os.OpenFile(tx.db.path, os.O_RDONLY|odirect, 0); err != nil { - // Fallback to a regular open if that doesn't work. - if f, err = os.OpenFile(tx.db.path, os.O_RDONLY, 0); err != nil { - return 0, err - } + // Attempt to open reader with WriteFlag + f, err := os.OpenFile(tx.db.path, os.O_RDONLY|tx.WriteFlag, 0) + if err != nil { + return 0, err } + defer f.Close() // Copy the meta pages. tx.db.metalock.Lock() n, err = io.CopyN(w, f, int64(tx.db.pageSize*2)) tx.db.metalock.Unlock() if err != nil { - _ = f.Close() return n, fmt.Errorf("meta copy: %s", err) } @@ -285,7 +299,6 @@ func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) { wn, err := io.CopyN(w, f, tx.Size()-int64(tx.db.pageSize*2)) n += wn if err != nil { - _ = f.Close() return n, err }