Skip to content

Commit

Permalink
combine memtables before flushing to L0
Browse files Browse the repository at this point in the history
Taken from PR #1696, commit b21f591
  • Loading branch information
mangalaman93 committed Feb 14, 2023
1 parent 5eb141c commit a1e71f3
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 32 deletions.
102 changes: 71 additions & 31 deletions db.go
Original file line number Diff line number Diff line change
Expand Up @@ -758,13 +758,6 @@ var requestPool = sync.Pool{
}

func (db *DB) writeToLSM(b *request) error {
// We should check the length of b.Prts and b.Entries only when badger is not
// running in InMemory mode. In InMemory mode, we don't write anything to the
// value log and that's why the length of b.Ptrs will always be zero.
if !db.opt.InMemory && len(b.Ptrs) != len(b.Entries) {
return errors.Errorf("Ptrs and Entries don't match: %+v", b)
}

for i, entry := range b.Entries {
var err error
if entry.skipVlogAndSetThreshold(db.valueThreshold()) {
Expand Down Expand Up @@ -829,6 +822,7 @@ func (db *DB) writeRequests(reqs []*request) error {
}
count += len(b.Entries)
var i uint64
var err error
for err = db.ensureRoomForWrite(); err == errNoRoom; err = db.ensureRoomForWrite() {
i++
if i%100 == 0 {
Expand Down Expand Up @@ -945,7 +939,8 @@ func (db *DB) doWrites(lc *z.Closer) {

// batchSet applies a list of badger.Entry. If a request level error occurs it
// will be returned.
// Check(kv.BatchSet(entries))
//
// Check(kv.BatchSet(entries))
func (db *DB) batchSet(entries []*Entry) error {
req, err := db.sendToWriteCh(entries)
if err != nil {
Expand All @@ -958,9 +953,10 @@ func (db *DB) batchSet(entries []*Entry) error {
// batchSetAsync is the asynchronous version of batchSet. It accepts a callback
// function which is called when all the sets are complete. If a request level
// error occurs, it will be passed back via the callback.
// err := kv.BatchSetAsync(entries, func(err error)) {
// Check(err)
// }
//
// err := kv.BatchSetAsync(entries, func(err error)) {
// Check(err)
// }
func (db *DB) batchSetAsync(entries []*Entry, f func(error)) error {
req, err := db.sendToWriteCh(entries)
if err != nil {
Expand Down Expand Up @@ -1011,10 +1007,16 @@ func arenaSize(opt Options) int64 {

// buildL0Table builds a new table from the memtable.
func buildL0Table(ft flushTask, bopts table.Options) *table.Builder {
iter := ft.mt.sl.NewIterator()
var iter y.Iterator
if ft.itr != nil {
iter = ft.itr
} else {
iter = ft.mt.sl.NewUniIterator(false)
}
defer iter.Close()

b := table.NewTableBuilder(bopts)
for iter.SeekToFirst(); iter.Valid(); iter.Next() {
for iter.Rewind(); iter.Valid(); iter.Next() {
if len(ft.dropPrefixes) > 0 && hasAnyPrefixes(iter.Key(), ft.dropPrefixes) {
continue
}
Expand All @@ -1030,16 +1032,13 @@ func buildL0Table(ft flushTask, bopts table.Options) *table.Builder {

// flushTask is one unit of work for the memtable-flush goroutine.
// Either mt or itr is the valid field: mt may be nil when itr is set,
// in which case itr (a merge iterator over several memtables) is used
// to build the L0 table instead of mt's skiplist.
type flushTask struct {
mt *memTable
// itr, when non-nil, is iterated in place of mt.sl while building the L0 table.
itr y.Iterator
// dropPrefixes lists key prefixes whose entries are skipped during the flush.
dropPrefixes [][]byte
}

// handleFlushTask must be run serially.
func (db *DB) handleFlushTask(ft flushTask) error {
// There can be a scenario, when empty memtable is flushed.
if ft.mt.sl.Empty() {
return nil
}

// ft.mt could be nil with ft.itr being the valid field.
bopts := buildTableOptions(db)
builder := buildL0Table(ft, bopts)
defer builder.Close()
Expand Down Expand Up @@ -1075,11 +1074,48 @@ func (db *DB) handleFlushTask(ft flushTask) error {
func (db *DB) flushMemtable(lc *z.Closer) error {
defer lc.Done()

var sz int64
var itrs []y.Iterator
var mts []*memTable
slurp := func() {
for {
select {
case more := <-db.flushChan:
if more.mt == nil {
return
}
sl := more.mt.sl
itrs = append(itrs, sl.NewUniIterator(false))
mts = append(mts, more.mt)

sz += sl.MemSize()
if sz > db.opt.MemTableSize {
return
}
default:
return
}
}
}

for ft := range db.flushChan {
if ft.mt == nil {
// We close db.flushChan now, instead of sending a nil ft.mt.
continue
}
sz = ft.mt.sl.MemSize()
// Reset of itrs, mts etc. is being done below.
y.AssertTrue(len(itrs) == 0 && len(mts) == 0)
itrs = append(itrs, ft.mt.sl.NewUniIterator(false))
mts = append(mts, ft.mt)

// Pick more memtables, so we can really fill up the L0 table.
slurp()

// db.opt.Infof("Picked %d memtables. Size: %d\n", len(itrs), sz)
ft.mt = nil
ft.itr = table.NewMergeIterator(itrs, false)

for {
err := db.handleFlushTask(ft)
if err == nil {
Expand All @@ -1090,9 +1126,11 @@ func (db *DB) flushMemtable(lc *z.Closer) error {
// which would arrive here would match db.imm[0], because we acquire a
// lock over DB when pushing to flushChan.
// TODO: This logic is dirty AF. Any change and this could easily break.
y.AssertTrue(ft.mt == db.imm[0])
db.imm = db.imm[1:]
ft.mt.DecrRef() // Return memory.
for _, mt := range mts {
y.AssertTrue(mt == db.imm[0])
db.imm = db.imm[1:]
mt.DecrRef() // Return memory.
}
db.lock.Unlock()

break
Expand All @@ -1101,6 +1139,8 @@ func (db *DB) flushMemtable(lc *z.Closer) error {
db.opt.Errorf("Failure while flushing memtable to disk: %v. Retrying...\n", err)
time.Sleep(time.Second)
}
// Reset everything.
itrs, mts, sz = itrs[:0], mts[:0], 0
}
return nil
}
Expand Down Expand Up @@ -1719,16 +1759,16 @@ func (db *DB) dropAll() (func(), error) {
}

// DropPrefix would drop all the keys with the provided prefix. It does this in the following way:
// - Stop accepting new writes.
// - Stop memtable flushes before acquiring the lock, because we're acquiring the lock here
// and a memtable flush stalls waiting for the lock, which leads to deadlock.
// - Flush out all memtables, skipping over keys with the given prefix, Kp.
// - Write out the value log header to memtables when flushing, so we don't accidentally bring Kp
// back after a restart.
// - Stop compaction.
// - Compact L0->L1, skipping over Kp.
// - Compact rest of the levels, Li->Li, picking tables which have Kp.
// - Resume memtable flushes, compactions and writes.
// - Stop accepting new writes.
// - Stop memtable flushes before acquiring the lock, because we're acquiring the lock here
// and a memtable flush stalls waiting for the lock, which leads to deadlock.
// - Flush out all memtables, skipping over keys with the given prefix, Kp.
// - Write out the value log header to memtables when flushing, so we don't accidentally bring Kp
// back after a restart.
// - Stop compaction.
// - Compact L0->L1, skipping over Kp.
// - Compact rest of the levels, Li->Li, picking tables which have Kp.
// - Resume memtable flushes, compactions and writes.
func (db *DB) DropPrefix(prefixes ...[]byte) error {
if len(prefixes) == 0 {
return nil
Expand Down
2 changes: 1 addition & 1 deletion options.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ func DefaultOptions(path string) Options {
NumCompactors: 4, // Run at least 2 compactors. Zero-th compactor prioritizes L0.
NumLevelZeroTables: 5,
NumLevelZeroTablesStall: 15,
NumMemtables: 5,
NumMemtables: 15,
BloomFalsePositive: 0.01,
BlockSize: 4 * 1024,
SyncWrites: false,
Expand Down

0 comments on commit a1e71f3

Please sign in to comment.