combine memtables before flushing to L0

Taken from PR #1696, commit b21f591
dgraph-io · Feb 21, 2023 · cf76b23 · cf76b23
1 parent c65a8ac
commit cf76b23
Showing 1 changed file with 64 additions and 53 deletions.
diff --git a/db.go b/db.go
@@ -109,7 +109,7 @@ type DB struct {
 	lc        *levelsController
 	vlog      valueLog
 	writeCh   chan *request
-	flushChan chan flushTask // For flushing memtables.
+	flushChan chan *memTable // For flushing memtables.
 	closeOnce sync.Once      // For closing DB only once.
 
 	blockWrites int32
@@ -240,7 +240,7 @@ func Open(opt Options) (*DB, error) {
 
 	db := &DB{
 		imm:              make([]*memTable, 0, opt.NumMemtables),
-		flushChan:        make(chan flushTask, opt.NumMemtables),
+		flushChan:        make(chan *memTable, opt.NumMemtables),
 		writeCh:          make(chan *request, kvWriteChCapacity),
 		opt:              opt,
 		manifest:         manifestFile,
@@ -355,7 +355,7 @@ func Open(opt Options) (*DB, error) {
 		}()
 		// Flush them to disk asap.
 		for _, mt := range db.imm {
-			db.flushChan <- flushTask{mt: mt}
+			db.flushChan <- mt
 		}
 	}
 	// We do increment nextTxnTs below. So, no need to do it here.
@@ -568,12 +568,12 @@ func (db *DB) close() (err error) {
 		} else {
 			db.opt.Debugf("Flushing memtable")
 			for {
-				pushedFlushTask := func() bool {
+				pushedMemTable := func() bool {
 					db.lock.Lock()
 					defer db.lock.Unlock()
 					y.AssertTrue(db.mt != nil)
 					select {
-					case db.flushChan <- flushTask{mt: db.mt}:
+					case db.flushChan <- db.mt:
 						db.imm = append(db.imm, db.mt) // Flusher will attempt to remove this from s.imm.
 						db.mt = nil                    // Will segfault if we try writing!
 						db.opt.Debugf("pushed to flush chan\n")
@@ -586,7 +586,7 @@ func (db *DB) close() (err error) {
 					}
 					return false
 				}()
-				if pushedFlushTask {
+				if pushedMemTable {
 					break
 				}
 				time.Sleep(10 * time.Millisecond)
@@ -826,6 +826,7 @@ func (db *DB) writeRequests(reqs []*request) error {
 		}
 		count += len(b.Entries)
 		var i uint64
+		var err error
 		for err = db.ensureRoomForWrite(); err == errNoRoom; err = db.ensureRoomForWrite() {
 			i++
 			if i%100 == 0 {
@@ -987,7 +988,7 @@ func (db *DB) ensureRoomForWrite() error {
 	}
 
 	select {
-	case db.flushChan <- flushTask{mt: db.mt}:
+	case db.flushChan <- db.mt:
 		db.opt.Debugf("Flushing memtable, mt.size=%d size of flushChan: %d\n",
 			db.mt.sl.MemSize(), len(db.flushChan))
 		// We manage to push this task. Let's modify imm.
@@ -1009,12 +1010,12 @@ func arenaSize(opt Options) int64 {
 }
 
 // buildL0Table builds a new table from the memtable.
-func buildL0Table(ft flushTask, bopts table.Options) *table.Builder {
-	iter := ft.mt.sl.NewIterator()
+func buildL0Table(iter y.Iterator, dropPrefixes [][]byte, bopts table.Options) *table.Builder {
 	defer iter.Close()
+
 	b := table.NewTableBuilder(bopts)
-	for iter.SeekToFirst(); iter.Valid(); iter.Next() {
-		if len(ft.dropPrefixes) > 0 && hasAnyPrefixes(iter.Key(), ft.dropPrefixes) {
+	for iter.Rewind(); iter.Valid(); iter.Next() {
+		if len(dropPrefixes) > 0 && hasAnyPrefixes(iter.Key(), dropPrefixes) {
 			continue
 		}
 		vs := iter.Value()
@@ -1024,23 +1025,14 @@ func buildL0Table(ft flushTask, bopts table.Options) *table.Builder {
 		}
 		b.Add(iter.Key(), iter.Value(), vp.Len)
 	}
-	return b
-}
 
-type flushTask struct {
-	mt           *memTable
-	dropPrefixes [][]byte
+	return b
 }
 
-// handleFlushTask must be run serially.
-func (db *DB) handleFlushTask(ft flushTask) error {
-	// There can be a scenario, when empty memtable is flushed.
-	if ft.mt.sl.Empty() {
-		return nil
-	}
-
+// handleMemTableFlush must be run serially.
+func (db *DB) handleMemTableFlush(itr y.Iterator, dropPrefixes [][]byte) error {
 	bopts := buildTableOptions(db)
-	builder := buildL0Table(ft, bopts)
+	builder := buildL0Table(itr, nil, bopts)
 	defer builder.Close()
 
 	// buildL0Table can return nil if the none of the items in the skiplist are
@@ -1069,39 +1061,62 @@ func (db *DB) handleFlushTask(ft flushTask) error {
 	return err
 }
 
-// flushMemtable must keep running until we send it an empty flushTask. If there
-// are errors during handling the flush task, we'll retry indefinitely.
+// flushMemtable must keep running until we send it an empty memtable. If there
+// are errors during handling the memtable flush, we'll retry indefinitely.
 func (db *DB) flushMemtable(lc *z.Closer) error {
 	defer lc.Done()
 
-	for ft := range db.flushChan {
-		if ft.mt == nil {
-			// We close db.flushChan now, instead of sending a nil ft.mt.
-			continue
-		}
-		for {
-			err := db.handleFlushTask(ft)
-			if err == nil {
+	var sz int64
+	var itrs []y.Iterator
+	var mts []*memTable
+	for { //nolint:gosimple
+		select {
+		case mt, ok := <-db.flushChan:
+			if mt != nil {
+				itrs = append(itrs, mt.sl.NewUniIterator(false))
+				mts = append(mts, mt)
+				sz += mt.sl.MemSize()
+				if sz < db.opt.MemTableSize {
+					continue
+				}
+			}
+
+			if !ok && len(mts) == 0 {
+				return nil
+			}
+			if len(mts) == 0 {
+				continue
+			}
+
+			mitr := table.NewMergeIterator(itrs, false)
+			for {
+				if err := db.handleMemTableFlush(mitr, nil); err != nil {
+					// Encountered error. Retry indefinitely.
+					db.opt.Errorf("error flushing memtable to disk: %v, retrying", err)
+					time.Sleep(time.Second)
+					continue
+				}
+
 				// Update s.imm. Need a lock.
 				db.lock.Lock()
-				// This is a single-threaded operation. ft.mt corresponds to the head of
-				// db.imm list. Once we flush it, we advance db.imm. The next ft.mt
+				// This is a single-threaded operation. mt corresponds to the head of
+				// db.imm list. Once we flush it, we advance db.imm. The next mt
 				// which would arrive here would match db.imm[0], because we acquire a
 				// lock over DB when pushing to flushChan.
 				// TODO: This logic is dirty AF. Any change and this could easily break.
-				y.AssertTrue(ft.mt == db.imm[0])
-				db.imm = db.imm[1:]
-				ft.mt.DecrRef() // Return memory.
+				for _, mt := range mts {
+					y.AssertTrue(mt == db.imm[0])
+					db.imm = db.imm[1:]
+					mt.DecrRef() // Return memory.
+				}
 				db.lock.Unlock()
-
 				break
 			}
-			// Encountered error. Retry indefinitely.
-			db.opt.Errorf("Failure while flushing memtable to disk: %v. Retrying...\n", err)
-			time.Sleep(time.Second)
+
+			// Reset everything.
+			itrs, mts, sz = itrs[:0], mts[:0], 0
 		}
 	}
-	return nil
 }
 
 func exists(path string) (bool, error) {
@@ -1521,7 +1536,7 @@ func (db *DB) startCompactions() {
 func (db *DB) startMemoryFlush() {
 	// Start memory fluhser.
 	if db.closers.memtable != nil {
-		db.flushChan = make(chan flushTask, db.opt.NumMemtables)
+		db.flushChan = make(chan *memTable, db.opt.NumMemtables)
 		db.closers.memtable = z.NewCloser(1)
 		go func() {
 			_ = db.flushMemtable(db.closers.memtable)
@@ -1627,7 +1642,7 @@ func (db *DB) prepareToDrop() (func(), error) {
 		panic("Attempting to drop data in read-only mode.")
 	}
 	// In order prepare for drop, we need to block the incoming writes and
-	// write it to db. Then, flush all the pending flushtask. So that, we
+	// write it to db. Then, flush all the pending memtable. So that, we
 	// don't miss any entries.
 	if err := db.blockWrite(); err != nil {
 		return nil, err
@@ -1676,7 +1691,7 @@ func (db *DB) dropAll() (func(), error) {
 	if err != nil {
 		return f, err
 	}
-	// prepareToDrop will stop all the incomming write and flushes any pending flush tasks.
+	// prepareToDrop will stop all the incomming write and flushes any pending memtables.
 	// Before we drop, we'll stop the compaction because anyways all the datas are going to
 	// be deleted.
 	db.stopCompactions()
@@ -1758,13 +1773,9 @@ func (db *DB) DropPrefix(prefixes ...[]byte) error {
 			memtable.DecrRef()
 			continue
 		}
-		task := flushTask{
-			mt: memtable,
-			// Ensure that the head of value log gets persisted to disk.
-			dropPrefixes: filtered,
-		}
+		itr := memtable.sl.NewUniIterator(false)
 		db.opt.Debugf("Flushing memtable")
-		if err := db.handleFlushTask(task); err != nil {
+		if err := db.handleMemTableFlush(itr, filtered); err != nil {
 			db.opt.Errorf("While trying to flush memtable: %v", err)
 			return err
 		}