Skip to content

Commit

Permalink
MB-51761: indexer blocked during storage warmup on MOI storage, causi…
Browse files Browse the repository at this point in the history
…ng rebalance failure.

RCA: with MOI indexes if loading snapshot during bootstrap takes time indexer main loop blocks on MsgUpdateSnapMap which is loading the snapshot from disk synchronusly.
Fix: specifically for the MOI load a snapshot in goroutine during bootstrap phase, this avoids the indexer main loop from blocking and rebalance does not fail.
Change-Id: I83ca2a1ead4f3717efb32a5d963d197110e01eeb
  • Loading branch information
Yogendra Acharya committed Apr 21, 2022
1 parent 45d60b6 commit 26b4c5b
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 10 deletions.
29 changes: 22 additions & 7 deletions secondary/indexer/indexer.go
Expand Up @@ -7538,13 +7538,28 @@ func (idx *indexer) initFromPersistedState() error {
continue
}

idx.internalRecvCh <- &MsgUpdateSnapMap{
idxInstId: inst.InstId,
idxInst: inst,
partnMap: partnInstMap,
streamId: common.ALL_STREAMS,
//TODO Collections verify this will work
keyspaceId: "",
if common.GetStorageMode() == common.MOI {
respCh := make(chan bool)

idx.internalRecvCh <- &MsgUpdateSnapMap{
idxInstId: inst.InstId,
idxInst: inst,
partnMap: partnInstMap,
streamId: common.ALL_STREAMS,
keyspaceId: "",
respch: respCh,
}
<-respCh

} else {
idx.internalRecvCh <- &MsgUpdateSnapMap{
idxInstId: inst.InstId,
idxInst: inst,
partnMap: partnInstMap,
streamId: common.ALL_STREAMS,
//TODO Collections verify this will work
keyspaceId: "",
}
}

idx.initializeBootstrapStats(bootstrapStats, inst.InstId)
Expand Down
4 changes: 4 additions & 0 deletions secondary/indexer/message.go
Expand Up @@ -1816,6 +1816,7 @@ type MsgUpdateSnapMap struct {
partnMap PartitionInstMap
streamId common.StreamId
keyspaceId string
respch chan bool
}

func (m *MsgUpdateSnapMap) GetMsgType() MsgType {
Expand All @@ -1838,6 +1839,9 @@ func (m *MsgUpdateSnapMap) GetStreamId() common.StreamId {
func (m *MsgUpdateSnapMap) GetKeyspaceId() string {
return m.keyspaceId
}
func (m *MsgUpdateSnapMap) GetReplyChannel() chan bool {
return m.respch
}

type MsgIndexStorageStats struct {
respch chan []IndexStorageStats
Expand Down
21 changes: 18 additions & 3 deletions secondary/indexer/storage_manager.go
Expand Up @@ -2144,11 +2144,26 @@ func (s *storageMgr) handleUpdateIndexSnapMapForIndex(cmd Message) {
partnMap := req.GetPartnMap()
streamId := req.GetStreamId()
keyspaceId := req.GetKeyspaceId()
replyCh := req.GetReplyChannel()

s.muSnap.Lock()
s.updateIndexSnapMapForIndex(idxInstId, idxInst, partnMap, streamId, keyspaceId)
s.muSnap.Unlock()
f := func() {
s.muSnap.Lock()
s.updateIndexSnapMapForIndex(idxInstId, idxInst, partnMap, streamId, keyspaceId)
s.muSnap.Unlock()
if replyCh != nil {
replyCh <- true
}
}

if replyCh != nil && (common.GetStorageMode() == common.MOI) {
// make updateIndexSnapMapForIndex async for MOI during bootstrap phase
// because it tries to load index from disk which can take lot of time (sometimes in 10s of mins)
// this will cause indexer to block and introduce failures such as rebalance failure.
// replyChan is only set during bootstrap phase.
go f()
} else {
f()
}
s.supvCmdch <- &MsgSuccess{}
}

Expand Down

0 comments on commit 26b4c5b

Please sign in to comment.