feat: add sync piece watchdog (#1272)
* feat: add sync piece watchdog

Signed-off-by: Jim Ma <majinjing3@gmail.com>
jim3ma authored and gaius-qi committed May 24, 2022
1 parent 21ca12a commit 3e57f61
Showing 6 changed files with 167 additions and 5 deletions.
1 change: 1 addition & 0 deletions client/config/peerhost.go
@@ -217,6 +217,7 @@ type DownloadOption struct {
TransportOption *TransportOption `mapstructure:"transportOption" yaml:"transportOption"`
GetPiecesMaxRetry int `mapstructure:"getPiecesMaxRetry" yaml:"getPiecesMaxRetry"`
Prefetch bool `mapstructure:"prefetch" yaml:"prefetch"`
WatchdogTimeout time.Duration `mapstructure:"watchdogTimeout" yaml:"watchdogTimeout"`
}

type TransportOption struct {
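For reference, the new option would be set in the dfdaemon YAML roughly as sketched below. This is a hedged illustration: the download: nesting follows how DownloadOption is referenced as opt.Download in daemon.go, the watchdogTimeout key comes from the yaml tag above, and the surrounding keys and the Go-style duration string "30s" are assumptions about how the config decoder handles time.Duration fields.

download:
  prefetch: false
  getPiecesMaxRetry: 100
  # start the per-task sync-piece watchdog; leave unset (zero value) to disable it
  watchdogTimeout: 30s
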
3 changes: 2 additions & 1 deletion client/daemon/daemon.go
@@ -183,7 +183,8 @@ func New(opt *config.DaemonOption, d dfpath.Dfpath) (Daemon, error) {
return nil, err
}
peerTaskManager, err := peer.NewPeerTaskManager(host, pieceManager, storageManager, sched, opt.Scheduler,
opt.Download.PerPeerRateLimit.Limit, opt.Storage.Multiplex, opt.Download.Prefetch, opt.Download.CalculateDigest, opt.Download.GetPiecesMaxRetry)
opt.Download.PerPeerRateLimit.Limit, opt.Storage.Multiplex, opt.Download.Prefetch, opt.Download.CalculateDigest,
opt.Download.GetPiecesMaxRetry, opt.Download.WatchdogTimeout)
if err != nil {
return nil, err
}
2 changes: 2 additions & 0 deletions client/daemon/peer/peertask_conductor.go
@@ -63,6 +63,7 @@ var _ Task = (*peerTaskConductor)(nil)
// peerTaskConductor will fetch all pieces from other peers and send pieces info to broker
type peerTaskConductor struct {
*logger.SugaredLoggerOnWith
ptm *peerTaskManager
// ctx is with span info for tracing
// we use successCh and failCh mark task success or fail
ctx context.Context
@@ -202,6 +203,7 @@ func (ptm *peerTaskManager) newPeerTaskConductor(
span.SetAttributes(config.AttributeTaskID.String(taskID))

ptc := &peerTaskConductor{
ptm: ptm,
startTime: time.Now(),
ctx: ctx,
broker: newPieceBroker(),
7 changes: 6 additions & 1 deletion client/daemon/peer/peertask_manager.go
@@ -21,6 +21,7 @@ import (
"fmt"
"io"
"sync"
"time"

"github.com/go-http-utils/headers"
"github.com/pkg/errors"
@@ -130,6 +131,8 @@ type peerTaskManager struct {
enableMultiplex bool
// enablePrefetch indicates to prefetch the whole files of ranged requests
enablePrefetch bool
// watchdogTimeout > 0 indicates to start a watchdog for every single peer task
watchdogTimeout time.Duration

calculateDigest bool

@@ -146,7 +149,8 @@ func NewPeerTaskManager(
multiplex bool,
prefetch bool,
calculateDigest bool,
getPiecesMaxRetry int) (TaskManager, error) {
getPiecesMaxRetry int,
watchdog time.Duration) (TaskManager, error) {

ptm := &peerTaskManager{
host: host,
@@ -159,6 +163,7 @@
perPeerRateLimit: perPeerRateLimit,
enableMultiplex: multiplex,
enablePrefetch: prefetch,
watchdogTimeout: watchdog,
calculateDigest: calculateDigest,
getPiecesMaxRetry: getPiecesMaxRetry,
}
68 changes: 65 additions & 3 deletions client/daemon/peer/peertask_piecetask_synchronizer.go
@@ -21,6 +21,7 @@ import (
"fmt"
"io"
"sync"
"time"

"github.com/pkg/errors"
"go.opentelemetry.io/otel/trace"
@@ -43,6 +44,7 @@ type pieceTaskSyncManager struct {
peerTaskConductor *peerTaskConductor
pieceRequestCh chan *DownloadPieceRequest
workers map[string]*pieceTaskSynchronizer
watchdog *synchronizerWatchdog
}

type pieceTaskSynchronizer struct {
@@ -55,6 +57,13 @@ type pieceTaskSynchronizer struct {
pieceRequestCh chan *DownloadPieceRequest
}

type synchronizerWatchdog struct {
done chan struct{}
mainPeer atomic.Value // save *scheduler.PeerPacket_DestPeer
syncSuccess *atomic.Bool
peerTaskConductor *peerTaskConductor
}

type pieceTaskSynchronizerError struct {
err error
}
@@ -133,12 +142,12 @@ func (s *pieceTaskSyncManager) cleanStaleWorker(destPeers []*scheduler.PeerPacke
func (s *pieceTaskSyncManager) newPieceTaskSynchronizer(
ctx context.Context,
dstPeer *scheduler.PeerPacket_DestPeer,
lastNum int32) error {
desiredPiece int32) error {
request := &base.PieceTaskRequest{
TaskId: s.peerTaskConductor.taskID,
SrcPid: s.peerTaskConductor.peerID,
DstPid: dstPeer.PeerId,
StartNum: uint32(lastNum),
StartNum: uint32(desiredPiece),
Limit: 16,
}
if worker, ok := s.workers[dstPeer.PeerId]; ok {
@@ -191,7 +200,13 @@ func (s *pieceTaskSyncManager) newMultiPieceTaskSynchronizer(
destPeers []*scheduler.PeerPacket_DestPeer,
desiredPiece int32) (legacyPeers []*scheduler.PeerPacket_DestPeer) {
s.Lock()
defer s.Unlock()
defer func() {
if s.peerTaskConductor.ptm.watchdogTimeout > 0 {
s.resetWatchdog(destPeers[0])
}
s.Unlock()
}()

for _, peer := range destPeers {
err := s.newPieceTaskSynchronizer(s.ctx, peer, desiredPiece)
if err == nil {
@@ -222,6 +237,22 @@ func (s *pieceTaskSyncManager) newMultiPieceTaskSynchronizer(
return legacyPeers
}

func (s *pieceTaskSyncManager) resetWatchdog(mainPeer *scheduler.PeerPacket_DestPeer) {
if s.watchdog != nil {
close(s.watchdog.done)
s.peerTaskConductor.Debugf("close old watchdog")
}
s.watchdog = &synchronizerWatchdog{
done: make(chan struct{}),
mainPeer: atomic.Value{},
syncSuccess: atomic.NewBool(false),
peerTaskConductor: s.peerTaskConductor,
}
s.watchdog.mainPeer.Store(mainPeer)
s.peerTaskConductor.Infof("start new watchdog")
go s.watchdog.watch(s.peerTaskConductor.ptm.watchdogTimeout)
}

func compositePieceResult(peerTaskConductor *peerTaskConductor, destPeer *scheduler.PeerPacket_DestPeer, code base.Code) *scheduler.PieceResult {
return &scheduler.PieceResult{
TaskId: peerTaskConductor.taskID,
@@ -384,3 +415,34 @@ func (s *pieceTaskSynchronizer) canceled(err error) bool {
}
return false
}

func (s *synchronizerWatchdog) watch(timeout time.Duration) {
select {
case <-time.After(timeout):
if s.peerTaskConductor.readyPieces.Settled() == 0 {
s.peerTaskConductor.Warnf("watch sync pieces timeout, may be a bug, " +
"please file a issue in https://github.com/dragonflyoss/Dragonfly2/issues")
s.syncSuccess.Store(false)
s.reportWatchFailed()
} else {
s.peerTaskConductor.Infof("watch sync pieces ok")
}
case <-s.peerTaskConductor.successCh:
s.peerTaskConductor.Debugf("peer task success, watchdog exit")
case <-s.peerTaskConductor.failCh:
s.peerTaskConductor.Debugf("peer task fail, watchdog exit")
case <-s.done:
s.peerTaskConductor.Debugf("watchdog done, exit")
}
}

func (s *synchronizerWatchdog) reportWatchFailed() {
sendError := s.peerTaskConductor.sendPieceResult(compositePieceResult(
s.peerTaskConductor, s.mainPeer.Load().(*scheduler.PeerPacket_DestPeer), base.Code_ClientPieceRequestFail))
if sendError != nil {
s.peerTaskConductor.Errorf("watchdog sync piece info failed and send piece result with error: %s", sendError)
go s.peerTaskConductor.cancel(base.Code_SchedError, sendError.Error())
} else {
s.peerTaskConductor.Debugf("report watchdog sync piece error to scheduler")
}
}
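
To make the control flow above easier to follow in isolation, here is a minimal, self-contained Go sketch of the same reset-and-watch pattern: resetting closes the previous watchdog's done channel, and the watcher either observes progress before the deadline or fires a failure callback (the real code reports Code_ClientPieceRequestFail to the scheduler and may cancel the task). Names such as progressed and onTimeout are illustrative only, and it uses the standard library's atomic.Bool (Go 1.19+) rather than go.uber.org/atomic used in the diff.

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

type watchdog struct {
	done       chan struct{} // closed when this watchdog is superseded or the task ends
	progressed *atomic.Bool  // flipped by the download path once any piece arrives
}

// watch waits for either the timeout or cancellation; on timeout with no
// progress it invokes the failure callback.
func (w *watchdog) watch(timeout time.Duration, onTimeout func()) {
	select {
	case <-time.After(timeout):
		if !w.progressed.Load() {
			onTimeout()
		}
	case <-w.done:
		// superseded by a newer watchdog, or the peer task finished
	}
}

// resetWatchdog mirrors the shape of pieceTaskSyncManager.resetWatchdog:
// stop the old watcher, then start a fresh one for the newly selected main peer.
func resetWatchdog(old *watchdog, progressed *atomic.Bool, timeout time.Duration, onTimeout func()) *watchdog {
	if old != nil {
		close(old.done)
	}
	w := &watchdog{done: make(chan struct{}), progressed: progressed}
	go w.watch(timeout, onTimeout)
	return w
}

func main() {
	progressed := &atomic.Bool{}
	report := func() { fmt.Println("no pieces synced before the deadline") }

	w := resetWatchdog(nil, progressed, 50*time.Millisecond, report)
	// Simulate a peer switch: the first watchdog is cancelled, a new one starts.
	w = resetWatchdog(w, progressed, 50*time.Millisecond, report)
	_ = w

	time.Sleep(100 * time.Millisecond) // only the second watchdog reaches its deadline
}
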
91 changes: 91 additions & 0 deletions client/daemon/peer/peertask_piecetask_synchronizer_test.go
@@ -0,0 +1,91 @@
/*
* Copyright 2022 The Dragonfly Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package peer

import (
"sync"
"testing"
"time"

"github.com/golang/mock/gomock"
testifyassert "github.com/stretchr/testify/assert"
"go.uber.org/atomic"

logger "d7y.io/dragonfly/v2/internal/dflog"
"d7y.io/dragonfly/v2/pkg/rpc/scheduler"
"d7y.io/dragonfly/v2/pkg/rpc/scheduler/client/mocks"
)

func Test_watchdog(t *testing.T) {
ctrl := gomock.NewController(t)
assert := testifyassert.New(t)

var testCases = []struct {
name string
timeout time.Duration
ok bool
}{
{
name: "watchdog ok",
timeout: time.Millisecond,
ok: true,
},
{
name: "watchdog failed",
timeout: time.Millisecond,
ok: false,
},
}

for _, tt := range testCases {
t.Run(tt.name, func(t *testing.T) {
peer := &scheduler.PeerPacket_DestPeer{}
pps := mocks.NewMockPeerPacketStream(ctrl)
watchdog := &synchronizerWatchdog{
done: make(chan struct{}),
mainPeer: atomic.Value{},
syncSuccess: atomic.NewBool(false),
peerTaskConductor: &peerTaskConductor{
SugaredLoggerOnWith: logger.With(
"peer", "test",
"task", "test",
"component", "PeerTask"),
readyPieces: NewBitmap(),
peerPacketStream: pps,
},
}
if tt.ok {
watchdog.peerTaskConductor.readyPieces.Set(0)
} else {
pps.EXPECT().Send(gomock.Any()).DoAndReturn(func(pr *scheduler.PieceResult) error {
assert.Equal(peer.PeerId, pr.DstPid)
return nil
})
}
watchdog.mainPeer.Store(peer)

wg := sync.WaitGroup{}
wg.Add(1)
go func() {
watchdog.watch(tt.timeout)
wg.Done()
}()

wg.Wait()
})
}
}
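
To exercise just this test locally, something like the following should work from the repository root, assuming the Go toolchain and the generated scheduler client mocks are in place:

go test ./client/daemon/peer -run Test_watchdog -v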
