Skip to content

Commit 9c05157

Browse files
authored
Support lock-wait with --lock-retry in restic 0.16 (#240)
* Support lock-wait with --lock-retry in restic 0.16 * Simplified logic and relaxed min stale-lock age * Supporting custom overridden "lock-retry" values * Improved logging and aligned documentation
1 parent 6cbfdb3 commit 9c05157

File tree

11 files changed

+217
-67
lines changed

11 files changed

+217
-67
lines changed

.mockery.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,7 @@ packages:
1717
github.com/creativeprojects/resticprofile/schedule:
1818
interfaces:
1919
Handler:
20+
21+
github.com/creativeprojects/resticprofile/monitor:
22+
interfaces:
23+
OutputAnalysis:

config/global.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ type Global struct {
1919
ResticVersion string // not configurable at the moment. To be set after ResticBinary is known.
2020
FilterResticFlags bool `mapstructure:"restic-arguments-filter" default:"true" description:"Remove unknown flags instead of passing all configured flags to restic"`
2121
ResticLockRetryAfter time.Duration `mapstructure:"restic-lock-retry-after" default:"1m" description:"Time to wait before trying to get a lock on a restic repositoey - see https://creativeprojects.github.io/resticprofile/usage/locks/"`
22-
ResticStaleLockAge time.Duration `mapstructure:"restic-stale-lock-age" default:"2h" description:"The age an unused lock on a restic repository must have at least before resiticprofile attempts to unlock - see https://creativeprojects.github.io/resticprofile/usage/locks/"`
22+
ResticStaleLockAge time.Duration `mapstructure:"restic-stale-lock-age" default:"1h" description:"The age an unused lock on a restic repository must have at least before resiticprofile attempts to unlock - see https://creativeprojects.github.io/resticprofile/usage/locks/"`
2323
ShellBinary []string `mapstructure:"shell" default:"auto" examples:"sh;bash;pwsh;powershell;cmd" description:"The shell that is used to run commands (default is OS specific)"`
2424
MinMemory uint64 `mapstructure:"min-memory" default:"100" description:"Minimum available memory (in MB) required to run any commands - see https://creativeprojects.github.io/resticprofile/usage/memory/"`
2525
Scheduler string `mapstructure:"scheduler" description:"Leave blank for the default scheduler or use \"crond\" to select cron on supported operating systems"`

constants/default.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ const (
99
DefaultCommand = "snapshots"
1010
DefaultFilterResticFlags = true
1111
DefaultResticLockRetryAfter = 60 * time.Second
12-
DefaultResticStaleLockAge = 2 * time.Hour
12+
DefaultResticStaleLockAge = 1 * time.Hour
1313
DefaultTheme = "light"
1414
DefaultIONiceFlag = false
1515
DefaultIONiceClass = 2

constants/global.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,10 @@ var (
2828

2929
// Limits for restic lock handling (stale locks and retry on lock failure)
3030
const (
31-
MinResticLockRetryTime = 15 * time.Second
32-
MaxResticLockRetryTime = 30 * time.Minute
33-
MinResticStaleLockAge = 1 * time.Hour
31+
MinResticLockRetryDelay = 15 * time.Second
32+
MaxResticLockRetryDelay = 30 * time.Minute
33+
MaxResticLockRetryTimeArgument = 10 * time.Minute
34+
MinResticStaleLockAge = 15 * time.Minute
3435
)
3536

3637
// Schedule lock mode config options

constants/parameter.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ const (
1919
ParameterVerbose = "verbose"
2020
ParameterDescription = "description"
2121
ParameterVersion = "version"
22+
ParameterRetryLock = "retry-lock"
2223
ParameterRepository = "repo"
2324
ParameterRepositoryFile = "repository-file"
2425
ParameterPasswordFile = "password-file"

docs/content/usage/locks/index.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ In some cases, resticprofile as well as restic may leave a lock behind if the pr
121121

122122
For that matter, if you add the flag `force-inactive-lock` to your profile, resticprofile will detect and remove stale locks:
123123
* **resticprofile locks**: Check for the presence of a process with the PID indicated in the lockfile. If it can't find any, it will try to delete the lock and continue the operation (locking again, running profile and so on...)
124-
* **restic locks**: Evaluate if a restic command failed on acquiring a lock. If the lock is older than `restic-stale-lock-age`, invoke `restic unlock` and retry the command that failed (can be disabled by setting `restic-stale-lock-age` to 0, default is 2h).
124+
* **restic locks**: Evaluate if a restic command failed on acquiring a lock. If the lock is older than `restic-stale-lock-age`, invoke `restic unlock` and retry the command that failed (can be disabled by setting `restic-stale-lock-age` to 0, default is 1h).
125125

126126
{{< tabs groupId="config-with-json" >}}
127127
{{% tab name="toml" %}}

monitor/summary.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ type OutputAnalysis interface {
2626
// If no remote lock is held or the time cannot be determined, the second parameter is false.
2727
GetRemoteLockedSince() (time.Duration, bool)
2828

29+
// GetRemoteLockedMaxWait returns the max time duration that restic waited for the lock to acquire.
30+
// If no remote lock is held or the time cannot be determined, the second parameter is false.
31+
GetRemoteLockedMaxWait() (time.Duration, bool)
32+
2933
// GetRemoteLockedBy returns who locked the remote lock, if available.
3034
GetRemoteLockedBy() (string, bool)
3135
}

shell/analyser.go

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,10 @@ type OutputAnalyser struct {
3636
}
3737

3838
var outputAnalyserPatterns = map[string]*regexp.Regexp{
39-
"lock-failure,who": regexp.MustCompile("unable to create lock.+already locked.+?by (.+)$"),
40-
"lock-failure,age": regexp.MustCompile("lock was created at.+\\(([^()]+)\\s+ago\\)"),
41-
"lock-failure,stale": regexp.MustCompile("the\\W+unlock\\W+command can be used to remove stale locks"),
39+
"lock-failure,who": regexp.MustCompile("unable to create lock.+already locked.+?by (.+)$"),
40+
"lock-failure,age": regexp.MustCompile("lock was created at.+\\(([^()]+)\\s+ago\\)"),
41+
"lock-failure,stale": regexp.MustCompile("the\\W+unlock\\W+command can be used to remove stale locks"),
42+
"lock-retry,max-wait": regexp.MustCompile("repo already locked, waiting up to (\\S+) for the lock"),
4243
}
4344

4445
func NewOutputAnalyser() *OutputAnalyser {
@@ -99,7 +100,7 @@ func (a *OutputAnalyser) SetCallback(name, pattern string, minCount, maxCalls in
99100
return nil
100101
}
101102

102-
func (a OutputAnalyser) invokeCallback(name, line string) (err error) {
103+
func (a *OutputAnalyser) invokeCallback(name, line string) (err error) {
103104
count, hasCount := a.counts[name]
104105
cb, hasCallback := a.callbacks[name]
105106

@@ -124,7 +125,7 @@ func (a OutputAnalyser) invokeCallback(name, line string) (err error) {
124125
return
125126
}
126127

127-
func (a OutputAnalyser) ContainsRemoteLockFailure() bool {
128+
func (a *OutputAnalyser) ContainsRemoteLockFailure() bool {
128129
a.lock.Lock()
129130
defer a.lock.Unlock()
130131

@@ -134,7 +135,7 @@ func (a OutputAnalyser) ContainsRemoteLockFailure() bool {
134135
return false
135136
}
136137

137-
func (a OutputAnalyser) GetRemoteLockedSince() (time.Duration, bool) {
138+
func (a *OutputAnalyser) GetRemoteLockedSince() (time.Duration, bool) {
138139
a.lock.Lock()
139140
defer a.lock.Unlock()
140141

@@ -149,7 +150,22 @@ func (a OutputAnalyser) GetRemoteLockedSince() (time.Duration, bool) {
149150
return 0, false
150151
}
151152

152-
func (a OutputAnalyser) GetRemoteLockedBy() (string, bool) {
153+
func (a *OutputAnalyser) GetRemoteLockedMaxWait() (time.Duration, bool) {
154+
a.lock.Lock()
155+
defer a.lock.Unlock()
156+
157+
if m, ok := a.matches["lock-retry,max-wait"]; ok && len(m) > 1 {
158+
if maxWait, err := time.ParseDuration(m[1]); err == nil {
159+
return maxWait.Truncate(time.Millisecond), true
160+
} else {
161+
clog.Debugf("failed parsing restic retry-lock max wait. Cause %s", err.Error())
162+
}
163+
}
164+
165+
return 0, false
166+
}
167+
168+
func (a *OutputAnalyser) GetRemoteLockedBy() (string, bool) {
153169
a.lock.Lock()
154170
defer a.lock.Unlock()
155171

@@ -160,11 +176,11 @@ func (a OutputAnalyser) GetRemoteLockedBy() (string, bool) {
160176
return "", false
161177
}
162178

163-
func (a OutputAnalyser) AnalyseStringLines(output string) error {
179+
func (a *OutputAnalyser) AnalyseStringLines(output string) error {
164180
return a.AnalyseLines(strings.NewReader(output))
165181
}
166182

167-
func (a OutputAnalyser) AnalyseLines(output io.Reader) (err error) {
183+
func (a *OutputAnalyser) AnalyseLines(output io.Reader) (err error) {
168184
a.lock.Lock()
169185
defer a.lock.Unlock()
170186

@@ -183,7 +199,7 @@ func (a OutputAnalyser) AnalyseLines(output io.Reader) (err error) {
183199
return
184200
}
185201

186-
func (a OutputAnalyser) analyseLine(line string) (err error) {
202+
func (a *OutputAnalyser) analyseLine(line string) (err error) {
187203
for _, pattern := range a.patterns {
188204
match := pattern.expression.FindStringSubmatch(line)
189205
if match != nil {

shell/analyser_test.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
)
1111

1212
const ResticLockFailureOutput = `
13+
repo already locked, waiting up to 2m1s115.3ms for the lock
1314
unable to create lock in backend: repository is already locked by PID 27153 on app-server-01 by root (UID 0, GID 0)
1415
lock was created at 2021-04-17 19:00:16 (1s727.5ms ago)
1516
storage ID 870530a4
@@ -33,6 +34,12 @@ func TestRemoteLockFailure(t *testing.T) {
3334
assert.Equal(t, true, ok)
3435
assert.Equal(t, "PID 27153 on app-server-01 by root (UID 0, GID 0)", name)
3536
})
37+
38+
t.Run("GetRemoteLockedMaxWait", func(t *testing.T) {
39+
wait, ok := analysis.GetRemoteLockedMaxWait()
40+
assert.Equal(t, true, ok)
41+
assert.Equal(t, 2*time.Minute+time.Second+(115*time.Millisecond), wait)
42+
})
3643
}
3744

3845
func TestCustomErrorCallback(t *testing.T) {

wrapper.go

Lines changed: 59 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,13 @@ func (r *resticWrapper) getCommandArgumentsFilter(command string) argumentsFilte
346346
return nil
347347
}
348348

349+
func (r *resticWrapper) containsArguments(arguments []string, subset ...string) (found bool) {
350+
filter := r.validArgumentsFilter(subset)
351+
argMatcher := func(arg string) bool { return strings.HasPrefix(arg, "-") }
352+
found = slices.ContainsFunc(filter(arguments, true), argMatcher)
353+
return
354+
}
355+
349356
func (r *resticWrapper) prepareCommand(command string, args *shell.Args, allowExtraValues bool) shellCommandDefinition {
350357
// Create local instance to allow modification
351358
args = args.Clone()
@@ -365,6 +372,20 @@ func (r *resticWrapper) prepareCommand(command string, args *shell.Args, allowEx
365372
args.AddArgs(r.profile.GetBackupSource(), shell.ArgConfigBackupSource)
366373
}
367374

375+
// Add retry-lock (supported from restic 0.16, depends on filter being enabled)
376+
if lockRetryTime, enabled := r.remainingLockRetryTime(); enabled && filter != nil {
377+
// limiting the retry handling in restic, we need to make sure we can retry internally so that unlock is called.
378+
lockRetryTime = lockRetryTime - r.global.ResticLockRetryAfter - constants.MinResticLockRetryDelay
379+
if lockRetryTime > constants.MaxResticLockRetryTimeArgument {
380+
lockRetryTime = constants.MaxResticLockRetryTimeArgument
381+
}
382+
lockRetryTime = lockRetryTime.Truncate(time.Minute)
383+
384+
if lockRetryTime > 0 && !r.containsArguments(args.GetAll(), fmt.Sprintf("--%s", constants.ParameterRetryLock)) {
385+
args.AddFlag(constants.ParameterRetryLock, fmt.Sprintf("%.0fm", lockRetryTime.Minutes()), shell.ArgConfigEscape)
386+
}
387+
}
388+
368389
// Build arguments and publicArguments (for logging)
369390
arguments, publicArguments := args.GetAll(), config.GetNonConfidentialArgs(r.profile, args).GetAll()
370391
if filter != nil {
@@ -732,7 +753,16 @@ func (r *resticWrapper) canRetryAfterError(command string, summary monitor.Summa
732753
output := summary.OutputAnalysis
733754

734755
if output != nil && output.ContainsRemoteLockFailure() {
735-
clog.Debugf("repository lock failed when running '%s'", command)
756+
// Do not count lock-wait time as normal execution time (to calc correct remaining lock-wait time)
757+
if maxWait, ok := output.GetRemoteLockedMaxWait(); ok {
758+
r.executionTime -= maxWait
759+
} else {
760+
r.executionTime -= summary.Duration
761+
}
762+
if r.executionTime < 0 {
763+
r.executionTime = 0
764+
}
765+
clog.Debugf("repository lock failed when running '%s', counted execution time %s", command, r.executionTime.Truncate(time.Second))
736766
retry, sleep = r.canRetryAfterRemoteLockFailure(output)
737767
}
738768

@@ -788,28 +818,26 @@ func (r *resticWrapper) canRetryAfterRemoteLockFailure(output monitor.OutputAnal
788818
}
789819

790820
// Check if we have time left to wait on a non-stale lock
791-
retryDelay := r.global.ResticLockRetryAfter
792-
793-
if r.lockWait != nil && retryDelay > 0 {
794-
elapsedTime := time.Since(r.startTime)
795-
availableTime := *r.lockWait - elapsedTime + r.executionTime
796-
797-
if retryDelay < constants.MinResticLockRetryTime {
798-
retryDelay = constants.MinResticLockRetryTime
799-
} else if retryDelay > constants.MaxResticLockRetryTime {
800-
retryDelay = constants.MaxResticLockRetryTime
821+
if remainingTime, enabled := r.remainingLockRetryTime(); enabled && remainingTime > 0 {
822+
retryDelay := r.global.ResticLockRetryAfter
823+
if retryDelay < constants.MinResticLockRetryDelay {
824+
retryDelay = constants.MinResticLockRetryDelay
825+
} else if retryDelay > constants.MaxResticLockRetryDelay {
826+
retryDelay = constants.MaxResticLockRetryDelay
801827
}
802828

803-
if retryDelay > availableTime {
804-
retryDelay = availableTime
829+
if retryDelay > remainingTime {
830+
retryDelay = remainingTime
805831
}
806832

807-
if retryDelay >= constants.MinResticLockRetryTime {
833+
if retryDelay >= constants.MinResticLockRetryDelay {
808834
lockName := r.profile.Repository.String()
809835
if lockedBy, ok := output.GetRemoteLockedBy(); ok {
810836
lockName = fmt.Sprintf("%s locked by %s", lockName, lockedBy)
811837
}
812-
logLockWait(lockName, r.startTime, time.Unix(0, 0), *r.lockWait)
838+
if r.lockWait != nil {
839+
logLockWait(lockName, r.startTime, time.Unix(0, 0), r.executionTime, *r.lockWait)
840+
}
813841

814842
return true, retryDelay
815843
}
@@ -819,6 +847,18 @@ func (r *resticWrapper) canRetryAfterRemoteLockFailure(output monitor.OutputAnal
819847
return false, 0
820848
}
821849

850+
func (r *resticWrapper) remainingLockRetryTime() (remaining time.Duration, enabled bool) {
851+
enabled = r.global.ResticLockRetryAfter > 0 && r.lockWait != nil
852+
if enabled {
853+
elapsedTime := time.Since(r.startTime)
854+
remaining = *r.lockWait - elapsedTime + r.executionTime
855+
}
856+
if remaining < 0 {
857+
remaining = 0
858+
}
859+
return
860+
}
861+
822862
// lockRun is making sure the function is only run once by putting a lockfile on the disk
823863
func lockRun(lockFile string, force bool, lockWait *time.Duration, run func(setPID lock.SetPID) error) error {
824864
// No lock
@@ -873,7 +913,7 @@ func lockRun(lockFile string, force bool, lockWait *time.Duration, run func(setP
873913
}
874914
if time.Since(start) < *lockWait {
875915
lockName := fmt.Sprintf("%s locked by %s", lockFile, locker)
876-
lockWaitLogged = logLockWait(lockName, start, lockWaitLogged, *lockWait)
916+
lockWaitLogged = logLockWait(lockName, start, lockWaitLogged, 0, *lockWait)
877917

878918
sleep := 3 * time.Second
879919
if sleep > *lockWait {
@@ -894,15 +934,16 @@ func lockRun(lockFile string, force bool, lockWait *time.Duration, run func(setP
894934

895935
const logLockWaitEvery = 5 * time.Minute
896936

897-
func logLockWait(lockName string, started, lastLogged time.Time, maxLockWait time.Duration) time.Time {
937+
func logLockWait(lockName string, started, lastLogged time.Time, executed, maxLockWait time.Duration) time.Time {
898938
now := time.Now()
899939
lastLog := now.Sub(lastLogged)
900940
elapsed := now.Sub(started).Truncate(time.Second)
941+
waited := (elapsed - executed).Truncate(time.Second)
901942
remaining := (maxLockWait - elapsed).Truncate(time.Second)
902943

903944
if lastLog > logLockWaitEvery {
904945
if elapsed > logLockWaitEvery {
905-
clog.Infof("lock wait (remaining %s / elapsed %s): %s", remaining, elapsed, strings.TrimSpace(lockName))
946+
clog.Infof("lock wait (remaining %s / waited %s / elapsed %s): %s", remaining, waited, elapsed, strings.TrimSpace(lockName))
906947
} else {
907948
clog.Infof("lock wait (remaining %s): %s", remaining, strings.TrimSpace(lockName))
908949
}

0 commit comments

Comments
 (0)