Skip to content

Commit

Permalink
Wait a bit for Shutdown to finish on SIGTERM (#39211)
Browse files Browse the repository at this point in the history
  • Loading branch information
espadolini committed Mar 11, 2024
1 parent cbfcd73 commit c695454
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 103 deletions.
61 changes: 25 additions & 36 deletions lib/service/signals.go
Expand Up @@ -40,6 +40,16 @@ import (
"github.com/gravitational/teleport/lib/utils"
)

const (
	// fastShutdownTimeout is the deadline handed to the shutdown procedure
	// during a fast shutdown; once it elapses, connections are forcibly
	// terminated.
	fastShutdownTimeout = 3 * time.Second

	// fastShutdownGrace is the additional time, counted from the moment
	// fastShutdownTimeout is hit, that we give the shutdown procedure to
	// actually finish before giving up on it.
	fastShutdownGrace = 2 * time.Second
)

// printShutdownStatus prints running services until shut down
func (process *TeleportProcess) printShutdownStatus(ctx context.Context) {
statusInterval := defaults.HighResPollingPeriod
Expand Down Expand Up @@ -91,19 +101,25 @@ func (process *TeleportProcess) WaitForSignals(ctx context.Context) error {
process.log.Infof("All services stopped, exiting.")
return nil
case syscall.SIGTERM, syscall.SIGINT:
timeout := getShutdownTimeout(process.log)
timeoutCtx, cancel := context.WithTimeout(ctx, timeout)
process.log.Infof("Got signal %q, exiting within %vs.", signal, timeout.Seconds())
// we run the shutdown in a goroutine and return when the
// context is done even if Shutdown hasn't returned because we
// want to ensure that we exit shortly after SIGTERM even in
// case of bugs
process.log.Infof("Got signal %q, exiting within %s.", signal, fastShutdownTimeout)
// we run the shutdown in a goroutine so we can return and exit
// the process even if Shutdown takes longer to return than we
// expected (due to bugs, for example)
shutdownDone := make(chan struct{})
go func() {
defer close(shutdownDone)
timeoutCtx, cancel := context.WithTimeout(ctx, fastShutdownTimeout)
defer cancel()
process.Shutdown(timeoutCtx)
}()
<-timeoutCtx.Done()
process.log.Infof("All services stopped or timeout passed, exiting immediately.")
graceTimer := time.NewTimer(fastShutdownTimeout + fastShutdownGrace)
defer graceTimer.Stop()
select {
case <-graceTimer.C:
process.log.Warn("Shutdown still hasn't completed, exiting anyway.")
case <-shutdownDone:
process.log.Info("All services stopped, exiting.")
}
return nil
case syscall.SIGUSR1:
// All programs placed diagnostics on the standard output.
Expand Down Expand Up @@ -180,33 +196,6 @@ func (process *TeleportProcess) WaitForSignals(ctx context.Context) error {
}
}

const (
	// defaultShutdownTimeout is the shutdown timeout used when no valid
	// override is supplied.
	defaultShutdownTimeout = 3 * time.Second

	// maxShutdownTimeout is the upper bound applied to an overridden
	// shutdown timeout; larger overrides are reduced to this value.
	maxShutdownTimeout = 10 * time.Minute
)

// getShutdownTimeout returns the shutdown timeout to use.
//
// The result is defaultShutdownTimeout unless the undocumented environment
// variable TELEPORT_UNSTABLE_SHUTDOWN_TIMEOUT is set to a non-empty value. An
// override that cannot be parsed as a duration is ignored (with a warning
// written to log), and an override larger than maxShutdownTimeout is reduced
// to maxShutdownTimeout (also with a warning).
func getShutdownTimeout(log logrus.FieldLogger) time.Duration {
	// TODO(Tener): DELETE IN 15.0. after ironing out all possible shutdown bugs.
	raw := os.Getenv("TELEPORT_UNSTABLE_SHUTDOWN_TIMEOUT")
	if raw == "" {
		// No override requested.
		return defaultShutdownTimeout
	}

	parsed, err := time.ParseDuration(raw)
	switch {
	case err != nil:
		log.Warnf("Cannot parse timeout override %q, using default instead.", raw)
		return defaultShutdownTimeout
	case parsed > maxShutdownTimeout:
		log.Warnf("Timeout override %q exceeds maximum value, reducing.", raw)
		return maxShutdownTimeout
	default:
		return parsed
	}
}

// ErrTeleportReloading is returned when signal waiter exits
// because the teleport process has initiated shutdown
var ErrTeleportReloading = &trace.CompareFailedError{Message: "teleport process is reloading"}
Expand Down
67 changes: 0 additions & 67 deletions lib/service/signals_test.go

This file was deleted.

0 comments on commit c695454

Please sign in to comment.