Skip to content

Commit

Permalink
runtime/v2: net.Dial gRPC shim sockets before trying grpc
Browse files Browse the repository at this point in the history
This is mostly to workaround an issue with gRPC based shims after containerd
restart. If a shim dies while containerd is also down/restarting, on reboot
grpc.DialContext with our current set of DialOptions will make us wait for
100 seconds per shim even if the socket no longer exists or has no listener.

Signed-off-by: Danny Canter <danny@dcantah.dev>
(cherry picked from commit 0bc9633)
Signed-off-by: Danny Canter <danny@dcantah.dev>
  • Loading branch information
dcantah committed Dec 3, 2023
1 parent 096a0e8 commit 80f96cd
Showing 1 changed file with 22 additions and 2 deletions.
24 changes: 22 additions & 2 deletions runtime/v2/shim.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"errors"
"fmt"
"io"
"net"
"os"
"path/filepath"
"strings"
Expand Down Expand Up @@ -253,7 +254,7 @@ func makeConnection(ctx context.Context, params client.BootstrapParams, onClose
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithBlock(),
}
return grpcDialContext(ctx, dialer.DialAddress(params.Address), onClose, gopts...)
return grpcDialContext(ctx, params.Address, onClose, gopts...)
default:
return nil, fmt.Errorf("unexpected protocol: %q", params.Protocol)
}
Expand All @@ -264,10 +265,29 @@ func makeConnection(ctx context.Context, params client.BootstrapParams, onClose
// a callback run when the connection is severed or explicitly closed.
func grpcDialContext(
ctx context.Context,
target string,
address string,
onClose func(),
gopts ...grpc.DialOption,
) (*grpcConn, error) {
// If grpc.WithBlock is specified in gopts this causes the connection to block waiting for
// a connection regardless of if the socket exists or has a listener when Dial begins. This
// specific behavior of WithBlock is mostly undesirable for shims, as if the socket isn't
// there when we go to load/connect there's likely an issue. However, getting rid of WithBlock is
// also undesirable as we don't want the background connection behavior, we want to ensure
// a connection before moving on. To bring this in line with the ttrpc connection behavior
// lets do an initial dial to ensure the shims socket is actually available. stat wouldn't suffice
// here as if the shim exited unexpectedly its socket may still be on the filesystem, but it'd return
// ECONNREFUSED which grpc.DialContext will happily trudge along through for the full timeout.
//
// This is especially helpful on restart of containerd as if the shim died while containerd
// was down, we end up waiting the full timeout.
conn, err := net.DialTimeout("unix", address, time.Second*10)
if err != nil {
return nil, err
}
conn.Close()

target := dialer.DialAddress(address)
client, err := grpc.DialContext(ctx, target, gopts...)
if err != nil {
return nil, fmt.Errorf("failed to create GRPC connection: %w", err)
Expand Down

0 comments on commit 80f96cd

Please sign in to comment.