Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a retry mechanism for netlink calls in the vxlan backend #1718

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Expand Up @@ -36,6 +36,7 @@ require (
)

require (
github.com/avast/retry-go/v4 v4.3.2
github.com/tencentcloud/tencentcloud-sdk-go/tencentcloud/common v1.0.464
github.com/tencentcloud/tencentcloud-sdk-go/tencentcloud/vpc v1.0.464
go.etcd.io/etcd/client/v3 v3.5.4
Expand Down
9 changes: 8 additions & 1 deletion go.sum
Expand Up @@ -92,6 +92,8 @@ github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5
github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY=
github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8=
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
github.com/avast/retry-go/v4 v4.3.2 h1:x4sTEu3jSwr7zNjya8NTdIN+U88u/jtO/q3OupBoDtM=
github.com/avast/retry-go/v4 v4.3.2/go.mod h1:rg6XFaiuFYII0Xu3RDbZQkxCofFwruZKW8oEF1jpWiU=
github.com/aws/aws-sdk-go v1.15.11/go.mod h1:mFuSZ37Z9YOHbQEwBWztmVzqXrEkub65tZoCYDt7FT0=
github.com/benbjohnson/clock v1.0.3 h1:vkLuvpK4fmtSCuo60+yC63p7y0BmQ8gm5ZXGuBCJyXg=
github.com/benbjohnson/clock v1.0.3/go.mod h1:bGMdMPoPVvcYyt1gHDf4J2KE153Yf9BuiUKYMaxlTDM=
Expand Down Expand Up @@ -767,14 +769,19 @@ github.com/stretchr/objx v0.0.0-20180129172003-8a3f7159479f/go.mod h1:HFkY916IF+
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v0.0.0-20180303142811-b89eecf5ca5d/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw=
github.com/syndtr/gocapability v0.0.0-20170704070218-db04d3cc01c8/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=
github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=
Expand Down
77 changes: 58 additions & 19 deletions pkg/backend/vxlan/vxlan_network.go
Expand Up @@ -24,6 +24,7 @@ import (

"github.com/flannel-io/flannel/pkg/backend"
"github.com/flannel-io/flannel/pkg/ip"
"github.com/flannel-io/flannel/pkg/retry"
"github.com/flannel-io/flannel/pkg/subnet"
"github.com/vishvananda/netlink"
"golang.org/x/net/context"
Expand Down Expand Up @@ -170,22 +171,30 @@ func (nw *network) handleSubnetEvents(batch []subnet.Event) {
if directRoutingOK {
log.V(2).Infof("Adding direct route to subnet: %s PublicIP: %s", sn, attrs.PublicIP)

if err := netlink.RouteReplace(&directRoute); err != nil {
if err := retry.Do(func() error {
return netlink.RouteReplace(&directRoute)
}); err != nil {
log.Errorf("Error adding route to %v via %v: %v", sn, attrs.PublicIP, err)
continue
}
} else {
log.V(2).Infof("adding subnet: %s PublicIP: %s VtepMAC: %s", sn, attrs.PublicIP, net.HardwareAddr(vxlanAttrs.VtepMAC))
if err := nw.dev.AddARP(neighbor{IP: sn.IP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)}); err != nil {
if err := retry.Do(func() error {
return nw.dev.AddARP(neighbor{IP: sn.IP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)})
}); err != nil {
log.Error("AddARP failed: ", err)
continue
}

if err := nw.dev.AddFDB(neighbor{IP: attrs.PublicIP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)}); err != nil {
if err := retry.Do(func() error {
return nw.dev.AddFDB(neighbor{IP: attrs.PublicIP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)})
}); err != nil {
log.Error("AddFDB failed: ", err)

// Try to clean up the ARP entry then continue
if err := nw.dev.DelARP(neighbor{IP: event.Lease.Subnet.IP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)}); err != nil {
if err := retry.Do(func() error {
return nw.dev.DelARP(neighbor{IP: event.Lease.Subnet.IP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)})
}); err != nil {
log.Error("DelARP failed: ", err)
}

Expand All @@ -194,7 +203,9 @@ func (nw *network) handleSubnetEvents(batch []subnet.Event) {

// Set the route - the kernel would ARP for the Gw IP address if it hadn't already been set above so make sure
// this is done last.
if err := netlink.RouteReplace(&vxlanRoute); err != nil {
if err := retry.Do(func() error {
return netlink.RouteReplace(&vxlanRoute)
}); err != nil {
log.Errorf("failed to add vxlanRoute (%s -> %s): %v", vxlanRoute.Dst, vxlanRoute.Gw, err)

// Try to clean up both the ARP and FDB entries then continue
Expand All @@ -214,18 +225,24 @@ func (nw *network) handleSubnetEvents(batch []subnet.Event) {
if v6DirectRoutingOK {
log.V(2).Infof("Adding v6 direct route to v6 subnet: %s PublicIPv6: %s", v6Sn, attrs.PublicIPv6)

if err := netlink.RouteReplace(&v6DirectRoute); err != nil {
if err := retry.Do(func() error {
return netlink.RouteReplace(&v6DirectRoute)
}); err != nil {
log.Errorf("Error adding v6 route to %v via %v: %v", v6Sn, attrs.PublicIPv6, err)
continue
}
} else {
log.V(2).Infof("adding v6 subnet: %s PublicIPv6: %s VtepMAC: %s", v6Sn, attrs.PublicIPv6, net.HardwareAddr(v6VxlanAttrs.VtepMAC))
if err := nw.v6Dev.AddV6ARP(neighbor{IP6: v6Sn.IP, MAC: net.HardwareAddr(v6VxlanAttrs.VtepMAC)}); err != nil {
if err := retry.Do(func() error {
return nw.v6Dev.AddV6ARP(neighbor{IP6: v6Sn.IP, MAC: net.HardwareAddr(v6VxlanAttrs.VtepMAC)})
}); err != nil {
log.Error("AddV6ARP failed: ", err)
continue
}

if err := nw.v6Dev.AddV6FDB(neighbor{IP6: attrs.PublicIPv6, MAC: net.HardwareAddr(v6VxlanAttrs.VtepMAC)}); err != nil {
if err := retry.Do(func() error {
return nw.v6Dev.AddV6FDB(neighbor{IP6: attrs.PublicIPv6, MAC: net.HardwareAddr(v6VxlanAttrs.VtepMAC)})
}); err != nil {
log.Error("AddV6FDB failed: ", err)

// Try to clean up the ARP entry then continue
Expand All @@ -238,15 +255,21 @@ func (nw *network) handleSubnetEvents(batch []subnet.Event) {

// Set the route - the kernel would ARP for the Gw IP address if it hadn't already been set above so make sure
// this is done last.
if err := netlink.RouteReplace(&v6VxlanRoute); err != nil {
if err := retry.Do(func() error {
return netlink.RouteReplace(&v6VxlanRoute)
}); err != nil {
log.Errorf("failed to add v6 vxlanRoute (%s -> %s): %v", v6VxlanRoute.Dst, v6VxlanRoute.Gw, err)

// Try to clean up both the ARP and FDB entries then continue
if err := nw.v6Dev.DelV6ARP(neighbor{IP6: event.Lease.IPv6Subnet.IP, MAC: net.HardwareAddr(v6VxlanAttrs.VtepMAC)}); err != nil {
if err := retry.Do(func() error {
return nw.v6Dev.DelV6ARP(neighbor{IP6: event.Lease.IPv6Subnet.IP, MAC: net.HardwareAddr(v6VxlanAttrs.VtepMAC)})
}); err != nil {
log.Error("DelV6ARP failed: ", err)
}

if err := nw.v6Dev.DelV6FDB(neighbor{IP6: event.Lease.Attrs.PublicIPv6, MAC: net.HardwareAddr(v6VxlanAttrs.VtepMAC)}); err != nil {
if err := retry.Do(func() error {
return nw.v6Dev.DelV6FDB(neighbor{IP6: event.Lease.Attrs.PublicIPv6, MAC: net.HardwareAddr(v6VxlanAttrs.VtepMAC)})
}); err != nil {
log.Error("DelV6FDB failed: ", err)
}

Expand All @@ -258,45 +281,61 @@ func (nw *network) handleSubnetEvents(batch []subnet.Event) {
if event.Lease.EnableIPv4 {
if directRoutingOK {
log.V(2).Infof("Removing direct route to subnet: %s PublicIP: %s", sn, attrs.PublicIP)
if err := netlink.RouteDel(&directRoute); err != nil {
if err := retry.Do(func() error {
return netlink.RouteDel(&directRoute)
}); err != nil {
log.Errorf("Error deleting route to %v via %v: %v", sn, attrs.PublicIP, err)
}
} else {
log.V(2).Infof("removing subnet: %s PublicIP: %s VtepMAC: %s", sn, attrs.PublicIP, net.HardwareAddr(vxlanAttrs.VtepMAC))

// Try to remove all entries - don't bail out if one of them fails.
if err := nw.dev.DelARP(neighbor{IP: sn.IP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)}); err != nil {
if err := retry.Do(func() error {
return nw.dev.DelARP(neighbor{IP: sn.IP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)})
}); err != nil {
log.Error("DelARP failed: ", err)
}

if err := nw.dev.DelFDB(neighbor{IP: attrs.PublicIP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)}); err != nil {
if err := retry.Do(func() error {
return nw.dev.DelFDB(neighbor{IP: attrs.PublicIP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)})
}); err != nil {
log.Error("DelFDB failed: ", err)
}

if err := netlink.RouteDel(&vxlanRoute); err != nil {
if err := retry.Do(func() error {
return netlink.RouteDel(&vxlanRoute)
}); err != nil {
log.Errorf("failed to delete vxlanRoute (%s -> %s): %v", vxlanRoute.Dst, vxlanRoute.Gw, err)
}
}
}
if event.Lease.EnableIPv6 {
if v6DirectRoutingOK {
log.V(2).Infof("Removing v6 direct route to subnet: %s PublicIP: %s", sn, attrs.PublicIPv6)
if err := netlink.RouteDel(&directRoute); err != nil {
if err := retry.Do(func() error {
return netlink.RouteDel(&directRoute)
}); err != nil {
log.Errorf("Error deleting v6 route to %v via %v: %v", v6Sn, attrs.PublicIPv6, err)
}
} else {
log.V(2).Infof("removing v6subnet: %s PublicIPv6: %s VtepMAC: %s", v6Sn, attrs.PublicIPv6, net.HardwareAddr(v6VxlanAttrs.VtepMAC))

// Try to remove all entries - don't bail out if one of them fails.
if err := nw.v6Dev.DelV6ARP(neighbor{IP6: v6Sn.IP, MAC: net.HardwareAddr(v6VxlanAttrs.VtepMAC)}); err != nil {
if err := retry.Do(func() error {
return nw.v6Dev.DelV6ARP(neighbor{IP6: v6Sn.IP, MAC: net.HardwareAddr(v6VxlanAttrs.VtepMAC)})
}); err != nil {
log.Error("DelV6ARP failed: ", err)
}

if err := nw.v6Dev.DelV6FDB(neighbor{IP6: attrs.PublicIPv6, MAC: net.HardwareAddr(v6VxlanAttrs.VtepMAC)}); err != nil {
if err := retry.Do(func() error {
return nw.v6Dev.DelV6FDB(neighbor{IP6: attrs.PublicIPv6, MAC: net.HardwareAddr(v6VxlanAttrs.VtepMAC)})
}); err != nil {
log.Error("DelV6FDB failed: ", err)
}

if err := netlink.RouteDel(&v6VxlanRoute); err != nil {
if err := retry.Do(func() error {
return netlink.RouteDel(&v6VxlanRoute)
}); err != nil {
log.Errorf("failed to delete v6 vxlanRoute (%s -> %s): %v", v6VxlanRoute.Dst, v6VxlanRoute.Gw, err)
}
}
Expand Down
33 changes: 33 additions & 0 deletions pkg/retry/retry.go
@@ -0,0 +1,33 @@
// Copyright 2023 flannel authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package retry

import (
log "k8s.io/klog"

retry_v4 "github.com/avast/retry-go/v4"
)

// Do executes f until it does not return an error
// By default, the number of attempts is 10 with increasing delay between each
func Do(f func() error) error {

return retry_v4.Do(f,
retry_v4.OnRetry(func(n uint, err error) {
log.Errorf("#%d: %s\n", n, err)
}),
)

}