-
Notifications
You must be signed in to change notification settings - Fork 2.9k
/
vxlan.go
240 lines (216 loc) · 8.1 KB
/
vxlan.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
// Copyright 2015 flannel authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build !windows
// +build !windows
package vxlan
// Some design notes and history:
// VXLAN encapsulates L2 packets (though flannel is L3 only so don't expect to be able to send L2 packets across hosts)
// The first versions of vxlan for flannel registered the flannel daemon as a handler for both "L2" and "L3" misses
// - When a container sends a packet to a new IP address on the flannel network (but on a different host) this generates
// an L2 miss (i.e. an ARP lookup)
// - The flannel daemon knows which flannel host the packet is destined for so it can supply the VTEP MAC to use.
// This is stored in the ARP table (with a timeout) to avoid constantly looking it up.
// - The packet can then be encapsulated but the host needs to know where to send it. This creates another callout from
// the kernel vxlan code to the flannel daemon to get the public IP that should be used for that VTEP (this gets called
// an L3 miss). The L2/L3 miss hooks are registered when the vxlan device is created. At the same time a device route
// is created to the whole flannel network so that non-local traffic is sent over the vxlan device.
//
// In this scheme the scaling of table entries (per host) is:
// - 1 route (for the configured network out the vxlan device)
// - One arp entry for each remote container that this host has recently contacted
// - One FDB entry for each remote host
//
// The second version of flannel vxlan removed the need for the L3MISS callout. When a new remote host is found (either
// during startup or when it's created), flannel simply adds the required entries so that no further lookup/callout is required.
//
//
// The latest version of the vxlan backend removes the need for the L2MISS too, which means that the flannel daemon is not
// listening for any netlink messages anymore. This improves reliability (no problems with timeouts if
// flannel crashes or restarts) and simplifies upgrades.
//
// How it works:
// Create the vxlan device but don't register for any L2MISS or L3MISS messages
// Then, as each remote host is discovered (either on startup or when they are added), do the following
// 1) Create routing table entry for the remote subnet. It goes via the vxlan device but also specifies a next hop (of the remote flannel host).
// 2) Create a static ARP entry for the remote flannel host IP address (and the VTEP MAC)
// 3) Create an FDB entry with the VTEP MAC and the public IP of the remote flannel daemon.
//
// In this scheme the scaling of table entries is linear to the number of remote hosts - 1 route, 1 arp entry and 1 FDB entry per host
//
// In this newest scheme, there is also the option of skipping the use of vxlan for hosts that are on the same subnet,
// this is called "directRouting"
import (
"encoding/json"
"fmt"
"net"
"sync"
"github.com/flannel-io/flannel/pkg/backend"
"github.com/flannel-io/flannel/pkg/ip"
"github.com/flannel-io/flannel/pkg/lease"
"github.com/flannel-io/flannel/pkg/subnet"
"golang.org/x/net/context"
log "k8s.io/klog/v2"
)
// init registers this backend with the backend package under the name "vxlan",
// using New as its constructor.
func init() {
	backend.Register("vxlan", New)
}
// defaultVNI is the VNI used by RegisterNetwork when the backend
// configuration does not specify one.
const defaultVNI = 1
// VXLANBackend is the backend registered under the name "vxlan". It holds the
// dependencies that RegisterNetwork needs to create the vxlan devices and to
// acquire a subnet lease.
type VXLANBackend struct {
	// subnetMgr is the subnet manager used to acquire the lease for this host.
	subnetMgr subnet.Manager
	// extIface describes the external interface; its MTU, index and addresses
	// are used when creating the vxlan devices and the lease attributes.
	extIface *backend.ExternalInterface
}
// New builds a VXLANBackend around the given subnet manager and external
// interface. It performs no validation or I/O and always returns a nil error.
func New(sm subnet.Manager, extIface *backend.ExternalInterface) (backend.Backend, error) {
	return &VXLANBackend{subnetMgr: sm, extIface: extIface}, nil
}
// newSubnetAttrs assembles the lease attributes for a "vxlan" backend lease.
//
// For each address family the public IP and the JSON-encoded backend data
// (VNI plus the MAC address of the corresponding vxlan device) are filled in
// only when both the public address and the device are non-nil; otherwise the
// fields for that family are left at their zero values.
//
// It returns an error only if JSON encoding of the backend data fails.
func newSubnetAttrs(publicIP net.IP, publicIPv6 net.IP, vnid uint16, dev, v6Dev *vxlanDevice) (*lease.LeaseAttrs, error) {
	// encodeVTEP serializes the VNI together with the MAC address of d.
	encodeVTEP := func(d *vxlanDevice) (json.RawMessage, error) {
		raw, err := json.Marshal(&vxlanLeaseAttrs{
			VNI:     vnid,
			VtepMAC: hardwareAddr(d.MACAddr()),
		})
		if err != nil {
			return nil, err
		}
		return json.RawMessage(raw), nil
	}

	attrs := &lease.LeaseAttrs{BackendType: "vxlan"}

	if dev != nil && publicIP != nil {
		v4Data, err := encodeVTEP(dev)
		if err != nil {
			return nil, err
		}
		attrs.PublicIP = ip.FromIP(publicIP)
		attrs.BackendData = v4Data
	}

	if v6Dev != nil && publicIPv6 != nil {
		v6Data, err := encodeVTEP(v6Dev)
		if err != nil {
			return nil, err
		}
		attrs.PublicIPv6 = ip.FromIP6(publicIPv6)
		attrs.BackendV6Data = v6Data
	}

	return attrs, nil
}
// RegisterNetwork sets up the vxlan backend for this host and returns the
// resulting backend.Network.
//
// Steps, in order:
//  1. Decode the optional backend JSON from config.Backend on top of the
//     defaults (VNI = defaultVNI, MTU = MTU of the external interface).
//  2. Create the "flannel.<VNI>" device when IPv4 is enabled and the
//     "flannel-v6.<VNI>" device when IPv6 is enabled.
//  3. Acquire a lease from the subnet manager, advertising the VNI and the
//     MAC address of each created device.
//  4. Configure each device with a single-host address (/32 or /128) taken
//     from the leased subnet.
//
// ctx is passed to the lease acquisition; context.Canceled and
// context.DeadlineExceeded are returned unwrapped so callers can compare them
// directly. All other failures are wrapped with %w so the underlying cause
// stays inspectable with errors.Is / errors.As. wg is not used by this
// function.
func (be *VXLANBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup, config *subnet.Config) (backend.Network, error) {
	// Parse our configuration, starting from the defaults.
	cfg := struct {
		VNI           int
		Port          int
		MTU           int
		GBP           bool
		Learning      bool
		DirectRouting bool
	}{
		VNI: defaultVNI,
		MTU: be.extIface.Iface.MTU,
	}

	if len(config.Backend) > 0 {
		if err := json.Unmarshal(config.Backend, &cfg); err != nil {
			return nil, fmt.Errorf("error decoding VXLAN backend config: %w", err)
		}
	}
	log.Infof("VXLAN config: VNI=%d Port=%d GBP=%v Learning=%v DirectRouting=%v", cfg.VNI, cfg.Port, cfg.GBP, cfg.Learning, cfg.DirectRouting)

	var dev, v6Dev *vxlanDevice
	var err error
	if config.EnableIPv4 {
		devAttrs := vxlanDeviceAttrs{
			vni:       uint32(cfg.VNI),
			name:      fmt.Sprintf("flannel.%v", cfg.VNI),
			MTU:       cfg.MTU,
			vtepIndex: be.extIface.Iface.Index,
			vtepAddr:  be.extIface.IfaceAddr,
			vtepPort:  cfg.Port,
			gbp:       cfg.GBP,
			learning:  cfg.Learning,
		}

		dev, err = newVXLANDevice(&devAttrs)
		if err != nil {
			return nil, err
		}
		dev.directRouting = cfg.DirectRouting
	}
	if config.EnableIPv6 {
		v6DevAttrs := vxlanDeviceAttrs{
			vni:       uint32(cfg.VNI),
			name:      fmt.Sprintf("flannel-v6.%v", cfg.VNI),
			MTU:       cfg.MTU,
			vtepIndex: be.extIface.Iface.Index,
			vtepAddr:  be.extIface.IfaceV6Addr,
			vtepPort:  cfg.Port,
			gbp:       cfg.GBP,
			learning:  cfg.Learning,
		}

		v6Dev, err = newVXLANDevice(&v6DevAttrs)
		if err != nil {
			return nil, err
		}
		v6Dev.directRouting = cfg.DirectRouting
	}

	// NOTE(review): the device is created with uint32(cfg.VNI) but the lease
	// advertises uint16(cfg.VNI), so a VNI above 65535 would be truncated in
	// the lease attributes — confirm whether such VNIs need to be supported.
	subnetAttrs, err := newSubnetAttrs(be.extIface.ExtAddr, be.extIface.ExtV6Addr, uint16(cfg.VNI), dev, v6Dev)
	if err != nil {
		return nil, err
	}

	// Named nodeLease (not lease) so the imported lease package is not shadowed.
	nodeLease, err := be.subnetMgr.AcquireLease(ctx, subnetAttrs)
	switch err {
	case nil:
	case context.Canceled, context.DeadlineExceeded:
		// Pass cancellation through untouched so callers can match on it.
		return nil, err
	default:
		return nil, fmt.Errorf("failed to acquire lease: %w", err)
	}

	// Ensure that the device has a /32 address so that no broadcast routes are created.
	// This IP is just used as a source address for host to workload traffic (so
	// the return path for the traffic has an address on the flannel network to use as the destination)
	if config.EnableIPv4 {
		flannelNet, err := config.GetFlannelNetwork(&nodeLease.Subnet)
		if err != nil {
			return nil, err
		}
		if err := dev.Configure(ip.IP4Net{IP: nodeLease.Subnet.IP, PrefixLen: 32}, flannelNet); err != nil {
			return nil, fmt.Errorf("failed to configure interface %s: %w", dev.link.Attrs().Name, err)
		}
	}
	// Same as above for IPv6, using a /128 address.
	if config.EnableIPv6 {
		flannelV6Net, err := config.GetFlannelIPv6Network(&nodeLease.IPv6Subnet)
		if err != nil {
			return nil, err
		}
		if err := v6Dev.ConfigureIPv6(ip.IP6Net{IP: nodeLease.IPv6Subnet.IP, PrefixLen: 128}, flannelV6Net); err != nil {
			return nil, fmt.Errorf("failed to configure interface %s: %w", v6Dev.link.Attrs().Name, err)
		}
	}

	return newNetwork(be.subnetMgr, be.extIface, dev, v6Dev, ip.IP4Net{}, nodeLease, cfg.MTU)
}
// hardwareAddr is a net.HardwareAddr that can be encoded to and decoded from
// JSON as a quoted string in the textual form produced by
// net.HardwareAddr.String and accepted by net.ParseMAC.
type hardwareAddr net.HardwareAddr

// MarshalJSON encodes the address as a double-quoted string. It never returns
// a non-nil error; an empty address encodes as `""`.
func (hw hardwareAddr) MarshalJSON() ([]byte, error) {
	quoted := fmt.Sprintf("%q", net.HardwareAddr(hw).String())
	return []byte(quoted), nil
}

// UnmarshalJSON decodes a double-quoted MAC address into hw.
//
// The input must begin and end with a double quote; anything else (including
// a JSON null) is rejected. The text between the quotes is handed to
// net.ParseMAC and any error from it is returned as is. hw is only modified
// on success.
func (hw *hardwareAddr) UnmarshalJSON(bytes []byte) error {
	n := len(bytes)
	isQuoted := n >= 2 && bytes[0] == '"' && bytes[n-1] == '"'
	if !isQuoted {
		return fmt.Errorf("error parsing hardware addr")
	}

	parsed, err := net.ParseMAC(string(bytes[1 : n-1]))
	if err != nil {
		return err
	}

	*hw = hardwareAddr(parsed)
	return nil
}