bpf: Do not pre-allocate BPF maps by default #6357

Merged: 5 commits, Jan 10, 2019
Changes from 4 commits
1 change: 1 addition & 0 deletions Documentation/cmdref/cilium-agent.md
@@ -79,6 +79,7 @@ cilium-agent [flags]
--mtu int Overwrite auto-detected MTU of underlying network
--nat46-range string IPv6 prefix to map IPv4 addresses to (default "0:0:0:0:0:FFFF::/96")
--pprof Enable serving the pprof debugging API
--preallocate-bpf-maps Enable BPF map pre-allocation (default true)
--prefilter-device string Device facing external network for XDP prefiltering (default "undefined")
--prefilter-mode string Prefilter mode { native | generic } (default: native) (default "native")
--prepend-iptables-chains Prepend custom iptables chains instead of appending (default true)
12 changes: 12 additions & 0 deletions bpf/bpf_lxc.c
@@ -63,6 +63,9 @@ struct bpf_elf_map __section_maps CT_MAP_TCP6 = {
.size_value = sizeof(struct ct_entry),
.pinning = PIN_GLOBAL_NS,
.max_elem = CT_MAP_SIZE_TCP,
#ifndef HAVE_LRU_MAP_TYPE
.flags = CONDITIONAL_PREALLOC,
#endif
};

struct bpf_elf_map __section_maps CT_MAP_ANY6 = {
@@ -71,6 +74,9 @@ struct bpf_elf_map __section_maps CT_MAP_ANY6 = {
.size_value = sizeof(struct ct_entry),
.pinning = PIN_GLOBAL_NS,
.max_elem = CT_MAP_SIZE_ANY,
#ifndef HAVE_LRU_MAP_TYPE
.flags = CONDITIONAL_PREALLOC,
#endif
};

static inline struct bpf_elf_map *
@@ -90,6 +96,9 @@ struct bpf_elf_map __section_maps CT_MAP_TCP4 = {
.size_value = sizeof(struct ct_entry),
.pinning = PIN_GLOBAL_NS,
.max_elem = CT_MAP_SIZE_TCP,
#ifndef HAVE_LRU_MAP_TYPE
.flags = CONDITIONAL_PREALLOC,
#endif
};

struct bpf_elf_map __section_maps CT_MAP_ANY4 = {
@@ -98,6 +107,9 @@ struct bpf_elf_map __section_maps CT_MAP_ANY4 = {
.size_value = sizeof(struct ct_entry),
.pinning = PIN_GLOBAL_NS,
.max_elem = CT_MAP_SIZE_ANY,
#ifndef HAVE_LRU_MAP_TYPE
.flags = CONDITIONAL_PREALLOC,
#endif
};

static inline struct bpf_elf_map *
6 changes: 6 additions & 0 deletions bpf/lib/common.h
@@ -40,6 +40,12 @@
#define EVENT_SOURCE 0
#endif

#ifdef PREALLOCATE_MAPS
#define CONDITIONAL_PREALLOC 0
#else
#define CONDITIONAL_PREALLOC BPF_F_NO_PREALLOC
#endif

#define __inline__ __attribute__((always_inline))

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
6 changes: 6 additions & 0 deletions bpf/lib/lb.h
@@ -42,6 +42,7 @@ struct bpf_elf_map __section_maps cilium_lb6_reverse_nat = {
.size_value = sizeof(struct lb6_reverse_nat),
.pinning = PIN_GLOBAL_NS,
.max_elem = CILIUM_LB_MAP_MAX_ENTRIES,
.flags = CONDITIONAL_PREALLOC,
};

struct bpf_elf_map __section_maps cilium_lb6_services = {
@@ -50,6 +51,7 @@ struct bpf_elf_map __section_maps cilium_lb6_services = {
.size_value = sizeof(struct lb6_service),
.pinning = PIN_GLOBAL_NS,
.max_elem = CILIUM_LB_MAP_MAX_ENTRIES,
.flags = CONDITIONAL_PREALLOC,
};

struct bpf_elf_map __section_maps cilium_lb6_rr_seq = {
@@ -58,6 +60,7 @@ struct bpf_elf_map __section_maps cilium_lb6_rr_seq = {
.size_value = sizeof(struct lb_sequence),
.pinning = PIN_GLOBAL_NS,
.max_elem = CILIUM_LB_MAP_MAX_FE,
.flags = CONDITIONAL_PREALLOC,
};
#endif /* ENABLE_IPV6 */

@@ -68,6 +71,7 @@ struct bpf_elf_map __section_maps cilium_lb4_reverse_nat = {
.size_value = sizeof(struct lb4_reverse_nat),
.pinning = PIN_GLOBAL_NS,
.max_elem = CILIUM_LB_MAP_MAX_ENTRIES,
.flags = CONDITIONAL_PREALLOC,
};

struct bpf_elf_map __section_maps cilium_lb4_services = {
@@ -76,6 +80,7 @@ struct bpf_elf_map __section_maps cilium_lb4_services = {
.size_value = sizeof(struct lb4_service),
.pinning = PIN_GLOBAL_NS,
.max_elem = CILIUM_LB_MAP_MAX_ENTRIES,
.flags = CONDITIONAL_PREALLOC,
};

struct bpf_elf_map __section_maps cilium_lb4_rr_seq = {
@@ -84,6 +89,7 @@ struct bpf_elf_map __section_maps cilium_lb4_rr_seq = {
.size_value = sizeof(struct lb_sequence),
.pinning = PIN_GLOBAL_NS,
.max_elem = CILIUM_LB_MAP_MAX_FE,
.flags = CONDITIONAL_PREALLOC,
};
#endif /* ENABLE_IPV4 */

6 changes: 6 additions & 0 deletions bpf/lib/maps.h
@@ -30,6 +30,7 @@ struct bpf_elf_map __section_maps cilium_lxc = {
.size_value = sizeof(struct endpoint_info),
.pinning = PIN_GLOBAL_NS,
.max_elem = ENDPOINTS_MAP_SIZE,
.flags = CONDITIONAL_PREALLOC,
};

struct bpf_elf_map __section_maps cilium_metrics = {
@@ -38,6 +39,7 @@ struct bpf_elf_map __section_maps cilium_metrics = {
.size_value = sizeof(struct metrics_value),
.pinning = PIN_GLOBAL_NS,
.max_elem = METRICS_MAP_SIZE,
.flags = CONDITIONAL_PREALLOC,
};

/* Global map to jump into policy enforcement of receiving endpoint */
@@ -69,6 +71,7 @@ struct bpf_elf_map __section_maps POLICY_MAP = {
.size_value = sizeof(struct policy_entry),
.pinning = PIN_GLOBAL_NS,
.max_elem = POLICY_MAP_SIZE,
.flags = CONDITIONAL_PREALLOC,
};
#endif

@@ -89,6 +92,7 @@ struct bpf_elf_map __section_maps cilium_proxy4 = {
.size_value = sizeof(struct proxy4_tbl_value),
.pinning = PIN_GLOBAL_NS,
.max_elem = PROXY_MAP_SIZE,
.flags = CONDITIONAL_PREALLOC,
};
#endif /* ENABLE_IPV4 */

@@ -99,6 +103,7 @@ struct bpf_elf_map __section_maps cilium_proxy6= {
.size_value = sizeof(struct proxy6_tbl_value),
.pinning = PIN_GLOBAL_NS,
.max_elem = PROXY_MAP_SIZE,
.flags = CONDITIONAL_PREALLOC,
};
#endif /* ENABLE_IPV6 */

@@ -122,6 +127,7 @@ struct bpf_elf_map __section_maps cilium_tunnel_map = {
.size_value = sizeof(struct endpoint_key),
.pinning = PIN_GLOBAL_NS,
.max_elem = TUNNEL_ENDPOINT_MAP_SIZE,
.flags = CONDITIONAL_PREALLOC,
};

#endif
2 changes: 1 addition & 1 deletion daemon/bpf.sha
@@ -1,2 +1,2 @@
GO_BINDATA_SHA1SUM=6c7bb1b20d6162fe92e001fec0d2f573dbfd6409
GO_BINDATA_SHA1SUM=e690fbe6cd7f96ea3865db6a3935e01f917c9935
BPF_FILES=../bpf/.gitignore ../bpf/COPYING ../bpf/Makefile ../bpf/Makefile.bpf ../bpf/bpf_features.h ../bpf/bpf_lb.c ../bpf/bpf_lxc.c ../bpf/bpf_netdev.c ../bpf/bpf_overlay.c ../bpf/bpf_xdp.c ../bpf/cilium-map-migrate.c ../bpf/filter_config.h ../bpf/include/bpf/api.h ../bpf/include/elf/elf.h ../bpf/include/elf/gelf.h ../bpf/include/elf/libelf.h ../bpf/include/iproute2/bpf_elf.h ../bpf/include/linux/bpf.h ../bpf/include/linux/bpf_common.h ../bpf/include/linux/byteorder.h ../bpf/include/linux/byteorder/big_endian.h ../bpf/include/linux/byteorder/little_endian.h ../bpf/include/linux/icmp.h ../bpf/include/linux/icmpv6.h ../bpf/include/linux/if_arp.h ../bpf/include/linux/if_ether.h ../bpf/include/linux/in.h ../bpf/include/linux/in6.h ../bpf/include/linux/ioctl.h ../bpf/include/linux/ip.h ../bpf/include/linux/ipv6.h ../bpf/include/linux/perf_event.h ../bpf/include/linux/swab.h ../bpf/include/linux/tcp.h ../bpf/include/linux/type_mapper.h ../bpf/include/linux/udp.h ../bpf/init.sh ../bpf/lib/arp.h ../bpf/lib/common.h ../bpf/lib/conntrack.h ../bpf/lib/csum.h ../bpf/lib/dbg.h ../bpf/lib/drop.h ../bpf/lib/encap.h ../bpf/lib/eps.h ../bpf/lib/eth.h ../bpf/lib/events.h ../bpf/lib/icmp6.h ../bpf/lib/ipv4.h ../bpf/lib/ipv6.h ../bpf/lib/l3.h ../bpf/lib/l4.h ../bpf/lib/lb.h ../bpf/lib/lxc.h ../bpf/lib/maps.h ../bpf/lib/metrics.h ../bpf/lib/nat46.h ../bpf/lib/policy.h ../bpf/lib/trace.h ../bpf/lib/utils.h ../bpf/lib/xdp.h ../bpf/lxc_config.h ../bpf/netdev_config.h ../bpf/node_config.h ../bpf/probes/raw_change_tail.t ../bpf/probes/raw_insn.h ../bpf/probes/raw_invalidate_hash.t ../bpf/probes/raw_lpm_map.t ../bpf/probes/raw_lru_map.t ../bpf/probes/raw_main.c ../bpf/probes/raw_map_val_adj.t ../bpf/probes/raw_mark_map_val.t ../bpf/run_probes.sh ../bpf/sockops/Makefile ../bpf/sockops/bpf_redir.c ../bpf/sockops/bpf_sockops.c ../bpf/sockops/bpf_sockops.h ../bpf/sockops/sockops_config.h ../bpf/spawn_netns.sh
34 changes: 6 additions & 28 deletions daemon/daemon.go
@@ -556,34 +556,8 @@ func (d *Daemon) init() error {
return err
}

if option.Config.EnableIPv6 {
if _, err := lbmap.Service6Map.OpenOrCreate(); err != nil {
return err
}
if _, err := lbmap.RevNat6Map.OpenOrCreate(); err != nil {
return err
}
if _, err := lbmap.RRSeq6Map.OpenOrCreate(); err != nil {
return err
}
if _, err := proxymap.Proxy6Map.OpenOrCreate(); err != nil {
return err
}
}

if option.Config.EnableIPv4 {
if _, err := lbmap.Service4Map.OpenOrCreate(); err != nil {
return err
}
if _, err := lbmap.RevNat4Map.OpenOrCreate(); err != nil {
return err
}
if _, err := lbmap.RRSeq4Map.OpenOrCreate(); err != nil {
return err
}
if _, err := proxymap.Proxy4Map.OpenOrCreate(); err != nil {
return err
}
if err := openServiceMaps(); err != nil {
log.WithError(err).Fatal("Unable to open service maps")
}

[Review comment, Member]: Thanks for putting this into a separate function!

if err := d.compileBase(); err != nil {
@@ -728,6 +702,10 @@ func (d *Daemon) createNodeConfigHeaderfile() error {
fmt.Fprintf(fw, "#define POLICY_PROG_MAP_SIZE %d\n", policymap.ProgArrayMaxEntries)
fmt.Fprintf(fw, "#define SOCKOPS_MAP_SIZE %d\n", sockmap.MaxEntries)

if option.Config.PreAllocateMaps {
fmt.Fprintf(fw, "#define PREALLOCATE_MAPS\n")
}

fmt.Fprintf(fw, "#define TRACE_PAYLOAD_LEN %dULL\n", option.Config.TracePayloadlen)
fmt.Fprintf(fw, "#define MTU %d\n", d.mtuConfig.GetDeviceMTU())

7 changes: 7 additions & 0 deletions daemon/daemon_main.go
@@ -570,6 +570,9 @@ func init() {
flags.StringP(option.PrefilterMode, "", option.ModePreFilterNative, "Prefilter mode { "+option.ModePreFilterNative+" | "+option.ModePreFilterGeneric+" } (default: "+option.ModePreFilterNative+")")
option.BindEnv(option.PrefilterMode)

flags.Bool(option.PreAllocateMapsName, defaults.PreAllocateMaps, "Enable BPF map pre-allocation")
option.BindEnv(option.PreAllocateMapsName)

// We expect only one of the possible variables to be filled. The evaluation order is:
// --prometheus-serve-addr, CILIUM_PROMETHEUS_SERVE_ADDR, then PROMETHEUS_SERVE_ADDR
// The second environment variable (without the CILIUM_ prefix) is here to
@@ -713,6 +716,10 @@ func initEnv(cmd *cobra.Command) {
pprof.Enable()
}

if option.Config.PreAllocateMaps {
bpf.EnableMapPreAllocation()
}

scopedLog := log.WithFields(logrus.Fields{
logfields.Path + ".RunDir": option.Config.RunDir,
logfields.Path + ".LibDir": option.Config.LibDir,
37 changes: 36 additions & 1 deletion daemon/loadbalancer.go
@@ -1,4 +1,4 @@
// Copyright 2016-2017 Authors of Cilium
// Copyright 2016-2019 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@ import (
"github.com/cilium/cilium/pkg/loadbalancer"
"github.com/cilium/cilium/pkg/logging/logfields"
"github.com/cilium/cilium/pkg/maps/lbmap"
"github.com/cilium/cilium/pkg/maps/proxymap"
"github.com/cilium/cilium/pkg/option"
"github.com/cilium/cilium/pkg/service"

@@ -426,6 +427,40 @@ func (d *Daemon) RevNATDump() ([]loadbalancer.L3n4AddrID, error) {
return dump, nil
}

func openServiceMaps() error {
if option.Config.EnableIPv6 {
if _, err := lbmap.Service6Map.OpenOrCreate(); err != nil {
return err
}
if _, err := lbmap.RevNat6Map.OpenOrCreate(); err != nil {
return err
}
if _, err := lbmap.RRSeq6Map.OpenOrCreate(); err != nil {
return err
}
if _, err := proxymap.Proxy6Map.OpenOrCreate(); err != nil {
return err
}
}

if option.Config.EnableIPv4 {
if _, err := lbmap.Service4Map.OpenOrCreate(); err != nil {
return err
}
if _, err := lbmap.RevNat4Map.OpenOrCreate(); err != nil {
return err
}
if _, err := lbmap.RRSeq4Map.OpenOrCreate(); err != nil {
return err
}
if _, err := proxymap.Proxy4Map.OpenOrCreate(); err != nil {
return err
}
}

return nil
}

[Review comment, Member]: Only slight nit I have here: the proxy maps are unrelated to the loadbalancer code, but I think having this code in a separate function is better than how it is currently in master.

func restoreServiceIDs() {
svcMap, _, errors := lbmap.DumpServiceMapsToUserspace(true)
for _, err := range errors {
18 changes: 18 additions & 0 deletions examples/kubernetes/templates/v1/cilium-cm.yaml
@@ -70,6 +70,24 @@ data:
ct-global-max-entries-tcp: "524288"
ct-global-max-entries-other: "262144"

# Pre-allocation of map entries allows per-packet latency to be reduced, at
# the expense of up-front memory allocation for the entries in the maps. The
# default value below will minimize memory usage in the default installation;
# users who are sensitive to latency may consider setting this to "true".
#
# This option was introduced in Cilium 1.4. Cilium 1.3 and earlier ignore
# this option and behave as though it is set to "true".
#
# If this value is modified, then during the next Cilium startup the restore
# of existing endpoints and tracking of ongoing connections may be disrupted.
# This may lead to policy drops or a change in loadbalancing decisions for a
# connection for some time. Endpoints may need to be recreated to restore
# connectivity.
#
# If this option is set to "false" during an upgrade from 1.3 or earlier to
# 1.4 or later, then it may cause one-time disruptions during the upgrade.
preallocate-bpf-maps: "false"

[Review comment, Member]: This implies that the state is not able to be recovered over time? Or is it just that the applications may get into a bad state themselves, as opposed to a bad state on the Cilium side?

[Review comment, @joestringer (Member), Jan 10, 2019]: Basically Cilium completely forgets about endpoints that were previously managed, and they must be restarted in order for the new instance of Cilium to start managing them.

EDIT: The above is wrong; that was with clean-cilium-state enabled in the ConfigMap. In local upgrade testing of just the Cilium component, it surprisingly seems to update pretty seamlessly (admittedly without testing services, which I would expect to break, since loadbalancing decisions are lost). I suspect that in the upgrade test, because the etcd instance is being redeployed, some more serious problems occur during bootstrap which take some time to settle back into a good state.

# Regular expression matching compatible Istio sidecar istio-proxy
# container image names
sidecar-istio-proxy-image: "cilium/istio_proxy"
6 changes: 6 additions & 0 deletions examples/kubernetes/templates/v1/cilium-ds.yaml.sed
@@ -118,6 +118,12 @@ spec:
key: ct-global-max-entries-other
name: cilium-config
optional: true
- name: CILIUM_PREALLOCATE_BPF_MAPS
valueFrom:
configMapKeyRef:
key: preallocate-bpf-maps
name: cilium-config
optional: true
image: docker.io/cilium/cilium:__CILIUM_VERSION__
imagePullPolicy: Always
lifecycle:
28 changes: 26 additions & 2 deletions pkg/bpf/bpf.go
@@ -1,4 +1,4 @@
// Copyright 2016-2018 Authors of Cilium
// Copyright 2016-2019 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -15,11 +15,17 @@
package bpf

import (
"sync/atomic"

"github.com/cilium/cilium/pkg/logging"
"github.com/cilium/cilium/pkg/logging/logfields"
)

var log = logging.DefaultLogger.WithField(logfields.LogSubsys, "bpf")
var (
log = logging.DefaultLogger.WithField(logfields.LogSubsys, "bpf")

preAllocateMapSetting uint32 = BPF_F_NO_PREALLOC
)

const (
// BPF map type constants. Must match enum bpf_map_type from linux/bpf.h
@@ -88,3 +94,21 @@ const (
// Flag for stack_map, store build_id+offset instead of pointer
BPF_F_STACK_BUILD_ID = 1 << 5
)

// EnableMapPreAllocation enables BPF map pre-allocation on map types that
// support it.
func EnableMapPreAllocation() {
atomic.StoreUint32(&preAllocateMapSetting, 0)
}

// GetPreAllocateMapFlags returns the map flags for map which use conditional
// pre-allocation.
func GetPreAllocateMapFlags(t MapType) uint32 {
switch {
case !t.allowsPreallocation():
return BPF_F_NO_PREALLOC
case t.requiresPreallocation():
return 0
}
return atomic.LoadUint32(&preAllocateMapSetting)
}
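
Note on the two helpers above: they only choose a flag value, and the per-type methods allowsPreallocation() and requiresPreallocation() are not part of this hunk. The standalone sketch below is illustrative only (the map-type names and the preAllocFlags helper are hypothetical, not Cilium's), but it mirrors the behaviour those helpers presumably encode: LPM-trie maps must always carry BPF_F_NO_PREALLOC, LRU maps are always pre-allocated by the kernel, and ordinary hash maps follow the daemon-wide setting toggled by EnableMapPreAllocation().

```go
// Standalone sketch of the conditional pre-allocation logic; not Cilium's
// actual implementation. Map-type names and helpers here are hypothetical.
package main

import (
	"fmt"
	"sync/atomic"
)

// BPF_F_NO_PREALLOC as defined in linux/bpf.h.
const bpfFNoPrealloc uint32 = 1 << 0

type mapType int

const (
	mapTypeHash    mapType = iota // supports either mode
	mapTypeLRUHash                // kernel always pre-allocates LRU maps
	mapTypeLPMTrie                // kernel requires BPF_F_NO_PREALLOC
)

// Default matches the new Cilium behaviour: do not pre-allocate.
var preAllocSetting = bpfFNoPrealloc

// enableMapPreAllocation mirrors bpf.EnableMapPreAllocation().
func enableMapPreAllocation() { atomic.StoreUint32(&preAllocSetting, 0) }

// preAllocFlags mirrors bpf.GetPreAllocateMapFlags(): per-type kernel
// constraints win over the daemon-wide setting.
func preAllocFlags(t mapType) uint32 {
	switch t {
	case mapTypeLPMTrie:
		return bpfFNoPrealloc
	case mapTypeLRUHash:
		return 0
	}
	return atomic.LoadUint32(&preAllocSetting)
}

func main() {
	fmt.Println(preAllocFlags(mapTypeHash))    // 1 -> BPF_F_NO_PREALLOC (new default)
	enableMapPreAllocation()                   // e.g. --preallocate-bpf-maps=true
	fmt.Println(preAllocFlags(mapTypeHash))    // 0 -> pre-allocated
	fmt.Println(preAllocFlags(mapTypeLPMTrie)) // 1 -> always BPF_F_NO_PREALLOC
}
```

On the datapath side, the CONDITIONAL_PREALLOC macro in bpf/lib/common.h plays the same role at compile time: the daemon emits #define PREALLOCATE_MAPS into the generated node config header when --preallocate-bpf-maps is set, and the C map definitions pick up the matching flag.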