Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

High-scale IPcache: Chapter 5 #25601

Merged
merged 6 commits into from
May 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
23 changes: 11 additions & 12 deletions bpf/bpf_host.c
Original file line number Diff line number Diff line change
Expand Up @@ -1209,6 +1209,8 @@ handle_srv6(struct __ctx_buff *ctx)
__section("from-netdev")
int cil_from_netdev(struct __ctx_buff *ctx)
{
__u32 __maybe_unused src_id = 0;

#ifdef ENABLE_NODEPORT_ACCELERATION
__u32 flags = ctx_get_xfer(ctx, XFER_FLAGS);
#ifdef HAVE_ENCAP
Expand Down Expand Up @@ -1288,6 +1290,15 @@ int cil_from_netdev(struct __ctx_buff *ctx)
#endif
#endif

#ifdef ENABLE_HIGH_SCALE_IPCACHE
ret = decapsulate_overlay(ctx, &src_id);
if (IS_ERR(ret))
return send_drop_notify_error(ctx, src_id, ret, CTX_ACT_DROP,
METRIC_INGRESS);
if (ret == CTX_ACT_REDIRECT)
return ret;
#endif /* ENABLE_HIGH_SCALE_IPCACHE */

return handle_netdev(ctx, false);

drop_err:
Expand All @@ -1301,18 +1312,6 @@ int cil_from_netdev(struct __ctx_buff *ctx)
__section("from-host")
int cil_from_host(struct __ctx_buff *ctx)
{
#ifdef ENABLE_HIGH_SCALE_IPCACHE
__u32 src_id = 0;
int ret;

ret = decapsulate_overlay(ctx, &src_id);
if (IS_ERR(ret))
send_drop_notify_error(ctx, src_id, ret, CTX_ACT_DROP,
METRIC_INGRESS);
if (ret == CTX_ACT_REDIRECT)
return ret;
#endif /* ENABLE_HIGH_SCALE_IPCACHE */

/* Traffic from the host ns going through cilium_host device must
* not be subject to EDT rate-limiting.
*/
Expand Down
31 changes: 30 additions & 1 deletion bpf/bpf_lxc.c
Original file line number Diff line number Diff line change
Expand Up @@ -2184,7 +2184,6 @@ int cil_to_container(struct __ctx_buff *ctx)
}
#endif /* ENABLE_HOST_FIREWALL && !ENABLE_ROUTING */

ctx_store_meta(ctx, CB_SRC_LABEL, identity);

switch (proto) {
#if defined(ENABLE_ARP_PASSTHROUGH) || defined(ENABLE_ARP_RESPONDER)
Expand All @@ -2194,12 +2193,42 @@ int cil_to_container(struct __ctx_buff *ctx)
#endif
#ifdef ENABLE_IPV6
case bpf_htons(ETH_P_IPV6):
# ifdef ENABLE_HIGH_SCALE_IPCACHE
if (identity == WORLD_ID) {
struct endpoint_info *ep;
void *data, *data_end;
struct ipv6hdr *ip6;

if (!revalidate_data(ctx, &data, &data_end, &ip6))
return DROP_INVALID;

ep = __lookup_ip6_endpoint((union v6addr *)&ip6->saddr);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could be a stupid question: should it be ip6->daddr?

If the traffic comes from a remote node, the local endpoint map doesn't have an entry for the saddr, right? So this lookup can only be useful for local-to-local traffic. In that case, could we perform the policy check in from-container's handle_ipv4_from_lxc, where we can get the dst_sec_id from the local endpoint map if the ipcache doesn't return an entry:

cilium/bpf/bpf_lxc.c

Lines 832 to 834 in 4535f13

info = lookup_ip4_remote_endpoint(ip4->daddr, cluster_id);
if (info && info->sec_identity) {
*dst_sec_identity = info->sec_identity;

if (ep)
identity = ep->sec_id;
}
# endif /* ENABLE_HIGH_SCALE_IPCACHE */
ctx_store_meta(ctx, CB_SRC_LABEL, identity);
ep_tail_call(ctx, CILIUM_CALL_IPV6_CT_INGRESS);
ret = DROP_MISSED_TAIL_CALL;
break;
#endif /* ENABLE_IPV6 */
#ifdef ENABLE_IPV4
case bpf_htons(ETH_P_IP):
# ifdef ENABLE_HIGH_SCALE_IPCACHE
if (identity == WORLD_ID) {
struct endpoint_info *ep;
void *data, *data_end;
struct iphdr *ip4;

if (!revalidate_data(ctx, &data, &data_end, &ip4))
return DROP_INVALID;

ep = __lookup_ip4_endpoint(ip4->saddr);
if (ep)
identity = ep->sec_id;
}
# endif /* ENABLE_HIGH_SCALE_IPCACHE */
ctx_store_meta(ctx, CB_SRC_LABEL, identity);
ep_tail_call(ctx, CILIUM_CALL_IPV4_CT_INGRESS);
ret = DROP_MISSED_TAIL_CALL;
break;
Expand Down
5 changes: 3 additions & 2 deletions bpf/lib/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,11 +290,12 @@ struct tunnel_value {
struct endpoint_info {
__u32 ifindex;
__u16 unused; /* used to be sec_label, no longer used */
__u16 lxc_id;
__u16 lxc_id;
__u32 flags;
mac_t mac;
mac_t node_mac;
__u32 pad[4];
__u32 sec_id;
__u32 pad[3];
};

struct edt_id {
Expand Down
7 changes: 1 addition & 6 deletions bpf/lib/encap.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,12 +171,7 @@ encap_and_redirect_lxc(struct __ctx_buff *ctx,
struct tunnel_value *tunnel __maybe_unused;

#ifdef ENABLE_HIGH_SCALE_IPCACHE
/* If the destination doesn't match one of the world CIDRs, we assume
* it's destined to a remote pod. In that case, since the high-scale
* ipcache is enabled, we want to encapsulate with the remote pod's IP
* itself.
*/
if (!world_cidrs_lookup4(dst_ip))
if (needs_encapsulation(dst_ip))
return __encap_and_redirect_with_nodeid(ctx, src_ip, dst_ip,
seclabel, dstid,
NOT_VTEP_DST, trace);
Expand Down
21 changes: 21 additions & 0 deletions bpf/lib/high_scale_ipcache.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,27 @@ world_cidrs_lookup4(__u32 addr)
return matches != NULL;
}

/* Decide whether traffic towards the IPv4 address @addr has to be
 * encapsulated when the high-scale ipcache is in use.
 *
 * Returns false when @addr is a local endpoint (only checked when
 * ENABLE_ROUTING is not defined) or when it matches one of the world CIDRs;
 * returns true otherwise, i.e. when the destination is assumed to be a
 * remote pod.
 */
static __always_inline bool
needs_encapsulation(__u32 addr)
{
	bool is_world;

# ifndef ENABLE_ROUTING
	/* With endpoint routes enabled, a packet for a local endpoint can get
	 * this far, and such a packet must not be encapsulated. Without
	 * endpoint routes this check is unnecessary: the packet is redirected
	 * to the destination endpoint directly and never reaches this point.
	 */
	if (__lookup_ip4_endpoint(addr))
		return false;
# endif /* !ENABLE_ROUTING */

	/* Anything outside the world CIDRs is assumed to be a remote pod.
	 * Because the high-scale ipcache is enabled, such traffic is
	 * encapsulated using the remote pod's own IP.
	 */
	is_world = world_cidrs_lookup4(addr);
	if (is_world)
		return false;

	return true;
}

static __always_inline int
decapsulate_overlay(struct __ctx_buff *ctx, __u32 *src_id)
{
Expand Down
15 changes: 11 additions & 4 deletions daemon/cmd/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -1018,10 +1018,17 @@ func newDaemon(ctx context.Context, cleaner *daemonCleanup, params *daemonParams
return nil, nil, fmt.Errorf("BPF ip-masq-agent needs kernel 4.16 or newer")
}
}
if option.Config.EnableHostFirewall && len(option.Config.GetDevices()) == 0 {
msg := "host firewall's external facing device could not be determined. Use --%s to specify."
log.WithError(err).Errorf(msg, option.Devices)
return nil, nil, fmt.Errorf(msg, option.Devices)
if len(option.Config.GetDevices()) == 0 {
if option.Config.EnableHostFirewall {
msg := "Host firewall's external facing device could not be determined. Use --%s to specify."
log.WithError(err).Errorf(msg, option.Devices)
return nil, nil, fmt.Errorf(msg, option.Devices)
}
if option.Config.EnableHighScaleIPcache {
msg := "External facing device for high-scale IPcache could not be determined. Use --%s to specify."
log.WithError(err).Errorf(msg, option.Devices)
return nil, nil, fmt.Errorf(msg, option.Devices)
}
}
if option.Config.EnableSCTP {
if probes.HaveLargeInstructionLimit() != nil {
Expand Down
13 changes: 9 additions & 4 deletions pkg/maps/lxcmap/lxcmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"unsafe"

"github.com/cilium/cilium/pkg/bpf"
"github.com/cilium/cilium/pkg/identity"
"github.com/cilium/cilium/pkg/mac"
"github.com/cilium/cilium/pkg/option"
)
Expand Down Expand Up @@ -62,6 +63,7 @@ type EndpointFrontend interface {
GetID() uint64
IPv4Address() netip.Addr
IPv6Address() netip.Addr
GetIdentity() identity.NumericIdentity
}

// GetBPFKeys returns all keys which should represent this endpoint in the BPF
Expand Down Expand Up @@ -101,16 +103,17 @@ func GetBPFValue(e EndpointFrontend) (*EndpointInfo, error) {
LxcID: uint16(e.GetID()),
MAC: mac,
NodeMAC: nodeMAC,
SecID: e.GetIdentity().Uint32(),
}

return info, nil

}

type pad4uint32 [4]uint32
type pad3uint32 [3]uint32

// DeepCopyInto is a deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *pad4uint32) DeepCopyInto(out *pad4uint32) {
func (in *pad3uint32) DeepCopyInto(out *pad3uint32) {
copy(out[:], in[:])
return
}
Expand All @@ -129,7 +132,8 @@ type EndpointInfo struct {
_ uint32
MAC mac.Uint64MAC `align:"mac"`
NodeMAC mac.Uint64MAC `align:"node_mac"`
Pad pad4uint32 `align:"pad"`
SecID uint32 `align:"sec_id"`
Pad pad3uint32 `align:"pad"`
}

// GetValuePtr returns the unsafe pointer to the BPF value
Expand Down Expand Up @@ -164,8 +168,9 @@ func (v *EndpointInfo) String() string {
return "(localhost)"
}

return fmt.Sprintf("id=%-5d flags=0x%04X ifindex=%-3d mac=%s nodemac=%s",
return fmt.Sprintf("id=%-5d sec_id=%-5d flags=0x%04X ifindex=%-3d mac=%s nodemac=%s",
v.LxcID,
v.SecID,
v.Flags,
v.IfIndex,
v.MAC,
Expand Down
5 changes: 3 additions & 2 deletions pkg/option/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -2529,7 +2529,8 @@ func (c *DaemonConfig) TunnelExists() bool {
// AreDevicesRequired returns true if the agent needs to attach to the native
// devices to implement some features.
func (c *DaemonConfig) AreDevicesRequired() bool {
return c.EnableNodePort || c.EnableHostFirewall || c.EnableBandwidthManager || c.EnableWireguard
return c.EnableNodePort || c.EnableHostFirewall || c.EnableBandwidthManager ||
c.EnableWireguard || c.EnableHighScaleIPcache
}

// MasqueradingEnabled returns true if either IPv4 or IPv6 masquerading is enabled.
Expand Down Expand Up @@ -4124,7 +4125,7 @@ func EndpointStatusValuesMap() (values map[string]struct{}) {
// place.
func MightAutoDetectDevices() bool {
devices := Config.GetDevices()
return ((Config.EnableHostFirewall || Config.EnableWireguard) && len(devices) == 0) ||
return ((Config.EnableHostFirewall || Config.EnableWireguard || Config.EnableHighScaleIPcache) && len(devices) == 0) ||
(Config.KubeProxyReplacement != KubeProxyReplacementDisabled &&
(len(devices) == 0 || Config.DirectRoutingDevice == ""))
}
Expand Down
12 changes: 6 additions & 6 deletions test/k8s/datapath_configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -479,15 +479,15 @@ var _ = Describe("K8sDatapathConfig", func() {
_ = kubectl.Delete(hsIPcacheYAML)
})

testHighScaleIPcache := func(tunnelProto string) {
testHighScaleIPcache := func(tunnelProto string, epRoutesConfig string) {
options := map[string]string{
"highScaleIPcache.enabled": "true",
"routingMode": "native",
"bpf.monitorAggregation": "none",
"devices": "",
"ipv6.enabled": "false",
"wellKnownIdentities.enabled": "true",
"tunnelProtocol": tunnelProto,
"endpointRoutes.enabled": epRoutesConfig,
}
if !helpers.RunsOnGKE() {
options["autoDirectNodeRoutes"] = "true"
Expand All @@ -509,12 +509,12 @@ var _ = Describe("K8sDatapathConfig", func() {
Expect(err).ToNot(HaveOccurred(), "Client pods not ready after timeout")
}

It("Test ingress policy enforcement with VXLAN", func() {
testHighScaleIPcache("vxlan")
It("Test ingress policy enforcement with VXLAN and no endpoint routes", func() {
testHighScaleIPcache("vxlan", "false")
})

It("Test ingress policy enforcement with GENEVE", func() {
testHighScaleIPcache("geneve")
It("Test ingress policy enforcement with GENEVE and endpoint routes", func() {
testHighScaleIPcache("geneve", "true")
})
})

Expand Down