From 3121d2556a2d5d16deceba06bea9dcfe108d00f1 Mon Sep 17 00:00:00 2001 From: "changwoo.nam" Date: Fri, 10 Dec 2021 16:55:28 +0900 Subject: [PATCH] datapath: Enable DSR in the tunneling mode Adds DSR mode that can be used with tunl mode. In the added DSR mode, the packet is forwarded through IPIP tunl to the node where the actual Pod is located. To enable the added DSR mode, you need to add the settings below. ``` bpf-lb-mode: dsr-tunl enable-remote-node-identity: "True" ``` Fixes: #10114 Signed-off-by: changwoo.nam --- Documentation/cmdref/cilium-agent.md | 2 +- api/v1/models/kube_proxy_replacement.go | 7 +- api/v1/openapi.yaml | 1 + api/v1/server/embedded_spec.go | 12 +- bpf/bpf_host.c | 21 +++ bpf/bpf_lxc.c | 16 +- bpf/lib/common.h | 7 +- bpf/lib/lb.h | 54 ++++++- bpf/lib/nodeport.h | 189 ++++++++++++++++++++++-- cilium/cmd/bpf_lb_list.go | 4 +- daemon/cmd/daemon_main.go | 4 +- daemon/cmd/kube_proxy_replacement.go | 23 ++- pkg/datapath/linux/config/config.go | 6 +- pkg/maps/ctmap/ctmap.go | 3 +- pkg/maps/lbmap/ipv4.go | 24 +-- pkg/maps/lbmap/ipv6.go | 24 +-- pkg/maps/lbmap/lbmap.go | 12 +- pkg/maps/lbmap/types.go | 6 + pkg/maps/lbmap/zz_generated.deepcopy.go | 4 - pkg/option/config.go | 7 +- pkg/service/service.go | 10 ++ 21 files changed, 361 insertions(+), 75 deletions(-) diff --git a/Documentation/cmdref/cilium-agent.md b/Documentation/cmdref/cilium-agent.md index d5b311f551eea..e323c04854613 100644 --- a/Documentation/cmdref/cilium-agent.md +++ b/Documentation/cmdref/cilium-agent.md @@ -42,7 +42,7 @@ cilium-agent [flags] --bpf-lb-maglev-hash-seed string Maglev cluster-wide hash seed (base64 encoded) (default "JLfvgnHc2kaSUFaI") --bpf-lb-maglev-table-size uint Maglev per service backend table size (parameter M) (default 16381) --bpf-lb-map-max int Maximum number of entries in Cilium BPF lbmap (default 65536) - --bpf-lb-mode string BPF load balancing mode ("snat", "dsr", "hybrid") (default "snat") + --bpf-lb-mode string BPF load balancing mode ("snat", "dsr", "hybrid", "dsr-tunl") (default "snat") --bpf-lb-rss-ipv4-src-cidr string BPF load balancing RSS outer source IPv4 CIDR prefix for IPIP --bpf-lb-rss-ipv6-src-cidr string BPF load balancing RSS outer source IPv6 CIDR prefix for IPIP --bpf-lb-sock-hostns-only Skip socket LB for services when inside a pod namespace, in favor of service LB at the pod interface. Socket LB is still used when in the host namespace. Required by service mesh (e.g., Istio, Linkerd). diff --git a/api/v1/models/kube_proxy_replacement.go b/api/v1/models/kube_proxy_replacement.go index 47570cc627ccb..2b4436f05d09a 100644 --- a/api/v1/models/kube_proxy_replacement.go +++ b/api/v1/models/kube_proxy_replacement.go @@ -562,7 +562,7 @@ type KubeProxyReplacementFeaturesNodePort struct { LutSize int64 `json:"lutSize,omitempty"` // mode - // Enum: [SNAT DSR Hybrid] + // Enum: [SNAT DSR Hybrid DSR-Tunl] Mode string `json:"mode,omitempty"` // port max @@ -687,7 +687,7 @@ var kubeProxyReplacementFeaturesNodePortTypeModePropEnum []interface{} func init() { var res []string - if err := json.Unmarshal([]byte(`["SNAT","DSR","Hybrid"]`), &res); err != nil { + if err := json.Unmarshal([]byte(`["SNAT","DSR","Hybrid","DSR-Tunl"]`), &res); err != nil { panic(err) } for _, v := range res { @@ -705,6 +705,9 @@ const ( // KubeProxyReplacementFeaturesNodePortModeHybrid captures enum value "Hybrid" KubeProxyReplacementFeaturesNodePortModeHybrid string = "Hybrid" + + // KubeProxyReplacementFeaturesNodePortModeDSRTunl captures enum value "DSR-Tunl" + KubeProxyReplacementFeaturesNodePortModeDSRTunl string = "DSR-Tunl" ) // prop value enum diff --git a/api/v1/openapi.yaml b/api/v1/openapi.yaml index d41c92abdd482..9ee13a91f2762 100644 --- a/api/v1/openapi.yaml +++ b/api/v1/openapi.yaml @@ -1946,6 +1946,7 @@ definitions: - SNAT - DSR - Hybrid + - DSR-Tunl algorithm: type: string enum: diff --git a/api/v1/server/embedded_spec.go b/api/v1/server/embedded_spec.go index c5b330e67ba12..cf1174ded191f 100644 --- a/api/v1/server/embedded_spec.go +++ b/api/v1/server/embedded_spec.go @@ -2987,7 +2987,8 @@ func init() { "enum": [ "SNAT", "DSR", - "Hybrid" + "Hybrid", + "DSR-Tunl" ] }, "portMax": { @@ -7457,7 +7458,8 @@ func init() { "enum": [ "SNAT", "DSR", - "Hybrid" + "Hybrid", + "DSR-Tunl" ] }, "portMax": { @@ -7582,7 +7584,8 @@ func init() { "enum": [ "SNAT", "DSR", - "Hybrid" + "Hybrid", + "DSR-Tunl" ] }, "portMax": { @@ -7676,7 +7679,8 @@ func init() { "enum": [ "SNAT", "DSR", - "Hybrid" + "Hybrid", + "DSR-Tunl" ] }, "portMax": { diff --git a/bpf/bpf_host.c b/bpf/bpf_host.c index e7b2ee2f94521..456c3ca8fe772 100644 --- a/bpf/bpf_host.c +++ b/bpf/bpf_host.c @@ -842,6 +842,7 @@ do_netdev(struct __ctx_buff *ctx, __u16 proto, const bool from_host) { __u32 __maybe_unused identity = 0; __u32 __maybe_unused ipcache_srcid = 0; + __u32 __maybe_unused lb_selection_rule = 0; int ret; #ifdef ENABLE_IPSEC @@ -857,8 +858,19 @@ do_netdev(struct __ctx_buff *ctx, __u16 proto, const bool from_host) return CTX_ACT_OK; } #endif + +#ifdef ENABLE_DSR_TUNL + lb_selection_rule = ctx_load_meta(ctx, CB_LB_SELECTION_RULE); +#endif + bpf_clear_meta(ctx); +#ifdef ENABLE_DSR_TUNL + if ((lb_selection_rule & LB_LOCAL_BACKEND_ONLY) == LB_LOCAL_BACKEND_ONLY) { + ctx_store_meta(ctx, CB_LB_SELECTION_RULE, LB_LOCAL_BACKEND_ONLY); + } +#endif + if (from_host) { int trace = TRACE_FROM_HOST; bool from_proxy; @@ -969,8 +981,16 @@ handle_netdev(struct __ctx_buff *ctx, const bool from_host) __section("from-netdev") int from_netdev(struct __ctx_buff *ctx) { + int __maybe_unused ret = CTX_ACT_OK; __u32 __maybe_unused vlan_id; +#ifdef ENABLE_DSR_TUNL + ret = decap_ipip(ctx); + if (IS_ERR(ret)) { + return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP, METRIC_INGRESS); + } +#endif + /* Filter allowed vlan id's and pass them back to kernel. */ if (ctx->vlan_present) { @@ -1077,6 +1097,7 @@ int to_netdev(struct __ctx_buff *ctx __maybe_unused) #if defined(ENABLE_NODEPORT) && \ (!defined(ENABLE_DSR) || \ (defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \ + (defined(ENABLE_DSR) && defined(ENABLE_DSR_TUNL)) || \ defined(ENABLE_MASQUERADE) || \ defined(ENABLE_EGRESS_GATEWAY)) if ((ctx->mark & MARK_MAGIC_SNAT_DONE) != MARK_MAGIC_SNAT_DONE) { diff --git a/bpf/bpf_lxc.c b/bpf/bpf_lxc.c index 07cbd466a6f35..9f8abe4fb4deb 100644 --- a/bpf/bpf_lxc.c +++ b/bpf/bpf_lxc.c @@ -277,13 +277,13 @@ static __always_inline int ipv6_l3_from_lxc(struct __ctx_buff *ctx, ep_tail_call(ctx, CILIUM_CALL_IPV6_NODEPORT_REVNAT); return DROP_MISSED_TAIL_CALL; } -# ifdef ENABLE_DSR +# if defined(ENABLE_DSR) && !defined(ENABLE_DSR_TUNL) if (ct_state.dsr) { ret = xlate_dsr_v6(ctx, tuple, l4_off); if (ret != 0) return ret; } -# endif /* ENABLE_DSR */ +# endif /* ENABLE_DSR && !ENABLE_DSR_TUNL */ #endif /* ENABLE_NODEPORT */ if (ct_state.rev_nat_index) { ret = lb6_rev_nat(ctx, l4_off, &csum_off, @@ -710,13 +710,13 @@ static __always_inline int handle_ipv4_from_lxc(struct __ctx_buff *ctx, ep_tail_call(ctx, CILIUM_CALL_IPV4_NODEPORT_REVNAT); return DROP_MISSED_TAIL_CALL; } -# ifdef ENABLE_DSR +# if defined(ENABLE_DSR) && !defined(ENABLE_DSR_TUNL) if (ct_state.dsr) { ret = xlate_dsr_v4(ctx, &tuple, l4_off, has_l4_header); if (ret != 0) return ret; } -# endif /* ENABLE_DSR */ +# endif /* ENABLE_DSR && !ENABLE_DSR_TUNL */ #endif /* ENABLE_NODEPORT */ if (ct_state.rev_nat_index) { @@ -1120,7 +1120,7 @@ ipv6_policy(struct __ctx_buff *ctx, int ifindex, __u32 src_label, __u8 *reason, } if (ret == CT_NEW) { -#ifdef ENABLE_DSR +#if defined(ENABLE_DSR) && !defined(ENABLE_DSR_TUNL) { bool dsr = false; @@ -1130,7 +1130,7 @@ ipv6_policy(struct __ctx_buff *ctx, int ifindex, __u32 src_label, __u8 *reason, ct_state_new.dsr = dsr; } -#endif /* ENABLE_DSR */ +#endif /* ENABLE_DSR && !ENABLE_DSR_TUNL */ ct_state_new.src_sec_id = src_label; ct_state_new.node_port = ct_state.node_port; @@ -1422,7 +1422,7 @@ ipv4_policy(struct __ctx_buff *ctx, int ifindex, __u32 src_label, __u8 *reason, #endif /* !ENABLE_HOST_SERVICES_FULL && !DISABLE_LOOPBACK_LB */ if (ret == CT_NEW) { -#ifdef ENABLE_DSR +#if defined(ENABLE_DSR) && !defined(ENABLE_DSR_TUNL) { bool dsr = false; @@ -1432,7 +1432,7 @@ ipv4_policy(struct __ctx_buff *ctx, int ifindex, __u32 src_label, __u8 *reason, ct_state_new.dsr = dsr; } -#endif /* ENABLE_DSR */ +#endif /* ENABLE_DSR && !ENABLE_DSR_TUNL */ ct_state_new.src_sec_id = src_label; ct_state_new.node_port = ct_state.node_port; diff --git a/bpf/lib/common.h b/bpf/lib/common.h index 3129c6c139e60..1587f9dc71a89 100644 --- a/bpf/lib/common.h +++ b/bpf/lib/common.h @@ -598,6 +598,7 @@ enum { * Not used by xfrm. */ #define CB_CUSTOM_CALLS CB_CT_STATE /* Alias, non-overlapping */ +#define CB_LB_SELECTION_RULE CB_CT_STATE /* Alias, non-overlapping */ }; /* State values for NAT46 */ @@ -630,6 +631,8 @@ enum { CT_REOPENED, }; +#define LB_LOCAL_BACKEND_ONLY 0xFFA0 + /* Service flags (lb{4,6}_service->flags) */ enum { SVC_FLAG_EXTERNAL_IP = (1 << 0), /* External IPs */ @@ -736,10 +739,10 @@ struct lb6_service { __u32 affinity_timeout; /* In seconds, only for svc frontend */ }; __u16 count; + __u16 local_count; __u16 rev_nat_index; __u8 flags; __u8 flags2; - __u8 pad[2]; }; /* See lb4_backend comments */ @@ -790,10 +793,10 @@ struct lb4_service { * slots (otherwise zero). */ __u16 count; + __u16 local_count; __u16 rev_nat_index; /* Reverse NAT ID in lb4_reverse_nat */ __u8 flags; __u8 flags2; - __u8 pad[2]; }; struct lb4_backend { diff --git a/bpf/lib/lb.h b/bpf/lib/lb.h index 2cb0aad81f107..c05f66dd50090 100644 --- a/bpf/lib/lb.h +++ b/bpf/lib/lb.h @@ -9,6 +9,7 @@ #include "ipv4.h" #include "hash.h" #include "ids.h" +#include "eps.h" #ifdef ENABLE_IPV6 struct { @@ -266,7 +267,11 @@ bool lb6_svc_has_src_range_check(const struct lb6_service *svc __maybe_unused) static __always_inline bool lb_skip_l4_dnat(void) { +#ifdef ENABLE_DSR_TUNL + return false; +#else return DSR_XLATE_MODE == DSR_XLATE_FRONTEND; +#endif /* ENABLE_DSR_TUNL */ } static __always_inline @@ -1111,7 +1116,7 @@ struct lb4_service *lb4_lookup_backend_slot(struct __ctx_buff *ctx __maybe_unuse /* Backend slot 0 is always reserved for the service frontend. */ #if LB_SELECTION == LB_SELECTION_RANDOM static __always_inline __u32 -lb4_select_backend_id(struct __ctx_buff *ctx, +lb4_select_backend_id_internal(struct __ctx_buff *ctx, struct lb4_key *key, const struct ipv4_ct_tuple *tuple __maybe_unused, const struct lb4_service *svc) @@ -1123,7 +1128,7 @@ lb4_select_backend_id(struct __ctx_buff *ctx, } #elif LB_SELECTION == LB_SELECTION_MAGLEV static __always_inline __u32 -lb4_select_backend_id(struct __ctx_buff *ctx __maybe_unused, +lb4_select_backend_id_internal(struct __ctx_buff *ctx __maybe_unused, struct lb4_key *key __maybe_unused, const struct ipv4_ct_tuple *tuple, const struct lb4_service *svc) @@ -1147,6 +1152,37 @@ lb4_select_backend_id(struct __ctx_buff *ctx __maybe_unused, # error "Invalid load balancer backend selection algorithm!" #endif /* LB_SELECTION */ +static __always_inline __u16 +lb4_select_backend_id(struct __ctx_buff *ctx, + struct lb4_key *key, + const struct ipv4_ct_tuple *tuple __maybe_unused, + const struct lb4_service *svc) +{ + __u32 __maybe_unused lb_selection_rule = 0; + __u32 __maybe_unused slot = 0; + struct lb4_service __maybe_unused *be = NULL; + +#ifdef ENABLE_DSR_TUNL + if (lb4_svc_is_loadbalancer(svc) || lb4_svc_is_external_ip(svc)) { + lb_selection_rule = ctx_load_meta(ctx, CB_LB_SELECTION_RULE); + + if (svc->local_count > 0) { + slot = (get_prandom_u32() % svc->local_count) + 1; + be = lb4_lookup_backend_slot(ctx, key, slot); + if (be) { + return be->backend_id; + } + } + + if ((lb_selection_rule & LB_LOCAL_BACKEND_ONLY) == LB_LOCAL_BACKEND_ONLY) { + return 0; + } + } +#endif + + return lb4_select_backend_id_internal(ctx, key, tuple, svc); +} + static __always_inline int lb4_xlate(struct __ctx_buff *ctx, __be32 *new_daddr, __be32 *new_saddr __maybe_unused, __be32 *old_saddr __maybe_unused, __u8 nexthdr __maybe_unused, int l3_off, @@ -1316,11 +1352,16 @@ static __always_inline int lb4_local(const void *map, struct __ctx_buff *ctx, __u8 flags = tuple->flags; struct lb4_backend *backend; __u32 backend_id = 0; + bool r_skip_l3_xlate = skip_l3_xlate; int ret; #ifdef ENABLE_SESSION_AFFINITY union lb4_affinity_client_id client_id = { .client_ip = saddr, }; +#ifdef ENABLE_DSR_TUNL + bool backend_local; +#endif + #endif ret = ct_lookup4(map, tuple, ctx, l4_off, CT_SERVICE, state, &monitor); switch (ret) { @@ -1447,10 +1488,17 @@ static __always_inline int lb4_local(const void *map, struct __ctx_buff *ctx, #endif tuple->daddr = backend->address; +#ifdef ENABLE_DSR_TUNL + backend_local = __lookup_ip4_endpoint(tuple->daddr); + if (backend_local) { + r_skip_l3_xlate = false; + } +#endif + return lb_skip_l4_dnat() ? CTX_ACT_OK : lb4_xlate(ctx, &new_daddr, &new_saddr, &saddr, tuple->nexthdr, l3_off, l4_off, csum_off, key, - backend, has_l4_header, skip_l3_xlate); + backend, has_l4_header, r_skip_l3_xlate); drop_no_service: tuple->flags = flags; return DROP_NO_SERVICE; diff --git a/bpf/lib/nodeport.h b/bpf/lib/nodeport.h index 3c19817937fb7..3bffa9e3ca320 100644 --- a/bpf/lib/nodeport.h +++ b/bpf/lib/nodeport.h @@ -92,7 +92,9 @@ struct dsr_opt_v6 { static __always_inline bool nodeport_uses_dsr(__u8 nexthdr __maybe_unused) { -# if defined(ENABLE_DSR) && !defined(ENABLE_DSR_HYBRID) +# if defined(ENABLE_DSR_TUNL) + return true; +# elif defined(ENABLE_DSR) && !defined(ENABLE_DSR_HYBRID) return true; # elif defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID) if (nexthdr == IPPROTO_TCP) @@ -1297,6 +1299,47 @@ static __always_inline int dsr_set_ipip4(struct __ctx_buff *ctx, if (l3_csum_replace(ctx, l3_off + offsetof(struct iphdr, check), 0, sum, 0) < 0) return DROP_CSUM_L3; + + return 0; +} + +static __always_inline int dsr_tunl_set_ipip4(struct __ctx_buff *ctx, + const struct iphdr *ip4, + __be32 backend_addr, + __be32 l4_hint, int *ohead) +{ + __u16 tot_len = bpf_ntohs(ip4->tot_len) + sizeof(*ip4); + const int l3_off = ETH_HLEN; + __be32 sum __maybe_unused; + struct iphdr outer_ip4; + + outer_ip4 = *ip4; + outer_ip4.ihl = sizeof(struct iphdr) >> 2; + outer_ip4.version = IPVERSION; + outer_ip4.tot_len = bpf_htons(tot_len), + outer_ip4.ttl = IPDEFTTL; + outer_ip4.protocol = IPPROTO_IPIP; + outer_ip4.saddr = rss_gen_src4(ip4->saddr, l4_hint); + outer_ip4.daddr = backend_addr; + + if (dsr_is_too_big(ctx, tot_len)) { + *ohead = sizeof(*ip4); + return DROP_FRAG_NEEDED; + } + + if (ctx_adjust_hroom(ctx, sizeof(struct iphdr), BPF_ADJ_ROOM_MAC, + ctx_adjust_hroom_dsr_flags())) + return DROP_INVALID; + + sum = csum_diff(NULL, 0, &outer_ip4, sizeof(struct iphdr), 0); + if (ctx_store_bytes(ctx, l3_off, + &outer_ip4, sizeof(struct iphdr), 0) < 0) + return DROP_WRITE_ERROR; + + if (l3_csum_replace(ctx, l3_off + offsetof(struct iphdr, check), + 0, sum, 0) < 0) + return DROP_CSUM_L3; + return 0; } #elif DSR_ENCAP_MODE == DSR_ENCAP_NONE @@ -1509,24 +1552,45 @@ int tail_nodeport_ipv4_dsr(struct __ctx_buff *ctx) void *data, *data_end; int ret, ohead = 0; struct iphdr *ip4; - bool l2_hdr_required = true; + struct iphdr inner_ip4 __maybe_unused; + bool l2_hdr_required __maybe_unused = true; + __be32 node_addr __maybe_unused = 0; + __be32 backend_addr __maybe_unused = 0; + struct remote_endpoint_info *info __maybe_unused = NULL; if (!revalidate_data(ctx, &data, &data_end, &ip4)) { ret = DROP_INVALID; goto drop_err; } -#if DSR_ENCAP_MODE == DSR_ENCAP_IPIP +#if defined(ENABLE_DSR_TUNL) +# if DSR_ENCAP_MODE == DSR_ENCAP_IPIP + backend_addr = ctx_load_meta(ctx, CB_ADDR_V4); + info = lookup_ip4_remote_endpoint(backend_addr); + if (!info) { + ret = DROP_INVALID; + goto drop_err; + } + node_addr = info->tunnel_endpoint; + ret = dsr_tunl_set_ipip4(ctx, ip4, + node_addr, + ctx_load_meta(ctx, CB_HINT), &ohead); +# else +# error "Invalid load balancer DSR encapsulation mode!" +# endif +#else +# if DSR_ENCAP_MODE == DSR_ENCAP_IPIP ret = dsr_set_ipip4(ctx, ip4, ctx_load_meta(ctx, CB_ADDR_V4), ctx_load_meta(ctx, CB_HINT), &ohead); -#elif DSR_ENCAP_MODE == DSR_ENCAP_NONE +# elif DSR_ENCAP_MODE == DSR_ENCAP_NONE ret = dsr_set_opt4(ctx, ip4, - ctx_load_meta(ctx, CB_ADDR_V4), - ctx_load_meta(ctx, CB_PORT), &ohead); -#else -# error "Invalid load balancer DSR encapsulation mode!" -#endif + ctx_load_meta(ctx, CB_ADDR_V4), + ctx_load_meta(ctx, CB_PORT), &ohead); +# else +# error "Invalid load balancer DSR encapsulation mode!" +# endif +#endif /* ENABLE_DSR_TUNL */ if (unlikely(ret)) { if (dsr_fail_needs_reply(ret)) return dsr_reply_icmp4(ctx, ip4, ret, ohead); @@ -2206,5 +2270,112 @@ static __always_inline int handle_nat_fwd(struct __ctx_buff *ctx) return ret; } +#ifdef ENABLE_DSR_TUNL +static __always_inline int decap_ipv4_needed(struct __ctx_buff *ctx, bool *decap_ipv4_outer) +{ + void *data, *data_end; + struct iphdr *outer_ip4_hdr, inner_ipv4_hdr; + int ret, l3_off = ETH_HLEN, inner_ipv4_off, l4_off; + struct csum_offset csum_off = {}; + struct lb4_service *svc; + struct lb4_key key = {}; + + *decap_ipv4_outer = false; + + if (!revalidate_data(ctx, &data, &data_end, &outer_ip4_hdr)) + return DROP_INVALID; + + inner_ipv4_off = l3_off + ipv4_hdrlen(outer_ip4_hdr); + if (ctx_load_bytes(ctx, inner_ipv4_off, &inner_ipv4_hdr, + sizeof(inner_ipv4_hdr)) < 0) + return DROP_INVALID; + + l4_off = inner_ipv4_off + ipv4_hdrlen(&inner_ipv4_hdr); + ret = lb4_extract_key(ctx, &inner_ipv4_hdr, l4_off, &key, &csum_off, CT_EGRESS); + if (IS_ERR(ret)) { + if (ret == DROP_NO_SERVICE || ret == DROP_UNKNOWN_L4) + return CTX_ACT_OK; + else + return ret; + } + + svc = lb4_lookup_service(&key, false); + if (svc) { + if (lb4_svc_is_external_ip(svc) || lb4_svc_is_loadbalancer(svc)) { + *decap_ipv4_outer = true; + } + } + + return CTX_ACT_OK; +} + +static __always_inline int decap_ipv4_ipip(struct __ctx_buff *ctx, struct iphdr *iph_outer) +{ + int ret = CTX_ACT_OK; + int olen = ipv4_hdrlen(iph_outer); + bool decap_ipv4_outer = false; + struct remote_endpoint_info *info = NULL; + + if (iph_outer->protocol != IPPROTO_IPIP) { + return CTX_ACT_OK; + } + + ret = decap_ipv4_needed(ctx, &decap_ipv4_outer); + if (IS_ERR(ret)) { + return ret; + } + + if (!decap_ipv4_outer) { + return CTX_ACT_OK; + } + + info = lookup_ip4_remote_endpoint(iph_outer->saddr); + if (info != NULL && info->sec_label == REMOTE_NODE_ID) { + ctx_store_meta(ctx, CB_LB_SELECTION_RULE, LB_LOCAL_BACKEND_ONLY); + } + + if (ctx_adjust_hroom(ctx, -olen, BPF_ADJ_ROOM_MAC, + BPF_F_ADJ_ROOM_FIXED_GSO)) + return CTX_ACT_DROP; + + return CTX_ACT_OK; +} + +static __always_inline int decap_ipv4(struct __ctx_buff *ctx) +{ + struct iphdr iph_outer; + + if (ctx_load_bytes(ctx, ETH_HLEN, &iph_outer, sizeof(iph_outer)) < 0) + return CTX_ACT_OK; + + if (iph_outer.ihl != 5) { + return CTX_ACT_OK; + } + + return decap_ipv4_ipip(ctx, &iph_outer); +} + +static __always_inline int decap_ipip(struct __ctx_buff *ctx) +{ + int ret = CTX_ACT_OK; + + __u16 proto = ctx_get_protocol(ctx); + + switch (proto) { + case bpf_htons(ETH_P_IP): + return decap_ipv4(ctx); + + case bpf_htons(ETH_P_IPV6): + /* TODO - not supported yet */ + + default: + /* does not match, ignore */ + return CTX_ACT_OK; + } + + return ret; +} +#endif /* ENABLE_DSR_TUNL */ + #endif /* ENABLE_NODEPORT */ #endif /* __NODEPORT_H_ */ diff --git a/cilium/cmd/bpf_lb_list.go b/cilium/cmd/bpf_lb_list.go index 1ddb663ac7504..a908678c3693c 100644 --- a/cilium/cmd/bpf_lb_list.go +++ b/cilium/cmd/bpf_lb_list.go @@ -58,6 +58,8 @@ func dumpSVC(serviceList map[string][]string) { svc := svcKey.String() svcKey = svcKey.ToHost() revNATID := svcVal.GetRevNat() + count := svcVal.GetCount() + localCount := svcVal.GetLocalCount() backendID := svcVal.GetBackendID() flags := loadbalancer.ServiceFlags(svcVal.GetFlags()) @@ -66,7 +68,7 @@ func dumpSVC(serviceList map[string][]string) { if svcKey.IsIPv6() { ip = "[::]" } - entry = fmt.Sprintf("%s:%d (%d) [%s]", ip, 0, revNATID, flags) + entry = fmt.Sprintf("%s:%d (%d) (%d) (%d) [%s]", ip, 0, revNATID, count, localCount, flags) } else if backend, found := backendMap[backendID]; !found { entry = fmt.Sprintf("backend %d not found", backendID) } else { diff --git a/daemon/cmd/daemon_main.go b/daemon/cmd/daemon_main.go index 1e827a342644c..79e73623edb21 100644 --- a/daemon/cmd/daemon_main.go +++ b/daemon/cmd/daemon_main.go @@ -581,7 +581,7 @@ func initializeFlags() { flags.MarkHidden(option.CgroupPathMKE) option.BindEnv(option.CgroupPathMKE) - flags.String(option.NodePortMode, option.NodePortModeSNAT, "BPF NodePort mode (\"snat\", \"dsr\", \"hybrid\")") + flags.String(option.NodePortMode, option.NodePortModeSNAT, "BPF NodePort mode (\"snat\", \"dsr\", \"hybrid\", \"dsr-tunl\")") flags.MarkHidden(option.NodePortMode) option.BindEnv(option.NodePortMode) @@ -595,7 +595,7 @@ func initializeFlags() { flags.MarkHidden(option.NodePortAcceleration) option.BindEnv(option.NodePortAcceleration) - flags.String(option.LoadBalancerMode, option.NodePortModeSNAT, "BPF load balancing mode (\"snat\", \"dsr\", \"hybrid\")") + flags.String(option.LoadBalancerMode, option.NodePortModeSNAT, "BPF load balancing mode (\"snat\", \"dsr\", \"hybrid\", \"dsr-tunl\")") option.BindEnv(option.LoadBalancerMode) flags.String(option.LoadBalancerAlg, option.NodePortAlgRandom, "BPF load balancing algorithm (\"random\", \"maglev\")") diff --git a/daemon/cmd/kube_proxy_replacement.go b/daemon/cmd/kube_proxy_replacement.go index 671041f1af4c3..41869a48de40b 100644 --- a/daemon/cmd/kube_proxy_replacement.go +++ b/daemon/cmd/kube_proxy_replacement.go @@ -99,7 +99,8 @@ func initKubeProxyReplacementOptions() (bool, error) { if option.Config.NodePortMode != option.NodePortModeSNAT && option.Config.NodePortMode != option.NodePortModeDSR && - option.Config.NodePortMode != option.NodePortModeHybrid { + option.Config.NodePortMode != option.NodePortModeHybrid && + option.Config.NodePortMode != option.NodePortModeDSRTunl { return false, fmt.Errorf("Invalid value for --%s: %s", option.NodePortMode, option.Config.NodePortMode) } @@ -107,7 +108,9 @@ func initKubeProxyReplacementOptions() (bool, error) { option.Config.LoadBalancerDSRDispatch != option.DSRDispatchOption && option.Config.LoadBalancerDSRDispatch != option.DSRDispatchIPIP || option.Config.NodePortMode == option.NodePortModeHybrid && - option.Config.LoadBalancerDSRDispatch != option.DSRDispatchOption { + option.Config.LoadBalancerDSRDispatch != option.DSRDispatchOption || + option.Config.NodePortMode == option.NodePortModeDSRTunl && + option.Config.LoadBalancerDSRDispatch != option.DSRDispatchIPIP { return false, fmt.Errorf("Invalid value for --%s: %s", option.LoadBalancerDSRDispatch, option.Config.LoadBalancerDSRDispatch) } @@ -117,6 +120,12 @@ func initKubeProxyReplacementOptions() (bool, error) { return false, fmt.Errorf("Invalid value for --%s: %s", option.LoadBalancerDSRL4Xlate, option.Config.LoadBalancerDSRL4Xlate) } + if option.Config.NodePortMode == option.NodePortModeDSRTunl && + !option.Config.EnableRemoteNodeIdentity { + log.Infof("auto-enable %s due to %s mode", option.EnableRemoteNodeIdentity, option.NodePortModeDSRTunl) + option.Config.EnableRemoteNodeIdentity = true + } + if option.Config.LoadBalancerRSSv4CIDR != "" { ip, cidr, err := net.ParseCIDR(option.Config.LoadBalancerRSSv4CIDR) if ip.To4() == nil { @@ -152,7 +161,8 @@ func initKubeProxyReplacementOptions() (bool, error) { } if (option.Config.LoadBalancerRSSv4CIDR != "" || option.Config.LoadBalancerRSSv6CIDR != "") && - (option.Config.NodePortMode != option.NodePortModeDSR || + ((option.Config.NodePortMode != option.NodePortModeDSR && + option.Config.NodePortMode != option.NodePortModeDSRTunl) || option.Config.LoadBalancerDSRDispatch != option.DSRDispatchIPIP) { return false, fmt.Errorf("Invalid value for --%s/%s: currently only supported under IPIP dispatch for DSR", option.LoadBalancerRSSv4CIDR, option.LoadBalancerRSSv6CIDR) @@ -329,10 +339,9 @@ func initKubeProxyReplacementOptions() (bool, error) { if option.Config.EnableNodePort { if option.Config.TunnelingEnabled() && - option.Config.NodePortMode != option.NodePortModeSNAT { - - log.Warnf("Disabling NodePort's %q mode feature due to tunneling mode being enabled", - option.Config.NodePortMode) + option.Config.NodePortMode != option.NodePortModeSNAT && + option.Config.NodePortMode != option.NodePortModeDSRTunl { + log.Warnf("Disabling NodePort's %q mode feature due to tunneling mode being enabled or not DSR-Tunl mode", option.Config.NodePortMode) option.Config.NodePortMode = option.NodePortModeSNAT } diff --git a/pkg/datapath/linux/config/config.go b/pkg/datapath/linux/config/config.go index 9f1ea1e347ea3..0b2310433d3b6 100644 --- a/pkg/datapath/linux/config/config.go +++ b/pkg/datapath/linux/config/config.go @@ -316,7 +316,8 @@ func (h *HeaderfileWriter) WriteNodeConfig(w io.Writer, cfg *datapath.LocalNodeC cDefinesMap["DSR_XLATE_FRONTEND"] = fmt.Sprintf("%d", dsrL4XlateFrontend) cDefinesMap["DSR_XLATE_BACKEND"] = fmt.Sprintf("%d", dsrL4XlateBackend) if option.Config.NodePortMode == option.NodePortModeDSR || - option.Config.NodePortMode == option.NodePortModeHybrid { + option.Config.NodePortMode == option.NodePortModeHybrid || + option.Config.NodePortMode == option.NodePortModeDSRTunl { cDefinesMap["ENABLE_DSR"] = "1" if option.Config.LoadBalancerPMTUDiscovery { cDefinesMap["ENABLE_DSR_ICMP_ERRORS"] = "1" @@ -324,6 +325,9 @@ func (h *HeaderfileWriter) WriteNodeConfig(w io.Writer, cfg *datapath.LocalNodeC if option.Config.NodePortMode == option.NodePortModeHybrid { cDefinesMap["ENABLE_DSR_HYBRID"] = "1" } + if option.Config.NodePortMode == option.NodePortModeDSRTunl { + cDefinesMap["ENABLE_DSR_TUNL"] = "1" + } if option.Config.LoadBalancerDSRDispatch == option.DSRDispatchOption { cDefinesMap["DSR_ENCAP_MODE"] = fmt.Sprintf("%d", dsrEncapNone) } else if option.Config.LoadBalancerDSRDispatch == option.DSRDispatchIPIP { diff --git a/pkg/maps/ctmap/ctmap.go b/pkg/maps/ctmap/ctmap.go index 01798a6c17c76..8a94de26b941e 100644 --- a/pkg/maps/ctmap/ctmap.go +++ b/pkg/maps/ctmap/ctmap.go @@ -564,7 +564,8 @@ func GC(m *Map, filter *GCFilter) int { // The function only handles 1-3 cases, the 4. case is TODO(brb). func PurgeOrphanNATEntries(ctMapTCP, ctMapAny *Map) *NatGCStats { if option.Config.NodePortMode == option.NodePortModeDSR || - option.Config.NodePortMode == option.NodePortModeHybrid { + option.Config.NodePortMode == option.NodePortModeHybrid || + option.Config.NodePortMode == option.NodePortModeDSRTunl { return nil } diff --git a/pkg/maps/lbmap/ipv4.go b/pkg/maps/lbmap/ipv4.go index 65fab0f39bdfc..f844417e2b366 100644 --- a/pkg/maps/lbmap/ipv4.go +++ b/pkg/maps/lbmap/ipv4.go @@ -291,12 +291,12 @@ func (k *Service4Key) ToHost() ServiceKey { // +k8s:deepcopy-gen=true // +k8s:deepcopy-gen:interfaces=github.com/cilium/cilium/pkg/bpf.MapValue type Service4Value struct { - BackendID uint32 `align:"backend_id"` - Count uint16 `align:"count"` - RevNat uint16 `align:"rev_nat_index"` - Flags uint8 `align:"flags"` - Flags2 uint8 `align:"flags2"` - Pad pad2uint8 `align:"pad"` + BackendID uint32 `align:"backend_id"` + Count uint16 `align:"count"` + LocalCount uint16 `align:"local_count"` + RevNat uint16 `align:"rev_nat_index"` + Flags uint8 `align:"flags"` + Flags2 uint8 `align:"flags2"` } func (s *Service4Value) String() string { @@ -306,11 +306,13 @@ func (s *Service4Value) String() string { func (s *Service4Value) GetValuePtr() unsafe.Pointer { return unsafe.Pointer(s) } -func (s *Service4Value) SetCount(count int) { s.Count = uint16(count) } -func (s *Service4Value) GetCount() int { return int(s.Count) } -func (s *Service4Value) SetRevNat(id int) { s.RevNat = uint16(id) } -func (s *Service4Value) GetRevNat() int { return int(s.RevNat) } -func (s *Service4Value) RevNatKey() RevNatKey { return &RevNat4Key{s.RevNat} } +func (s *Service4Value) SetCount(count int) { s.Count = uint16(count) } +func (s *Service4Value) GetCount() int { return int(s.Count) } +func (s *Service4Value) SetLocalCount(localCount int) { s.LocalCount = uint16(localCount) } +func (s *Service4Value) GetLocalCount() int { return int(s.LocalCount) } +func (s *Service4Value) SetRevNat(id int) { s.RevNat = uint16(id) } +func (s *Service4Value) GetRevNat() int { return int(s.RevNat) } +func (s *Service4Value) RevNatKey() RevNatKey { return &RevNat4Key{s.RevNat} } func (s *Service4Value) SetFlags(flags uint16) { s.Flags = uint8(flags & 0xff) s.Flags2 = uint8(flags >> 8) diff --git a/pkg/maps/lbmap/ipv6.go b/pkg/maps/lbmap/ipv6.go index 3911197587b16..a43af3e459b4e 100644 --- a/pkg/maps/lbmap/ipv6.go +++ b/pkg/maps/lbmap/ipv6.go @@ -190,12 +190,12 @@ func (k *Service6Key) ToHost() ServiceKey { // +k8s:deepcopy-gen=true // +k8s:deepcopy-gen:interfaces=github.com/cilium/cilium/pkg/bpf.MapValue type Service6Value struct { - BackendID uint32 `align:"backend_id"` - Count uint16 `align:"count"` - RevNat uint16 `align:"rev_nat_index"` - Flags uint8 `align:"flags"` - Flags2 uint8 `align:"flags2"` - Pad pad2uint8 `align:"pad"` + BackendID uint32 `align:"backend_id"` + Count uint16 `align:"count"` + LocalCount uint16 `align:"local_count"` + RevNat uint16 `align:"rev_nat_index"` + Flags uint8 `align:"flags"` + Flags2 uint8 `align:"flags2"` } func (s *Service6Value) String() string { @@ -205,11 +205,13 @@ func (s *Service6Value) String() string { func (s *Service6Value) GetValuePtr() unsafe.Pointer { return unsafe.Pointer(s) } -func (s *Service6Value) SetCount(count int) { s.Count = uint16(count) } -func (s *Service6Value) GetCount() int { return int(s.Count) } -func (s *Service6Value) SetRevNat(id int) { s.RevNat = uint16(id) } -func (s *Service6Value) GetRevNat() int { return int(s.RevNat) } -func (s *Service6Value) RevNatKey() RevNatKey { return &RevNat6Key{s.RevNat} } +func (s *Service6Value) SetCount(count int) { s.Count = uint16(count) } +func (s *Service6Value) GetCount() int { return int(s.Count) } +func (s *Service6Value) SetLocalCount(localCount int) { s.LocalCount = uint16(localCount) } +func (s *Service6Value) GetLocalCount() int { return int(s.LocalCount) } +func (s *Service6Value) SetRevNat(id int) { s.RevNat = uint16(id) } +func (s *Service6Value) GetRevNat() int { return int(s.RevNat) } +func (s *Service6Value) RevNatKey() RevNatKey { return &RevNat6Key{s.RevNat} } func (s *Service6Value) SetFlags(flags uint16) { s.Flags = uint8(flags & 0xff) s.Flags2 = uint8(flags >> 8) diff --git a/pkg/maps/lbmap/lbmap.go b/pkg/maps/lbmap/lbmap.go index 98200945649b6..b484d025c20cc 100644 --- a/pkg/maps/lbmap/lbmap.go +++ b/pkg/maps/lbmap/lbmap.go @@ -56,6 +56,8 @@ type UpsertServiceParams struct { IP net.IP Port uint16 Backends map[string]loadbalancer.BackendID + LocalBackends []uint16 + RemoteBackends []uint16 PrevActiveBackendCount int IPv6 bool Type loadbalancer.SVCType @@ -97,10 +99,7 @@ func (lbmap *LBBPFMap) UpsertService(p *UpsertServiceParams) error { } } - backendIDs := make([]loadbalancer.BackendID, 0, len(p.Backends)) - for _, id := range p.Backends { - backendIDs = append(backendIDs, id) - } + backendIDs := append(p.LocalBackends, p.RemoteBackends...) for _, backendID := range backendIDs { if backendID == 0 { return fmt.Errorf("Invalid backend ID 0") @@ -130,7 +129,7 @@ func (lbmap *LBBPFMap) UpsertService(p *UpsertServiceParams) error { return fmt.Errorf("Unable to update reverse NAT %+v => %+v: %s", revNATKey, revNATValue, err) } - if err := updateMasterService(svcKey, len(backendIDs), int(p.ID), p.Type, p.Local, + if err := updateMasterService(svcKey, len(backendIDs), len(p.LocalBackends), int(p.ID), p.Type, p.Local, p.SessionAffinity, p.SessionAffinityTimeoutSec, p.CheckSourceRange); err != nil { deleteRevNatLocked(revNATKey) @@ -499,7 +498,7 @@ func (*LBBPFMap) IsMaglevLookupTableRecreated(ipv6 bool) bool { return maglevRecreatedIPv4 } -func updateMasterService(fe ServiceKey, nbackends int, revNATID int, svcType loadbalancer.SVCType, +func updateMasterService(fe ServiceKey, nbackends, nLocalBackends, revNATID int, svcType loadbalancer.SVCType, svcLocal bool, sessionAffinity bool, sessionAffinityTimeoutSec uint32, checkSourceRange bool) error { @@ -510,6 +509,7 @@ func updateMasterService(fe ServiceKey, nbackends int, revNATID int, svcType loa fe.SetBackendSlot(0) zeroValue := fe.NewValue().(ServiceValue) zeroValue.SetCount(nbackends) + zeroValue.SetLocalCount(nLocalBackends) zeroValue.SetRevNat(revNATID) flag := loadbalancer.NewSvcFlag(&loadbalancer.SvcFlagParam{ SvcType: svcType, diff --git a/pkg/maps/lbmap/types.go b/pkg/maps/lbmap/types.go index cf9d81df1fe6f..7bc792a7dbc8b 100644 --- a/pkg/maps/lbmap/types.go +++ b/pkg/maps/lbmap/types.go @@ -65,6 +65,12 @@ type ServiceValue interface { // Get the number of backends GetCount() int + // Set the number of local backends + SetLocalCount(int) + + // Get the number of local backends + GetLocalCount() int + // Set reverse NAT identifier SetRevNat(int) diff --git a/pkg/maps/lbmap/zz_generated.deepcopy.go b/pkg/maps/lbmap/zz_generated.deepcopy.go index 6df268486ae6c..87117fc5d002a 100644 --- a/pkg/maps/lbmap/zz_generated.deepcopy.go +++ b/pkg/maps/lbmap/zz_generated.deepcopy.go @@ -381,7 +381,6 @@ func (in *RevNat6Value) DeepCopyMapValue() bpf.MapValue { func (in *Service4Key) DeepCopyInto(out *Service4Key) { *out = *in in.Address.DeepCopyInto(&out.Address) - in.Pad.DeepCopyInto(&out.Pad) return } @@ -406,7 +405,6 @@ func (in *Service4Key) DeepCopyMapKey() bpf.MapKey { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Service4Value) DeepCopyInto(out *Service4Value) { *out = *in - in.Pad.DeepCopyInto(&out.Pad) return } @@ -432,7 +430,6 @@ func (in *Service4Value) DeepCopyMapValue() bpf.MapValue { func (in *Service6Key) DeepCopyInto(out *Service6Key) { *out = *in in.Address.DeepCopyInto(&out.Address) - in.Pad.DeepCopyInto(&out.Pad) return } @@ -457,7 +454,6 @@ func (in *Service6Key) DeepCopyMapKey() bpf.MapKey { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Service6Value) DeepCopyInto(out *Service6Value) { *out = *in - in.Pad.DeepCopyInto(&out.Pad) return } diff --git a/pkg/option/config.go b/pkg/option/config.go index 8a01a9b39a03f..98f119c3cf412 100644 --- a/pkg/option/config.go +++ b/pkg/option/config.go @@ -239,7 +239,7 @@ const ( EnableSVCSourceRangeCheck = "enable-svc-source-range-check" // NodePortMode indicates in which mode NodePort implementation should run - // ("snat", "dsr" or "hybrid") + // ("snat", "dsr", "hybrid" or "dsr-tunl") NodePortMode = "node-port-mode" // NodePortAlg indicates which algorithm is used for backend selection @@ -1087,6 +1087,9 @@ const ( // NodePortModeHybrid is a dual mode of the above, that is, DSR for TCP and SNAT for UDP NodePortModeHybrid = "hybrid" + // NodePortModeDSRTunl is a special dsr mode implemented by ipip tunneling + NodePortModeDSRTunl = "dsr-tunl" + // DSR dispatch mode to encode service into IP option or extension header DSRDispatchOption = "opt" @@ -1707,7 +1710,7 @@ type DaemonConfig struct { EnableHostLegacyRouting bool // NodePortMode indicates in which mode NodePort implementation should run - // ("snat", "dsr" or "hybrid") + // ("snat", "dsr", "hybrid" or "dsr-tunl") NodePortMode string // NodePortAlg indicates which backend selection algorithm is used diff --git a/pkg/service/service.go b/pkg/service/service.go index 0c794d3d1e60e..78edfc03590f8 100644 --- a/pkg/service/service.go +++ b/pkg/service/service.go @@ -786,6 +786,8 @@ func (s *Service) upsertServiceIntoLBMaps(svc *svcInfo, onlyLocalBackends bool, // Upsert service entries into BPF maps backends := make(map[string]lb.BackendID, len(svc.backends)) + localBackends := make([]uint16, 0) + remoteBackends := make([]uint16, 0) activeBackendsCount := 0 for _, b := range svc.backends { // Skip adding the terminating backend to the service map so that it @@ -798,6 +800,12 @@ func (s *Service) upsertServiceIntoLBMaps(svc *svcInfo, onlyLocalBackends bool, if !b.Terminating { backends[b.String()] = b.ID activeBackendsCount++ + + if b.NodeName == nodeTypes.GetName() { + localBackends = append(localBackends, uint16(b.ID)) + continue + } + remoteBackends = append(remoteBackends, uint16(b.ID)) } } svc.activeBackendsCount = activeBackendsCount @@ -807,6 +815,8 @@ func (s *Service) upsertServiceIntoLBMaps(svc *svcInfo, onlyLocalBackends bool, IP: svc.frontend.L3n4Addr.IP, Port: svc.frontend.L3n4Addr.L4Addr.Port, Backends: backends, + LocalBackends: localBackends, + RemoteBackends: remoteBackends, PrevActiveBackendCount: prevActiveBackendCount, IPv6: ipv6, Type: svc.svcType,