Skip to content

Commit

Permalink
cilium: Fix 16bit ifindex limitation
Browse files Browse the repository at this point in the history
[ upstream commit bd8b4d0 ]
[ manual conflict resolution in kube_proxy_replacement.go
  and nodeport.h due to difference from upstream, the latter
  mainly in locations where we ifdef ct_state.ifindex ]

The limitation exists mainly on old kernels where the fib lookup helper
does not populate the outgoing ifindex. Only for this case we rely on
the CT lookup stored ifindex which back then was added as a 16bit field
due to limited padding space available. Nowadays this can be lifted
after the big rework in #23884. We've seen users with high netdevice
churn run into this limitation where the agent bails out.

Apart from fixing the bleed, this can be further refined by not relying
on the asm.FnRedirectPeer helper presence but by actually doing a runtime
BPF program probe so that stable kernels can even be covered.

Fixes: #16260
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
  • Loading branch information
borkmann committed Sep 1, 2023
1 parent 69d7bae commit 758e40b
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 8 deletions.
2 changes: 2 additions & 0 deletions bpf/lib/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -1116,7 +1116,9 @@ struct ct_state {
__be32 svc_addr;
#endif
__u32 src_sec_id;
#ifndef HAVE_FIB_IFINDEX
__u16 ifindex;
#endif
__u32 backend_id; /* Backend ID in lb4_backends */
};

Expand Down
8 changes: 6 additions & 2 deletions bpf/lib/conntrack.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,9 @@ __ct_lookup(const void *map, struct __ctx_buff *ctx, const void *tuple,
ct_state->proxy_redirect = entry->proxy_redirect;
ct_state->from_l7lb = entry->from_l7lb;
ct_state->from_tunnel = entry->from_tunnel;
#ifndef HAVE_FIB_IFINDEX
ct_state->ifindex = entry->ifindex;
#endif
}
#ifdef CONNTRACK_ACCOUNTING
/* FIXME: This is slow, per-cpu counters? */
Expand Down Expand Up @@ -891,8 +893,9 @@ static __always_inline int ct_create6(const void *map_main, const void *map_rela
} else if (dir == CT_INGRESS || dir == CT_EGRESS) {
entry.node_port = ct_state->node_port;
entry.dsr = ct_state->dsr;
#ifndef HAVE_FIB_IFINDEX
entry.ifindex = ct_state->ifindex;

#endif
/* Note if this is a proxy connection so that replies can be redirected
* back to the proxy.
*/
Expand Down Expand Up @@ -969,8 +972,9 @@ static __always_inline int ct_create4(const void *map_main,
entry.node_port = ct_state->node_port;
entry.dsr = ct_state->dsr;
entry.from_tunnel = ct_state->from_tunnel;
#ifndef HAVE_FIB_IFINDEX
entry.ifindex = ct_state->ifindex;

#endif
/* Note if this is a proxy connection so that replies can be redirected
* back to the proxy.
*/
Expand Down
12 changes: 12 additions & 0 deletions bpf/lib/nodeport.h
Original file line number Diff line number Diff line change
Expand Up @@ -725,7 +725,9 @@ int tail_nodeport_dsr_ingress_ipv6(struct __ctx_buff *ctx)

ct_state_new.src_sec_id = WORLD_ID;
ct_state_new.dsr = 1;
#ifndef HAVE_FIB_IFINDEX
ct_state_new.ifindex = (__u16)NATIVE_DEV_IFINDEX;
#endif
ret = ct_create6(get_ct_map6(&tuple), NULL, &tuple, ctx,
CT_EGRESS, &ct_state_new, false, false, &ext_err);
if (!IS_ERR(ret))
Expand Down Expand Up @@ -1116,7 +1118,9 @@ static __always_inline int nodeport_lb6(struct __ctx_buff *ctx,
redo:
ct_state_new.src_sec_id = WORLD_ID;
ct_state_new.node_port = 1;
#ifndef HAVE_FIB_IFINDEX
ct_state_new.ifindex = (__u16)NATIVE_DEV_IFINDEX;
#endif
ret = ct_create6(get_ct_map6(&tuple), NULL, &tuple, ctx,
CT_EGRESS, &ct_state_new, false, false, ext_err);
if (IS_ERR(ret))
Expand Down Expand Up @@ -1276,7 +1280,9 @@ static __always_inline int rev_nodeport_lb6(struct __ctx_buff *ctx, __s8 *ext_er
if (!revalidate_data(ctx, &data, &data_end, &ip6))
return DROP_INVALID;
ctx_snat_done_set(ctx);
#ifndef HAVE_FIB_IFINDEX
ifindex = ct_state.ifindex;
#endif
#ifdef TUNNEL_MODE
{
union v6addr *dst = (union v6addr *)&ip6->daddr;
Expand Down Expand Up @@ -2125,7 +2131,9 @@ int tail_nodeport_dsr_ingress_ipv4(struct __ctx_buff *ctx)

ct_state_new.src_sec_id = WORLD_ID;
ct_state_new.dsr = 1;
#ifndef HAVE_FIB_IFINDEX
ct_state_new.ifindex = (__u16)NATIVE_DEV_IFINDEX;
#endif
ret = ct_create4(get_ct_map4(&tuple), NULL, &tuple, ctx,
CT_EGRESS, &ct_state_new, false, false, &ext_err);
if (!IS_ERR(ret))
Expand Down Expand Up @@ -2506,7 +2514,9 @@ static __always_inline int nodeport_lb4(struct __ctx_buff *ctx,
redo:
ct_state_new.src_sec_id = WORLD_ID;
ct_state_new.node_port = 1;
#ifndef HAVE_FIB_IFINDEX
ct_state_new.ifindex = (__u16)NATIVE_DEV_IFINDEX;
#endif
ret = ct_create4(get_ct_map4(&tuple), NULL, &tuple, ctx,
CT_EGRESS, &ct_state_new, false, false, ext_err);
if (IS_ERR(ret))
Expand Down Expand Up @@ -2705,7 +2715,9 @@ static __always_inline int rev_nodeport_lb4(struct __ctx_buff *ctx, __s8 *ext_er
if (!revalidate_data(ctx, &data, &data_end, &ip4))
return DROP_INVALID;
ctx_snat_done_set(ctx);
#ifndef HAVE_FIB_IFINDEX
ifindex = ct_state.ifindex;
#endif
#if defined(TUNNEL_MODE)
{
struct remote_endpoint_info *info;
Expand Down
9 changes: 7 additions & 2 deletions daemon/cmd/kube_proxy_replacement.go
Original file line number Diff line number Diff line change
Expand Up @@ -470,8 +470,13 @@ func finishKubeProxyReplacementInit() error {
return fmt.Errorf("%s link name contains '=' or ';' character which is not allowed",
iface)
}
if idx := link.Attrs().Index; idx > math.MaxUint16 {
return fmt.Errorf("%s link ifindex %d exceeds max(uint16)", iface, idx)
// In the case where the fib lookup does not return the outgoing ifindex
// the datapath needs to store it in our CT map, and the map's field is
// limited to 16 bit.
if probes.HaveFibIfindex() != nil {
if idx := link.Attrs().Index; idx > math.MaxUint16 {
return fmt.Errorf("%s link ifindex %d exceeds max(uint16)", iface, idx)
}
}
}

Expand Down
14 changes: 10 additions & 4 deletions pkg/datapath/linux/probes/probes.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ type ProgramHelper struct {

type miscFeatures struct {
HaveLargeInsnLimit bool
HaveFibIfindex bool
}

type FeatureProbes struct {
Expand Down Expand Up @@ -409,6 +410,13 @@ func HaveBoundedLoops() error {
return nil
}

// HaveFibIfindex checks if kernel has d1c362e1dd68 ("bpf: Always return target
// ifindex in bpf_fib_lookup") which is 5.10+. This got merged in the same kernel
// as the new redirect helpers.
func HaveFibIfindex() error {
return features.HaveProgramHelper(ebpf.SchedCLS, asm.FnRedirectPeer)
}

// HaveV2ISA is a wrapper around features.HaveV2ISA() to check if the kernel
// supports the V2 ISA.
// On unexpected probe results this function will terminate with log.Fatal().
Expand Down Expand Up @@ -590,6 +598,7 @@ func ExecuteHeaderProbes() *FeatureProbes {
}

probes.Misc.HaveLargeInsnLimit = (HaveLargeInstructionLimit() == nil)
probes.Misc.HaveFibIfindex = (HaveFibIfindex() == nil)

return &probes
}
Expand All @@ -610,10 +619,7 @@ func writeCommonHeader(writer io.Writer, probes *FeatureProbes) error {
"HAVE_LARGE_INSN_LIMIT": probes.Misc.HaveLargeInsnLimit,
"HAVE_SET_RETVAL": probes.ProgramHelpers[ProgramHelper{ebpf.CGroupSock, asm.FnSetRetval}],
"HAVE_FIB_NEIGH": probes.ProgramHelpers[ProgramHelper{ebpf.SchedCLS, asm.FnRedirectNeigh}],
// Check if kernel has d1c362e1dd68 ("bpf: Always return target ifindex
// in bpf_fib_lookup") which is 5.10+. This got merged in the same kernel
// as the new redirect helpers.
"HAVE_FIB_IFINDEX": probes.ProgramHelpers[ProgramHelper{ebpf.SchedCLS, asm.FnRedirectPeer}],
"HAVE_FIB_IFINDEX": probes.Misc.HaveFibIfindex,
}

return writeFeatureHeader(writer, features, true)
Expand Down

0 comments on commit 758e40b

Please sign in to comment.