Skip to content

Commit

Permalink
Fix wrong csum for non-first ipv4 fragments
Browse files Browse the repository at this point in the history
Before this commit, natting are blindly changing l4 ports to non-first ipv4
fragments, causing wrong csum.

Also fixes the same issue during the rev natting for dsr.

Signed-off-by: Yuan Liu <liuyuan@google.com>
  • Loading branch information
liuyuan10 authored and tklauser committed Oct 14, 2020
1 parent 350f0b3 commit 3453877
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 54 deletions.
15 changes: 10 additions & 5 deletions bpf/bpf_lxc.c
Original file line number Diff line number Diff line change
Expand Up @@ -461,9 +461,11 @@ static __always_inline int handle_ipv4_from_lxc(struct __ctx_buff *ctx,
bool hairpin_flow = false; /* endpoint wants to access itself via service IP */
__u8 policy_match_type = POLICY_MATCH_NONE;
__u8 audited = 0;
bool has_l4_header = false;

if (!revalidate_data(ctx, &data, &data_end, &ip4))
return DROP_INVALID;
has_l4_header = ipv4_has_l4_header(ip4);

tuple.nexthdr = ip4->protocol;

Expand Down Expand Up @@ -493,7 +495,7 @@ static __always_inline int handle_ipv4_from_lxc(struct __ctx_buff *ctx,
if (svc) {
ret = lb4_local(get_ct_map4(&tuple), ctx, l3_off, l4_off,
&csum_off, &key, &tuple, svc, &ct_state_new,
ip4->saddr);
ip4->saddr, has_l4_header);
if (IS_ERR(ret))
return ret;
hairpin_flow |= ct_state_new.loopback;
Expand Down Expand Up @@ -610,7 +612,7 @@ static __always_inline int handle_ipv4_from_lxc(struct __ctx_buff *ctx,
}
# ifdef ENABLE_DSR
if (ct_state.dsr) {
ret = xlate_dsr_v4(ctx, &tuple, l4_off);
ret = xlate_dsr_v4(ctx, &tuple, l4_off, has_l4_header);
if (ret != 0)
return ret;
}
Expand All @@ -619,7 +621,7 @@ static __always_inline int handle_ipv4_from_lxc(struct __ctx_buff *ctx,

if (ct_state.rev_nat_index) {
ret = lb4_rev_nat(ctx, l3_off, l4_off, &csum_off,
&ct_state, &tuple, 0);
&ct_state, &tuple, 0, has_l4_header);
if (IS_ERR(ret))
return ret;
}
Expand Down Expand Up @@ -1106,13 +1108,15 @@ ipv4_policy(struct __ctx_buff *ctx, int ifindex, __u32 src_label, __u8 *reason,
struct ct_state ct_state_new = {};
bool skip_ingress_proxy = false;
bool is_untracked_fragment = false;
bool has_l4_header = false;
__u32 monitor = 0;
__be32 orig_sip;
__u8 policy_match_type = POLICY_MATCH_NONE;
__u8 audited = 0;

if (!revalidate_data(ctx, &data, &data_end, &ip4))
return DROP_INVALID;
has_l4_header = ipv4_has_l4_header(ip4);

policy_clear_mark(ctx);
tuple.nexthdr = ip4->protocol;
Expand All @@ -1127,7 +1131,8 @@ ipv4_policy(struct __ctx_buff *ctx, int ifindex, __u32 src_label, __u8 *reason,
orig_sip = ip4->saddr;

l4_off = l3_off + ipv4_hdrlen(ip4);
csum_l4_offset_and_flags(tuple.nexthdr, &csum_off);
if (has_l4_header)
csum_l4_offset_and_flags(tuple.nexthdr, &csum_off);
#ifndef ENABLE_IPV4_FRAGMENTS
/* Indicate that this is a datagram fragment for which we cannot
* retrieve L4 ports. Do not set flag if we support fragmentation.
Expand Down Expand Up @@ -1169,7 +1174,7 @@ ipv4_policy(struct __ctx_buff *ctx, int ifindex, __u32 src_label, __u8 *reason,

ret2 = lb4_rev_nat(ctx, l3_off, l4_off, &csum_off,
&ct_state, &tuple,
REV_NAT_F_TUPLE_SADDR);
REV_NAT_F_TUPLE_SADDR, has_l4_header);
if (IS_ERR(ret2))
return ret2;
}
Expand Down
20 changes: 13 additions & 7 deletions bpf/lib/ipv4.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,19 +72,25 @@ static __always_inline bool ipv4_is_fragment(const struct iphdr *ip4)
return ip4->frag_off & bpf_htons(0x3FFF);
}

static __always_inline bool ipv4_is_in_subnet(__be32 addr,
__be32 subnet, int prefixlen)
{
return (addr & bpf_htonl(~((1<<(32-prefixlen))-1))) == subnet;
}

#ifdef ENABLE_IPV4_FRAGMENTS
static __always_inline bool ipv4_is_not_first_fragment(const struct iphdr *ip4)
{
/* Ignore "More fragments" bit to catch all fragments but the first */
return ip4->frag_off & bpf_htons(0x1FFF);
}

/* Simply a reverse of ipv4_is_not_first_fragment to avoid double negative. */
static __always_inline bool ipv4_has_l4_header(const struct iphdr *ip4)
{
return !ipv4_is_not_first_fragment(ip4);
}

static __always_inline bool ipv4_is_in_subnet(__be32 addr,
__be32 subnet, int prefixlen)
{
return (addr & bpf_htonl(~((1 << (32 - prefixlen)) - 1))) == subnet;
}

#ifdef ENABLE_IPV4_FRAGMENTS
static __always_inline int
ipv4_frag_get_l4ports(const struct ipv4_frag_id *frag_id,
struct ipv4_frag_l4ports *ports)
Expand Down
21 changes: 12 additions & 9 deletions bpf/lib/lb.h
Original file line number Diff line number Diff line change
Expand Up @@ -908,14 +908,14 @@ static __always_inline int __lb4_rev_nat(struct __ctx_buff *ctx, int l3_off, int
struct csum_offset *csum_off,
struct ipv4_ct_tuple *tuple, int flags,
const struct lb4_reverse_nat *nat,
const struct ct_state *ct_state)
const struct ct_state *ct_state, bool has_l4_header)
{
__be32 old_sip, new_sip, sum = 0;
int ret;

cilium_dbg_lb(ctx, DBG_LB4_REVERSE_NAT, nat->address, nat->port);

if (nat->port) {
if (nat->port && has_l4_header) {
ret = reverse_map_l4_port(ctx, tuple->nexthdr, nat->port, l4_off, csum_off);
if (IS_ERR(ret))
return ret;
Expand Down Expand Up @@ -986,7 +986,7 @@ static __always_inline int __lb4_rev_nat(struct __ctx_buff *ctx, int l3_off, int
static __always_inline int lb4_rev_nat(struct __ctx_buff *ctx, int l3_off, int l4_off,
struct csum_offset *csum_off,
struct ct_state *ct_state,
struct ipv4_ct_tuple *tuple, int flags)
struct ipv4_ct_tuple *tuple, int flags, bool has_l4_header)
{
struct lb4_reverse_nat *nat;

Expand All @@ -996,7 +996,7 @@ static __always_inline int lb4_rev_nat(struct __ctx_buff *ctx, int l3_off, int l
return 0;

return __lb4_rev_nat(ctx, l3_off, l4_off, csum_off, tuple, flags, nat,
ct_state);
ct_state, has_l4_header);
}

/** Extract IPv4 LB key from packet
Expand All @@ -1021,7 +1021,8 @@ static __always_inline int lb4_extract_key(struct __ctx_buff *ctx __maybe_unused
{
key->proto = ip4->protocol;
key->address = (dir == CT_INGRESS) ? ip4->saddr : ip4->daddr;
csum_l4_offset_and_flags(ip4->protocol, csum_off);
if (ipv4_has_l4_header(ip4))
csum_l4_offset_and_flags(ip4->protocol, csum_off);

return extract_l4_port(ctx, ip4->protocol, l4_off, &key->dport, ip4);
}
Expand Down Expand Up @@ -1155,7 +1156,7 @@ static __always_inline int
lb4_xlate(struct __ctx_buff *ctx, __be32 *new_daddr, __be32 *new_saddr __maybe_unused,
__be32 *old_saddr __maybe_unused, __u8 nexthdr __maybe_unused, int l3_off,
int l4_off, struct csum_offset *csum_off, struct lb4_key *key,
const struct lb4_backend *backend __maybe_unused)
const struct lb4_backend *backend __maybe_unused, bool has_l4_header)
{
__be32 sum;
int ret;
Expand Down Expand Up @@ -1186,8 +1187,10 @@ lb4_xlate(struct __ctx_buff *ctx, __be32 *new_daddr, __be32 *new_saddr __maybe_u
BPF_F_PSEUDO_HDR) < 0)
return DROP_CSUM_L4;
}

if (backend->port && key->dport != backend->port &&
(nexthdr == IPPROTO_TCP || nexthdr == IPPROTO_UDP)) {
(nexthdr == IPPROTO_TCP || nexthdr == IPPROTO_UDP) &&
has_l4_header) {
__be16 tmp = backend->port;

/* Port offsets for UDP and TCP are the same */
Expand Down Expand Up @@ -1298,7 +1301,7 @@ static __always_inline int lb4_local(const void *map, struct __ctx_buff *ctx,
struct lb4_key *key,
struct ipv4_ct_tuple *tuple,
const struct lb4_service *svc,
struct ct_state *state, __be32 saddr)
struct ct_state *state, __be32 saddr, bool has_l4_header)
{
__u32 monitor; /* Deliberately ignored; regular CT will determine monitoring. */
__be32 new_saddr = 0, new_daddr;
Expand Down Expand Up @@ -1438,7 +1441,7 @@ static __always_inline int lb4_local(const void *map, struct __ctx_buff *ctx,

return lb4_xlate(ctx, &new_daddr, &new_saddr, &saddr,
tuple->nexthdr, l3_off, l4_off, csum_off, key,
backend);
backend, has_l4_header);
drop_no_service:
tuple->flags = flags;
return DROP_NO_SERVICE;
Expand Down
59 changes: 31 additions & 28 deletions bpf/lib/nat.h
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ static __always_inline int snat_v4_handle_mapping(struct __ctx_buff *ctx,
static __always_inline int snat_v4_rewrite_egress(struct __ctx_buff *ctx,
struct ipv4_ct_tuple *tuple,
struct ipv4_nat_entry *state,
__u32 off)
__u32 off, bool has_l4_header)
{
struct csum_offset csum = {};
__be32 sum_l4 = 0, sum;
Expand All @@ -332,32 +332,35 @@ static __always_inline int snat_v4_rewrite_egress(struct __ctx_buff *ctx,
state->to_sport == tuple->sport)
return 0;
sum = csum_diff(&tuple->saddr, 4, &state->to_saddr, 4, 0);
csum_l4_offset_and_flags(tuple->nexthdr, &csum);
if (state->to_sport != tuple->sport) {
switch (tuple->nexthdr) {
case IPPROTO_TCP:
case IPPROTO_UDP:
ret = l4_modify_port(ctx, off,
offsetof(struct tcphdr, source),
&csum, state->to_sport,
tuple->sport);
if (ret < 0)
return ret;
break;
case IPPROTO_ICMP: {
__be32 from, to;

if (ctx_store_bytes(ctx, off +
offsetof(struct icmphdr, un.echo.id),
&state->to_sport,
sizeof(state->to_sport), 0) < 0)
return DROP_WRITE_ERROR;
from = tuple->sport;
to = state->to_sport;
sum_l4 = csum_diff(&from, 4, &to, 4, 0);
csum.offset = offsetof(struct icmphdr, checksum);
break;
}}
if (has_l4_header) {
csum_l4_offset_and_flags(tuple->nexthdr, &csum);

if (state->to_sport != tuple->sport) {
switch (tuple->nexthdr) {
case IPPROTO_TCP:
case IPPROTO_UDP:
ret = l4_modify_port(ctx, off,
offsetof(struct tcphdr, source),
&csum, state->to_sport,
tuple->sport);
if (ret < 0)
return ret;
break;
case IPPROTO_ICMP: {
__be32 from, to;

if (ctx_store_bytes(ctx, off +
offsetof(struct icmphdr, un.echo.id),
&state->to_sport,
sizeof(state->to_sport), 0) < 0)
return DROP_WRITE_ERROR;
from = tuple->sport;
to = state->to_sport;
sum_l4 = csum_diff(&from, 4, &to, 4, 0);
csum.offset = offsetof(struct icmphdr, checksum);
break;
}}
}
}
if (ctx_store_bytes(ctx, ETH_HLEN + offsetof(struct iphdr, saddr),
&state->to_saddr, 4, 0) < 0)
Expand Down Expand Up @@ -556,7 +559,7 @@ static __always_inline __maybe_unused int snat_v4_process(struct __ctx_buff *ctx
return ret;

return dir == NAT_DIR_EGRESS ?
snat_v4_rewrite_egress(ctx, &tuple, state, off) :
snat_v4_rewrite_egress(ctx, &tuple, state, off, ipv4_has_l4_header(ip4)) :
snat_v4_rewrite_ingress(ctx, &tuple, state, off);
}
#else
Expand Down
11 changes: 6 additions & 5 deletions bpf/lib/nodeport.h
Original file line number Diff line number Diff line change
Expand Up @@ -1001,7 +1001,7 @@ static __always_inline int handle_dsr_v4(struct __ctx_buff *ctx, bool *dsr)

static __always_inline int xlate_dsr_v4(struct __ctx_buff *ctx,
const struct ipv4_ct_tuple *tuple,
int l4_off)
int l4_off, bool has_l4_header)
{
struct ipv4_ct_tuple nat_tup = *tuple;
struct ipv4_nat_entry *entry;
Expand All @@ -1013,7 +1013,7 @@ static __always_inline int xlate_dsr_v4(struct __ctx_buff *ctx,

entry = snat_v4_lookup(&nat_tup);
if (entry)
ret = snat_v4_rewrite_egress(ctx, &nat_tup, entry, l4_off);
ret = snat_v4_rewrite_egress(ctx, &nat_tup, entry, l4_off, has_l4_header);
return ret;
}

Expand Down Expand Up @@ -1239,8 +1239,9 @@ static __always_inline int nodeport_lb4(struct __ctx_buff *ctx,
if (!lb4_src_range_ok(svc, ip4->saddr))
return DROP_NOT_IN_SRC_RANGE;

ret = lb4_local(get_ct_map4(&tuple), ctx, l3_off, l4_off, &csum_off,
&key, &tuple, svc, &ct_state_new, ip4->saddr);
ret = lb4_local(get_ct_map4(&tuple), ctx, l3_off, l4_off,
&csum_off, &key, &tuple, svc, &ct_state_new,
ip4->saddr, ipv4_has_l4_header(ip4));
if (IS_ERR(ret))
return ret;
}
Expand Down Expand Up @@ -1391,7 +1392,7 @@ static __always_inline int rev_nodeport_lb4(struct __ctx_buff *ctx, int *ifindex
if (ret == CT_REPLY && ct_state.node_port == 1 && ct_state.rev_nat_index != 0) {
ret2 = lb4_rev_nat(ctx, l3_off, l4_off, &csum_off,
&ct_state, &tuple,
REV_NAT_F_TUPLE_SADDR);
REV_NAT_F_TUPLE_SADDR, ipv4_has_l4_header(ip4));
if (IS_ERR(ret2))
return ret2;

Expand Down

0 comments on commit 3453877

Please sign in to comment.