forked from cilium/cilium
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bpf_netdev.c
615 lines (519 loc) · 16.6 KB
/
bpf_netdev.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
/*
* Copyright (C) 2016-2018 Authors of Cilium
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <node_config.h>
#include <netdev_config.h>
/* These are configuration options which have a default value in their
* respective header files and must thus be defined beforehand:
*
* Pass unknown ICMPv6 NS to stack */
#define ACTION_UNKNOWN_ICMP6_NS TC_ACT_OK
/* Include policy_can_access_ingress() */
#define REQUIRES_CAN_ACCESS
#include <bpf/api.h>
#include <stdint.h>
#include <stdio.h>
#include "lib/utils.h"
#include "lib/common.h"
#include "lib/arp.h"
#include "lib/maps.h"
#include "lib/ipv6.h"
#include "lib/ipv4.h"
#include "lib/icmp6.h"
#include "lib/eth.h"
#include "lib/dbg.h"
#include "lib/trace.h"
#include "lib/l3.h"
#include "lib/l4.h"
#include "lib/policy.h"
#include "lib/drop.h"
#include "lib/encap.h"
#if defined FROM_HOST && (defined ENABLE_IPV4 || defined ENABLE_IPV6)
static inline int rewrite_dmac_to_host(struct __sk_buff *skb)
{
/* When attached to cilium_host, we rewrite the DMAC to the mac of
* cilium_host (peer) to ensure the packet is being considered to be
* addressed to the host (PACKET_HOST) */
union macaddr cilium_net_mac = CILIUM_NET_MAC;
/* Rewrite to destination MAC of cilium_net (remote peer) */
if (eth_store_daddr(skb, (__u8 *) &cilium_net_mac.addr, 0) < 0)
return send_drop_notify_error(skb, DROP_WRITE_ERROR, TC_ACT_OK, METRIC_INGRESS);
return TC_ACT_OK;
}
#endif
#if defined ENABLE_IPV4 || defined ENABLE_IPV6
static inline __u32 finalize_sec_ctx(__u32 secctx, __u32 src_identity)
{
#ifdef ENABLE_SECCTX_FROM_IPCACHE
	/* The packet itself only yielded the generic WORLD_ID, but the
	 * ipcache produced a concrete identity — prefer the latter. E.g.
	 * used in the ipvlan master device's datapath on ingress. */
	if (!identity_is_reserved(src_identity) && secctx == WORLD_ID)
		return src_identity;
#endif /* ENABLE_SECCTX_FROM_IPCACHE */
	return secctx;
}
#endif
#ifdef ENABLE_IPV6
static inline __u32 derive_sec_ctx(struct __sk_buff *skb, const union v6addr *node_ip,
				   struct ipv6hdr *ip6)
{
#ifdef FIXED_SRC_SECCTX
	return FIXED_SRC_SECCTX;
#else
	/* Only packets sourced from the local node prefix (/64) encode an
	 * identity in the IPv6 flowlabel; anything else is the world. */
	if (!ipv6_match_prefix_64((union v6addr *) &ip6->saddr, node_ip))
		return WORLD_ID;

	{
		/* The first 4 bytes of the IPv6 header carry
		 * version/traffic-class/flowlabel; mask out everything but
		 * the flowlabel and convert it to host byte order. */
		__u32 *first_word = (__u32 *) ip6;

		return bpf_ntohl(*first_word & IPV6_FLOWLABEL_MASK);
	}
#endif
}
#ifdef FROM_HOST
static inline int __inline__
reverse_proxy6(struct __sk_buff *skb, int l4_off, struct ipv6hdr *ip6, __u8 nh)
{
	/* Reverse the proxy translation for an IPv6 reply: if the flipped
	 * 5-tuple is found in PROXY6_MAP, restore the flow's original
	 * destination address/port as this packet's source and fix up the
	 * L4 checksum. Returns 0 on success or when there is nothing to do,
	 * a negative DROP_* code on error. */
	struct proxy6_tbl_value *val;
	struct proxy6_tbl_key key = {
		.nexthdr = nh,
	};
	union v6addr new_saddr, old_saddr;
	struct csum_offset csum = {};
	__be16 new_sport, old_sport;
	int ret;

	switch (nh) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
		/* load sport + dport in reverse order, sport=dport, dport=sport */
		if (skb_load_bytes(skb, l4_off, &key.dport, 4) < 0)
			return DROP_CT_INVALID_HDR;
		break;
	default:
		/* ignore */
		return 0;
	}

	/* Lookup key is the reply's flipped tuple: our daddr was the
	 * original flow's saddr. */
	ipv6_addr_copy(&key.saddr, (union v6addr *) &ip6->daddr);
	ipv6_addr_copy(&old_saddr, (union v6addr *) &ip6->saddr);
	csum_l4_offset_and_flags(nh, &csum);

	val = map_lookup_elem(&PROXY6_MAP, &key);
	if (!val)
		return 0; /* not proxied traffic; pass through untouched */

	ipv6_addr_copy(&new_saddr, (union v6addr *)&val->orig_daddr);
	new_sport = val->orig_dport;
	old_sport = key.dport;

	/* NOTE(review): TCP_SPORT_OFF is used for UDP as well — presumably
	 * the source-port offset is identical in both headers; confirm. */
	ret = l4_modify_port(skb, l4_off, TCP_SPORT_OFF, &csum, new_sport, old_sport);
	if (ret < 0)
		return DROP_WRITE_ERROR;

	ret = ipv6_store_saddr(skb, new_saddr.addr, ETH_HLEN);
	if (IS_ERR(ret))
		return DROP_WRITE_ERROR;

	if (csum.offset) {
		/* Fold the source-address change into the L4 pseudo-header
		 * checksum. */
		__be32 sum = csum_diff(old_saddr.addr, 16, new_saddr.addr, 16, 0);

		if (csum_l4_replace(skb, l4_off, &csum, 0, sum, BPF_F_PSEUDO_HDR) < 0)
			return DROP_CSUM_L4;
	}

	/* Packets which have been translated back from the proxy must
	 * skip any potential ingress proxy at the endpoint
	 */
	skb->tc_index |= TC_INDEX_F_SKIP_PROXY;
	return 0;
}
#endif
static inline int handle_ipv6(struct __sk_buff *skb, __u32 src_identity)
{
	/* Ingress handling for an IPv6 packet: resolve the source security
	 * identity (from flowlabel or ipcache), optionally undo proxy
	 * translation and rewrite the DMAC (FROM_HOST builds), then either
	 * deliver to a local endpoint, tunnel to a remote node, or fall
	 * through to the stack. Returns a TC_ACT_* verdict or a negative
	 * DROP_* code. */
	struct remote_endpoint_info *info;
	union v6addr node_ip = { };
	void *data, *data_end;
	struct ipv6hdr *ip6;
	union v6addr *dst;
	int l4_off, l3_off = ETH_HLEN, hdrlen;
	struct endpoint_info *ep;
	__u8 nexthdr;
	__u32 secctx;

	if (!revalidate_data(skb, &data, &data_end, &ip6))
		return DROP_INVALID;

	nexthdr = ip6->nexthdr;
	/* Walk any IPv6 extension headers to find the true L4 offset. */
	hdrlen = ipv6_hdrlen(skb, l3_off, &nexthdr);
	if (hdrlen < 0)
		return hdrlen;

	l4_off = l3_off + hdrlen;

#ifdef HANDLE_NS
	if (unlikely(nexthdr == IPPROTO_ICMPV6)) {
		/* Handle neighbour solicitations directly in BPF. */
		int ret = icmp6_handle(skb, ETH_HLEN, ip6, METRIC_INGRESS);
		if (IS_ERR(ret))
			return ret;
	}
#endif

	BPF_V6(node_ip, ROUTER_IP);
	secctx = derive_sec_ctx(skb, &node_ip, ip6);

	/* Packets from the proxy will already have a real identity. */
	if (identity_is_reserved(src_identity)) {
		union v6addr *src = (union v6addr *) &ip6->saddr;

		info = ipcache_lookup6(&IPCACHE_MAP, src, V6_CACHE_KEY_LEN);
		if (info != NULL) {
			__u32 sec_label = info->sec_label;
			if (sec_label)
				src_identity = info->sec_label;
		}
		cilium_dbg(skb, info ? DBG_IP_ID_MAP_SUCCEED6 : DBG_IP_ID_MAP_FAILED6,
			   ((__u32 *) src)[3], src_identity);
	}

	secctx = finalize_sec_ctx(secctx, src_identity);
#ifdef FROM_HOST
	if (1) {
		int ret;

		secctx = src_identity;
		ret = reverse_proxy6(skb, l4_off, ip6, ip6->nexthdr);
		/* DIRECT PACKET READ INVALID */
		if (IS_ERR(ret))
			return ret;

		/* If we are attached to cilium_host at egress, this will
		 * rewrite the destination mac address to the MAC of cilium_net */
		ret = rewrite_dmac_to_host(skb);
		/* DIRECT PACKET READ INVALID */
		if (IS_ERR(ret))
			return ret;
	}

	/* The packet writes above invalidated the direct packet pointers
	 * for the verifier; reload before touching ip6 again. */
	if (!revalidate_data(skb, &data, &data_end, &ip6))
		return DROP_INVALID;
#endif

	/* Lookup IPv6 address in list of local endpoints */
	if ((ep = lookup_ip6_endpoint(ip6)) != NULL) {
		/* Let through packets to the node-ip so they are
		 * processed by the local ip stack */
		if (ep->flags & ENDPOINT_F_HOST)
			return TC_ACT_OK;

		return ipv6_local_delivery(skb, l3_off, l4_off, secctx, ip6, nexthdr, ep, METRIC_INGRESS);
	}

#ifdef ENCAP_IFINDEX
	dst = (union v6addr *) &ip6->daddr;
	info = ipcache_lookup6(&IPCACHE_MAP, dst, V6_CACHE_KEY_LEN);
	if (info != NULL && info->tunnel_endpoint != 0) {
		int ret = encap_and_redirect_with_nodeid(skb, info->tunnel_endpoint,
							 info->key,
							 secctx, TRACE_PAYLOAD_LEN);

		/* If IPSEC is needed recirc through ingress to use xfrm stack
		 * and then result will routed back through bpf_netdev on egress
		 * but with encrypt marks.
		 */
		if (ret == IPSEC_ENDPOINT)
			return TC_ACT_OK;
		else
			return ret;
	} else if (likely(ipv6_match_prefix_96(dst, &node_ip))) {
		struct endpoint_key key = {};
		int ret;

		/* IPv6 lookup key: daddr/96 */
		dst = (union v6addr *) &ip6->daddr;
		key.ip6.p1 = dst->p1;
		key.ip6.p2 = dst->p2;
		key.ip6.p3 = dst->p3;
		key.ip6.p4 = 0;
		key.family = ENDPOINT_KEY_IPV6;

		ret = encap_and_redirect(skb, &key, secctx, TRACE_PAYLOAD_LEN, true);
		if (ret == IPSEC_ENDPOINT)
			return TC_ACT_OK;
		else if (ret != DROP_NO_TUNNEL_ENDPOINT)
			return ret;
	}
#endif

#ifdef FROM_HOST
	/* The destination IP address could not be associated with a local
	 * endpoint or a tunnel destination. If it is destined to an IP in
	 * the local range, then we can't route it back to the host as it
	 * will create a routing loop. Drop it. */
	dst = (union v6addr *) &ip6->daddr;
	if (ipv6_match_prefix_96(dst, &node_ip))
		return DROP_NON_LOCAL;
#endif

	return TC_ACT_OK;
}
#endif /* ENABLE_IPV6 */
#ifdef ENABLE_IPV4
static inline __u32 derive_ipv4_sec_ctx(struct __sk_buff *skb, struct iphdr *ip4)
{
	/* Derive the source security identity for an IPv4 packet. Unlike
	 * IPv6 (flowlabel), the IPv4 header carries no identity here, so
	 * default to WORLD_ID unless a fixed identity was configured at
	 * compile time. Both parameters are unused in the default build. */
#ifdef FIXED_SRC_SECCTX
	return FIXED_SRC_SECCTX;
#else
	return WORLD_ID;
#endif
}
#ifdef FROM_HOST
static inline int __inline__
reverse_proxy(struct __sk_buff *skb, int l4_off, struct iphdr *ip4, __u8 nh)
{
	/* IPv4 counterpart of reverse_proxy6(): undo the proxy translation
	 * for a reply by restoring the flow's original destination
	 * address/port (from PROXY4_MAP) as this packet's source, fixing up
	 * the IP and L4 checksums. Returns 0 on success or when nothing
	 * matches, a negative DROP_* code on error. */
	struct proxy4_tbl_value *val;
	struct proxy4_tbl_key key = {
		.saddr = ip4->daddr,
		.nexthdr = nh,
	};
	__be32 new_saddr, old_saddr = ip4->saddr;
	__be16 new_sport, old_sport;
	struct csum_offset csum = {};

	switch (nh) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
		/* load sport + dport in reverse order, sport=dport, dport=sport */
		if (skb_load_bytes(skb, l4_off, &key.dport, 4) < 0)
			return DROP_CT_INVALID_HDR;
		break;
	default:
		/* ignore */
		return 0;
	}

	csum_l4_offset_and_flags(nh, &csum);

	cilium_dbg3(skb, DBG_REV_PROXY_LOOKUP, key.sport << 16 | key.dport,
		    key.saddr, key.nexthdr);

	val = map_lookup_elem(&PROXY4_MAP, &key);
	if (!val)
		return 0; /* not proxied traffic; pass through untouched */

	new_saddr = val->orig_daddr;
	new_sport = val->orig_dport;
	old_sport = key.dport;

	cilium_dbg(skb, DBG_REV_PROXY_FOUND, new_saddr, bpf_ntohs(new_sport));
	cilium_dbg_capture(skb, DBG_CAPTURE_PROXY_PRE, 0);

	/* NOTE(review): TCP_SPORT_OFF is used for UDP as well — presumably
	 * the source-port offset is identical in both headers; confirm. */
	if (l4_modify_port(skb, l4_off, TCP_SPORT_OFF, &csum, new_sport, old_sport) < 0)
		return DROP_WRITE_ERROR;

	if (skb_store_bytes(skb, ETH_HLEN + offsetof(struct iphdr, saddr), &new_saddr, 4, 0) < 0)
		return DROP_WRITE_ERROR;

	/* Fix the IPv4 header checksum for the saddr rewrite... */
	if (l3_csum_replace(skb, ETH_HLEN + offsetof(struct iphdr, check), old_saddr, new_saddr, 4) < 0)
		return DROP_CSUM_L3;

	/* ...and the L4 pseudo-header checksum, when the protocol has one. */
	if (csum.offset &&
	    csum_l4_replace(skb, l4_off, &csum, old_saddr, new_saddr, 4 | BPF_F_PSEUDO_HDR) < 0)
		return DROP_CSUM_L4;

	/* Packets which have been translated back from the proxy must
	 * skip any potential ingress proxy at the endpoint
	 */
	skb->tc_index |= TC_INDEX_F_SKIP_PROXY;
	cilium_dbg_capture(skb, DBG_CAPTURE_PROXY_POST, 0);
	return 0;
}
#endif
static inline int handle_ipv4(struct __sk_buff *skb, __u32 src_identity)
{
	/* Ingress handling for an IPv4 packet, parallel to handle_ipv6():
	 * resolve the source security identity (ipcache), optionally undo
	 * proxy translation and rewrite the DMAC (FROM_HOST builds), then
	 * deliver to a local endpoint, tunnel to a remote node, or pass to
	 * the stack. Returns a TC_ACT_* verdict or a negative DROP_* code. */
	struct remote_endpoint_info *info;
	struct ipv4_ct_tuple tuple = {};
	struct endpoint_info *ep;
	void *data, *data_end;
	struct iphdr *ip4;
	int l4_off;
	__u32 secctx;

	if (!revalidate_data(skb, &data, &data_end, &ip4))
		return DROP_INVALID;

	l4_off = ETH_HLEN + ipv4_hdrlen(ip4);
	secctx = derive_ipv4_sec_ctx(skb, ip4);
	tuple.nexthdr = ip4->protocol;

	/* Packets from the proxy will already have a real identity. */
	if (identity_is_reserved(src_identity)) {
		info = ipcache_lookup4(&IPCACHE_MAP, ip4->saddr, V4_CACHE_KEY_LEN);
		if (info != NULL) {
			__u32 sec_label = info->sec_label;

			if (sec_label) {
				/* When SNAT is enabled on traffic ingressing
				 * into Cilium, all traffic from the world will
				 * have a source IP of the host. It will only
				 * actually be from the host if "src_identity"
				 * (passed into this function) reports the src
				 * as the host. So we can ignore the ipcache
				 * if it reports the source as HOST_ID.
				 */
#ifndef ENABLE_EXTRA_HOST_DEV
				if (sec_label != HOST_ID)
#endif
					src_identity = sec_label;
			}
		}
		cilium_dbg(skb, info ? DBG_IP_ID_MAP_SUCCEED4 : DBG_IP_ID_MAP_FAILED4,
			   ip4->saddr, src_identity);
	}

	secctx = finalize_sec_ctx(secctx, src_identity);
#ifdef FROM_HOST
	if (1) {
		int ret;

		secctx = src_identity;
		ret = reverse_proxy(skb, l4_off, ip4, tuple.nexthdr);
		/* DIRECT PACKET READ INVALID */
		if (IS_ERR(ret))
			return ret;

		/* If we are attached to cilium_host at egress, this will
		 * rewrite the destination mac address to the MAC of cilium_net */
		ret = rewrite_dmac_to_host(skb);
		/* DIRECT PACKET READ INVALID */
		if (IS_ERR(ret))
			return ret;
	}

	/* The packet writes above invalidated the direct packet pointers
	 * for the verifier; reload before touching ip4 again. */
	if (!revalidate_data(skb, &data, &data_end, &ip4))
		return DROP_INVALID;
#endif

	/* Lookup IPv4 address in list of local endpoints and host IPs */
	if ((ep = lookup_ip4_endpoint(ip4)) != NULL) {
		/* Let through packets to the node-ip so they are
		 * processed by the local ip stack */
		if (ep->flags & ENDPOINT_F_HOST)
#ifdef HOST_REDIRECT_TO_INGRESS
			/* This is required for L7 proxy to send packets to the host. */
			return redirect(HOST_IFINDEX, BPF_F_INGRESS);
#else
			return TC_ACT_OK;
#endif

		return ipv4_local_delivery(skb, ETH_HLEN, l4_off, secctx, ip4, ep, METRIC_INGRESS);
	}

#ifdef ENCAP_IFINDEX
	info = ipcache_lookup4(&IPCACHE_MAP, ip4->daddr, V4_CACHE_KEY_LEN);
	if (info != NULL && info->tunnel_endpoint != 0) {
		int ret = encap_and_redirect_with_nodeid(skb, info->tunnel_endpoint,
							 info->key,
							 secctx, TRACE_PAYLOAD_LEN);

		/* IPSEC_ENDPOINT means the packet recirculates through the
		 * xfrm stack; accept it so the stack can process it. */
		if (ret == IPSEC_ENDPOINT)
			return TC_ACT_OK;
		else
			return ret;
	} else {
		/* IPv4 lookup key: daddr & IPV4_MASK */
		struct endpoint_key key = {};
		int ret;

		key.ip4 = ip4->daddr & IPV4_MASK;
		key.family = ENDPOINT_KEY_IPV4;

		cilium_dbg(skb, DBG_NETDEV_ENCAP4, key.ip4, secctx);
		ret = encap_and_redirect(skb, &key, secctx, TRACE_PAYLOAD_LEN, true);
		if (ret == IPSEC_ENDPOINT)
			return TC_ACT_OK;
		else if (ret != DROP_NO_TUNNEL_ENDPOINT)
			return ret;
	}
#endif

#ifdef HOST_REDIRECT_TO_INGRESS
	return redirect(HOST_IFINDEX, BPF_F_INGRESS);
#else
#ifdef FROM_HOST
	/* The destination IP address could not be associated with a local
	 * endpoint or a tunnel destination. If it is destined to an IP in
	 * the local range, then we can't route it back to the host as it
	 * will create a routing loop. Drop it. */
	if ((ip4->daddr & IPV4_MASK) == (IPV4_GATEWAY & IPV4_MASK))
		return DROP_NON_LOCAL;
#endif
	return TC_ACT_OK;
#endif
}
#define CB_SRC_IDENTITY 0
__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_FROM_LXC) int tail_handle_ipv4(struct __sk_buff *skb)
{
	/* Tail-call target for IPv4: the caller stashed the source identity
	 * in cb[CB_SRC_IDENTITY]; consume and clear it so stale state never
	 * leaks into a later program run. */
	__u32 stashed_identity = skb->cb[CB_SRC_IDENTITY];
	int verdict;

	skb->cb[CB_SRC_IDENTITY] = 0;

	verdict = handle_ipv4(skb, stashed_identity);
	if (!IS_ERR(verdict))
		return verdict;

	/* Error path: drop the packet and emit a drop notification. */
	return send_drop_notify_error(skb, verdict, TC_ACT_SHOT, METRIC_INGRESS);
}
#endif /* ENABLE_IPV4 */
#ifdef FROM_HOST
static inline bool __inline__ handle_identity_from_host(struct __sk_buff *skb, __u32 *identity)
{
	/* Decode the source identity from the skb mark set by the host /
	 * proxy, store it via *identity, and report whether the packet came
	 * from the proxy. Always clears skb->mark afterwards. */
	__u32 magic = skb->mark & MARK_MAGIC_HOST_MASK;
	bool from_proxy = false;

	switch (magic) {
	case MARK_MAGIC_PROXY_INGRESS:
		/* Packets from the ingress proxy must skip the proxy when
		 * the destination endpoint evaluates the policy, as the
		 * packet would loop otherwise. */
		*identity = get_identity(skb);
		skb->tc_index |= TC_INDEX_F_SKIP_PROXY;
		from_proxy = true;
		break;
	case MARK_MAGIC_PROXY_EGRESS:
		*identity = get_identity(skb);
		from_proxy = true;
		break;
	case MARK_MAGIC_HOST:
		*identity = HOST_ID;
		break;
	default:
		*identity = WORLD_ID;
		break;
	}

	/* Reset packet mark to avoid hitting routing rules again */
	skb->mark = 0;

	return from_proxy;
}
#endif
__section("from-netdev")
int from_netdev(struct __sk_buff *skb)
{
__u32 identity = 0;
int ret;
#if defined ENABLE_IPSEC && defined ENCAP_IFINDEX
if (1) {
__u32 magic = skb->mark & MARK_MAGIC_HOST_MASK;
if (magic == MARK_MAGIC_ENCRYPT) {
__u32 seclabel, tunnel_endpoint = 0;
seclabel = get_identity(skb);
tunnel_endpoint = skb->cb[4];
skb->mark = 0;
bpf_clear_cb(skb);
return __encap_and_redirect_with_nodeid(skb, tunnel_endpoint, seclabel, TRACE_PAYLOAD_LEN);
}
}
#endif
bpf_clear_cb(skb);
#ifdef FROM_HOST
if (1) {
#ifdef HOST_REDIRECT_TO_INGRESS
if (skb->protocol == bpf_htons(ETH_P_ARP)) {
union macaddr mac = HOST_IFINDEX_MAC;
return arp_respond(skb, &mac, BPF_F_INGRESS);
}
#endif
int trace = TRACE_FROM_HOST;
bool from_proxy;
from_proxy = handle_identity_from_host(skb, &identity);
if (from_proxy)
trace = TRACE_FROM_PROXY;
send_trace_notify(skb, trace, identity, 0, 0,
skb->ingress_ifindex, 0, TRACE_PAYLOAD_LEN);
}
#else
send_trace_notify(skb, TRACE_FROM_STACK, 0, 0, 0, skb->ingress_ifindex,
0, TRACE_PAYLOAD_LEN);
#endif
switch (skb->protocol) {
#ifdef ENABLE_IPV6
case bpf_htons(ETH_P_IPV6):
/* This is considered the fast path, no tail call */
ret = handle_ipv6(skb, identity);
/* We should only be seeing an error here for packets which have
* been targetting an endpoint managed by us. */
if (IS_ERR(ret))
return send_drop_notify_error(skb, ret, TC_ACT_SHOT, METRIC_INGRESS);
break;
#endif
#ifdef ENABLE_IPV4
case bpf_htons(ETH_P_IP):
skb->cb[CB_SRC_IDENTITY] = identity;
ep_tail_call(skb, CILIUM_CALL_IPV4_FROM_LXC);
/* We are not returning an error here to always allow traffic to
* the stack in case maps have become unavailable.
*
* Note: Since drop notification requires a tail call as well,
* this notification is unlikely to succeed. */
return send_drop_notify_error(skb, DROP_MISSED_TAIL_CALL,
TC_ACT_OK, METRIC_INGRESS);
#endif
default:
/* Pass unknown traffic to the stack */
ret = TC_ACT_OK;
}
return ret;
}
BPF_LICENSE("GPL");