Skip to content

Commit

Permalink
cilium: ipsec, support rolling updates
Browse files Browse the repository at this point in the history
Currently, rolling updates may get stuck due to a time period between
when some set of nodes have started with encryption enabled and another
set exist without encryption enabled. At this point these two sets of
nodes can only communicate from non-encrypted to encrypted set. The
set with encryption enabled will encrypt traffic that will in turn be
dropped by the set that has yet to enable encryption.

To resolve we make encryption a property of the endpoint id. Keeping
the key identifier with the endpoint to inform cilium which key
should be used during an upgrade. Because we use the mark space
to identify keys we limit the key space to two keys currently.

After this key secrets will need to be updated to include an id
field as follows,

  keys: "1 hmac(sha256) 0123456789abcdef0123456789abcdef cbc(aes) 0123456789abcdef0123456789abcdef"

Where '1' is the id here. IDs are enforced to be less than 16. This
is a bit arbitrary we could go as high as 256 without hitting mark
bit limits. However, 16 feels sufficient and we can't take bits back
later, so start low and bump up if needed. The id '0' is a special id
and should not be used; it represents the absence of keys in the
datapath.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
  • Loading branch information
jrfastab authored and ianvernon committed Mar 22, 2019
1 parent 506ccb8 commit b698972
Show file tree
Hide file tree
Showing 28 changed files with 261 additions and 151 deletions.
4 changes: 2 additions & 2 deletions bpf/bpf_ipsec.c
Expand Up @@ -27,8 +27,8 @@
/* from_netdev - ingress program for packets arriving from the netdev.
 *
 * If cb[0] carries the encryption magic marker (stashed by the encap
 * path before the packet traversed the xfrm stack), restore the full
 * mark — which also carries the 4-bit key id in 0xF000 — and the
 * security identity saved in cb[1], then pass the packet up the stack.
 */
__section("from-netdev")
int from_netdev(struct __sk_buff *skb)
{
	if ((skb->cb[0] & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_ENCRYPT) {
		/* Copy the whole cb[0] value: it holds both the encrypt
		 * marker and the key id used to select the SPI.
		 */
		skb->mark = skb->cb[0];
		set_identity(skb, skb->cb[1]);
	}
	return TC_ACT_OK;
}
Expand Down
7 changes: 6 additions & 1 deletion bpf/bpf_lxc.c
Expand Up @@ -146,6 +146,7 @@ static inline int ipv6_l3_from_lxc(struct __sk_buff *skb,
void *data, *data_end;
union v6addr *daddr, orig_dip;
__u32 tunnel_endpoint = 0;
__u8 encrypt_key = 0;
__u32 monitor = 0;

if (unlikely(!is_valid_lxc_src_ip(ip6)))
Expand Down Expand Up @@ -221,6 +222,7 @@ static inline int ipv6_l3_from_lxc(struct __sk_buff *skb,
if (info != NULL && info->sec_label) {
*dstID = info->sec_label;
tunnel_endpoint = info->tunnel_endpoint;
encrypt_key = info->key;
} else {
*dstID = WORLD_ID;
}
Expand Down Expand Up @@ -331,7 +333,7 @@ static inline int ipv6_l3_from_lxc(struct __sk_buff *skb,
/* The packet goes to a peer not managed by this agent instance */
#ifdef ENCAP_IFINDEX
if (tunnel_endpoint) {
ret = encap_and_redirect_with_nodeid_from_lxc(skb, tunnel_endpoint, SECLABEL, monitor);
ret = encap_and_redirect_with_nodeid_from_lxc(skb, tunnel_endpoint, encrypt_key, SECLABEL, monitor);
/* If not redirected, notably due to IPsec, then pass up to stack
* for further processing.
*/
Expand Down Expand Up @@ -475,6 +477,7 @@ static inline int handle_ipv4_from_lxc(struct __sk_buff *skb, __u32 *dstID)
struct ct_state ct_state = {};
__be32 orig_dip;
__u32 tunnel_endpoint = 0;
__u8 encrypt_key = 0;
__u32 monitor = 0;

if (!revalidate_data(skb, &data, &data_end, &ip4))
Expand Down Expand Up @@ -537,6 +540,7 @@ static inline int handle_ipv4_from_lxc(struct __sk_buff *skb, __u32 *dstID)
if (info != NULL && info->sec_label) {
*dstID = info->sec_label;
tunnel_endpoint = info->tunnel_endpoint;
encrypt_key = info->key;
} else {
*dstID = WORLD_ID;
}
Expand Down Expand Up @@ -649,6 +653,7 @@ static inline int handle_ipv4_from_lxc(struct __sk_buff *skb, __u32 *dstID)
#ifdef ENCAP_IFINDEX
if (tunnel_endpoint) {
int ret = encap_and_redirect_with_nodeid_from_lxc(skb, tunnel_endpoint,
encrypt_key,
SECLABEL, monitor);
/* If not redirected, notably due to IPsec, then pass up to stack
* for further processing.
Expand Down
2 changes: 2 additions & 0 deletions bpf/bpf_netdev.c
Expand Up @@ -242,6 +242,7 @@ static inline int handle_ipv6(struct __sk_buff *skb, __u32 src_identity)
info = ipcache_lookup6(&IPCACHE_MAP, dst, V6_CACHE_KEY_LEN);
if (info != NULL && info->tunnel_endpoint != 0) {
int ret = encap_and_redirect_with_nodeid(skb, info->tunnel_endpoint,
info->key,
secctx, TRACE_PAYLOAD_LEN);

/* If IPSEC is needed recirc through ingress to use xfrm stack
Expand Down Expand Up @@ -443,6 +444,7 @@ static inline int handle_ipv4(struct __sk_buff *skb, __u32 src_identity)
info = ipcache_lookup4(&IPCACHE_MAP, ip4->daddr, V4_CACHE_KEY_LEN);
if (info != NULL && info->tunnel_endpoint != 0) {
int ret = encap_and_redirect_with_nodeid(skb, info->tunnel_endpoint,
info->key,
secctx, TRACE_PAYLOAD_LEN);

if (ret == IPSEC_ENDPOINT)
Expand Down
3 changes: 3 additions & 0 deletions bpf/bpf_overlay.c
Expand Up @@ -64,6 +64,8 @@ static inline int handle_ipv6(struct __sk_buff *skb)
*/
if (ip6->nexthdr != IPPROTO_ESP)
goto not_esp;

/* Decrypt "key" is determined by SPI */
skb->mark = MARK_MAGIC_DECRYPT;
set_identity(skb, key.tunnel_id);
/* To IPSec stack on cilium_vxlan we are going to pass
Expand Down Expand Up @@ -148,6 +150,7 @@ static inline int handle_ipv4(struct __sk_buff *skb)
*/
if (ip4->protocol != IPPROTO_ESP)
goto not_esp;
/* Decrypt "key" is determined by SPI */
skb->mark = MARK_MAGIC_DECRYPT;
set_identity(skb, key.tunnel_id);
/* To IPSec stack on cilium_vxlan we are going to pass
Expand Down
42 changes: 33 additions & 9 deletions bpf/lib/common.h
Expand Up @@ -162,7 +162,7 @@ struct endpoint_key {
union v6addr ip6;
};
__u8 family;
__u8 pad4;
__u8 key;
__u16 pad5;
} __attribute__((packed));

Expand All @@ -182,6 +182,7 @@ struct endpoint_info {
/* ipcache map value: describes an endpoint reachable via a remote node. */
struct remote_endpoint_info {
__u32 sec_label;	/* security identity of the remote endpoint */
__u32 tunnel_endpoint;	/* tunnel endpoint address of the owning node */
__u8 key;	/* IPsec key id for traffic to this endpoint; 0 = no key */
};

struct policy_key {
Expand Down Expand Up @@ -298,17 +299,24 @@ enum {
* packets security identity. The lower/upper halves are swapped to recover
* the identity.
*
 * The 4 bits at 0x0F00 provide
* - the magic marker values which indicate whether the packet is coming from
* an ingress or egress proxy, a local process and its current encryption
* status.
*
* The 4 bits at 0xF000 provide
* - the key index to use for encryption when multiple keys are in-flight.
* In the IPsec case this becomes the SPI on the wire.
*/
/* Magic marker values live in the 0x0F00 nibble of the mark; the 0xF000
 * nibble carries the IPsec key id so two keys can be in flight during a
 * rolling key update (the id becomes the SPI on the wire).
 */
#define MARK_MAGIC_HOST_MASK		0x0F00
#define MARK_MAGIC_PROXY_INGRESS	0x0A00
#define MARK_MAGIC_PROXY_EGRESS		0x0B00
#define MARK_MAGIC_HOST			0x0C00
#define MARK_MAGIC_DECRYPT		0x0D00
#define MARK_MAGIC_ENCRYPT		0x0E00

/* Key-id nibble alone, and key-id + magic-marker byte together. */
#define MARK_MAGIC_KEY_ID		0xF000
#define MARK_MAGIC_KEY_MASK		0xFF00

/**
* get_identity - returns source identity from the mark field
Expand All @@ -323,10 +331,26 @@ static inline int __inline__ get_identity(struct __sk_buff *skb)
*/
/* set_identity - stores the source identity in the mark field.
 *
 * Preserves the 0xFF00 byte (magic marker plus key id) so the packet's
 * encryption state survives, then writes the identity with its
 * lower/upper halves swapped (get_identity() swaps them back).
 */
static inline void __inline__ set_identity(struct __sk_buff *skb, __u32 identity)
{
	skb->mark = skb->mark & MARK_MAGIC_KEY_MASK;
	skb->mark |= ((identity & 0xFFFF) << 16) | ((identity & 0xFF0000) >> 16);
}

/**
 * or_encrypt_key - builds a mark value carrying the encrypt marker plus
 * the 4-bit key id shifted into the 0xF000 nibble.
 */
static inline __u32 __inline__ or_encrypt_key(__u8 key)
{
	__u32 key_bits = (__u32)key & 0x0F;

	return MARK_MAGIC_ENCRYPT | (key_bits << 12);
}

/**
 * set_encrypt_key - overwrites the skb mark with the encryption marker
 * and the given key id (see or_encrypt_key() for the bit layout).
 */
static inline void __inline__ set_encrypt_key(struct __sk_buff *skb, __u8 key)
{
	__u32 mark = or_encrypt_key(key);

	skb->mark = mark;
}

/*
* skb->tc_index uses
*
Expand Down
46 changes: 27 additions & 19 deletions bpf/lib/encap.h
Expand Up @@ -24,7 +24,7 @@
#ifdef ENCAP_IFINDEX
#ifdef ENABLE_IPSEC
static inline int __inline__
enacap_and_redirect_nomark_ipsec(struct __sk_buff *skb, __u32 tunnel_endpoint,
enacap_and_redirect_nomark_ipsec(struct __sk_buff *skb, __u32 tunnel_endpoint, __u8 key,
__u32 seclabel, __u32 monitor)
{
/* Traffic from local host in tunnel mode will be passed to
Expand All @@ -40,14 +40,14 @@ enacap_and_redirect_nomark_ipsec(struct __sk_buff *skb, __u32 tunnel_endpoint,
* in both cases because xfrm layer would overwrite them. We
* use cb[4] here so it doesn't need to be reset by bpf_ipsec.
*/
skb->cb[0] = MARK_MAGIC_ENCRYPT;
skb->cb[0] = or_encrypt_key(key);
skb->cb[1] = seclabel;
skb->cb[4] = tunnel_endpoint;
return IPSEC_ENDPOINT;
}

static inline int __inline__
encap_and_redirect_ipsec(struct __sk_buff *skb, __u32 tunnel_endpoint,
encap_and_redirect_ipsec(struct __sk_buff *skb, __u32 tunnel_endpoint, __u8 key,
__u32 seclabel, __u32 monitor)
{
/* IPSec is performed by the stack on any packets with the
Expand All @@ -57,7 +57,7 @@ encap_and_redirect_ipsec(struct __sk_buff *skb, __u32 tunnel_endpoint,
* label is stashed in the mark and extracted in bpf_netdev
* to send skb onto tunnel for encap.
*/
skb->mark = MARK_MAGIC_ENCRYPT;
set_encrypt_key(skb, key);
set_identity(skb, seclabel);
skb->cb[4] = tunnel_endpoint;
return IPSEC_ENDPOINT;
Expand Down Expand Up @@ -96,24 +96,24 @@ __encap_and_redirect_with_nodeid(struct __sk_buff *skb, __u32 tunnel_endpoint,
*/
static inline int __inline__
encap_and_redirect_with_nodeid(struct __sk_buff *skb, __u32 tunnel_endpoint,
__u32 seclabel, __u32 monitor)
__u8 key, __u32 seclabel, __u32 monitor)
{
#ifdef ENABLE_IPSEC
return enacap_and_redirect_nomark_ipsec(skb, tunnel_endpoint, seclabel, monitor);
#else
return __encap_and_redirect_with_nodeid(skb, tunnel_endpoint, seclabel, monitor);
if (key)
return enacap_and_redirect_nomark_ipsec(skb, tunnel_endpoint, key, seclabel, monitor);
#endif
return __encap_and_redirect_with_nodeid(skb, tunnel_endpoint, seclabel, monitor);
}

static inline int __inline__
encap_and_redirect_with_nodeid_from_lxc(struct __sk_buff *skb, __u32 tunnel_endpoint,
encap_and_redirect_with_nodeid_from_lxc(struct __sk_buff *skb, __u32 tunnel_endpoint, __u8 key,
__u32 seclabel, __u32 monitor)
{
#ifdef ENABLE_IPSEC
return encap_and_redirect_ipsec(skb, tunnel_endpoint, seclabel, monitor);
#else
return __encap_and_redirect_with_nodeid(skb, tunnel_endpoint, seclabel, monitor);
if (key)
return encap_and_redirect_ipsec(skb, tunnel_endpoint, key, seclabel, monitor);
#endif
return __encap_and_redirect_with_nodeid(skb, tunnel_endpoint, seclabel, monitor);
}

/* encap_and_redirect based on ENABLE_IPSEC flag and from_host bool will decide
Expand All @@ -137,14 +137,22 @@ encap_and_redirect(struct __sk_buff *skb, struct endpoint_key *k,
}

#ifdef ENABLE_IPSEC
if (from_host)
return enacap_and_redirect_nomark_ipsec(skb, tunnel->ip4,
seclabel, monitor);
else
return encap_and_redirect_ipsec(skb, tunnel->ip4, seclabel, monitor);
#else
return __encap_and_redirect_with_nodeid(skb, tunnel->ip4, seclabel, monitor);
if (tunnel->key) {
if (from_host)
return enacap_and_redirect_nomark_ipsec(skb,
tunnel->ip4,
tunnel->key,
seclabel,
monitor);
else
return encap_and_redirect_ipsec(skb,
tunnel->ip4,
tunnel->key,
seclabel,
monitor);
}
#endif
return __encap_and_redirect_with_nodeid(skb, tunnel->ip4, seclabel, monitor);
}
#endif /* ENCAP_IFINDEX */
#endif /* __LIB_ENCAP_H_ */
2 changes: 1 addition & 1 deletion daemon/bpf.sha
@@ -1,2 +1,2 @@
GO_BINDATA_SHA1SUM=af1ee3c89109c45ca4e8546e542ed854ce4fcc8f
GO_BINDATA_SHA1SUM=c2043b709a8ebc82155515aa31ac4637162eb3ec
BPF_FILES=../bpf/COPYING ../bpf/Makefile ../bpf/Makefile.bpf ../bpf/bpf_alignchecker.c ../bpf/bpf_features.h ../bpf/bpf_ipsec.c ../bpf/bpf_lb.c ../bpf/bpf_lxc.c ../bpf/bpf_netdev.c ../bpf/bpf_overlay.c ../bpf/bpf_xdp.c ../bpf/cilium-map-migrate.c ../bpf/filter_config.h ../bpf/include/bpf/api.h ../bpf/include/elf/elf.h ../bpf/include/elf/gelf.h ../bpf/include/elf/libelf.h ../bpf/include/iproute2/bpf_elf.h ../bpf/include/linux/bpf.h ../bpf/include/linux/bpf_common.h ../bpf/include/linux/byteorder.h ../bpf/include/linux/byteorder/big_endian.h ../bpf/include/linux/byteorder/little_endian.h ../bpf/include/linux/icmp.h ../bpf/include/linux/icmpv6.h ../bpf/include/linux/if_arp.h ../bpf/include/linux/if_ether.h ../bpf/include/linux/if_packet.h ../bpf/include/linux/in.h ../bpf/include/linux/in6.h ../bpf/include/linux/ioctl.h ../bpf/include/linux/ip.h ../bpf/include/linux/ipv6.h ../bpf/include/linux/perf_event.h ../bpf/include/linux/swab.h ../bpf/include/linux/tcp.h ../bpf/include/linux/type_mapper.h ../bpf/include/linux/udp.h ../bpf/init.sh ../bpf/lib/arp.h ../bpf/lib/common.h ../bpf/lib/config.h ../bpf/lib/conntrack.h ../bpf/lib/csum.h ../bpf/lib/dbg.h ../bpf/lib/drop.h ../bpf/lib/encap.h ../bpf/lib/eps.h ../bpf/lib/eth.h ../bpf/lib/events.h ../bpf/lib/icmp6.h ../bpf/lib/ipv4.h ../bpf/lib/ipv6.h ../bpf/lib/l3.h ../bpf/lib/l4.h ../bpf/lib/lb.h ../bpf/lib/lxc.h ../bpf/lib/maps.h ../bpf/lib/metrics.h ../bpf/lib/nat46.h ../bpf/lib/policy.h ../bpf/lib/tailcall.h ../bpf/lib/trace.h ../bpf/lib/utils.h ../bpf/lib/xdp.h ../bpf/lxc_config.h ../bpf/netdev_config.h ../bpf/node_config.h ../bpf/probes/raw_change_tail.t ../bpf/probes/raw_insn.h ../bpf/probes/raw_invalidate_hash.t ../bpf/probes/raw_lpm_map.t ../bpf/probes/raw_lru_map.t ../bpf/probes/raw_main.c ../bpf/probes/raw_map_val_adj.t ../bpf/probes/raw_mark_map_val.t ../bpf/run_probes.sh ../bpf/sockops/Makefile ../bpf/sockops/bpf_redir.c ../bpf/sockops/bpf_sockops.c ../bpf/sockops/bpf_sockops.h ../bpf/sockops/sockops_config.h 
../bpf/spawn_netns.sh
7 changes: 5 additions & 2 deletions daemon/daemon.go
Expand Up @@ -792,6 +792,7 @@ func (d *Daemon) syncLXCMap() error {
}

for _, ipIDPair := range specialIdentities {
hostKey := node.GetIPsecKeyIdentity()
isHost := ipIDPair.ID == identity.ReservedIdentityHost
if isHost {
added, err := lxcmap.SyncHostEntry(ipIDPair.IP)
Expand All @@ -807,7 +808,7 @@ func (d *Daemon) syncLXCMap() error {

// Upsert will not propagate (reserved:foo->ID) mappings across the cluster,
// and we specifically don't want to do so.
ipcache.IPIdentityCache.Upsert(ipIDPair.PrefixString(), nil, ipcache.Identity{
ipcache.IPIdentityCache.Upsert(ipIDPair.PrefixString(), nil, hostKey, ipcache.Identity{
ID: ipIDPair.ID,
Source: ipcache.FromAgentLocal,
})
Expand Down Expand Up @@ -926,14 +927,16 @@ func NewDaemon(dp datapath.Datapath) (*Daemon, *endpointRestoreState, error) {
mtuConfig := mtu.NewConfiguration(option.Config.Tunnel != option.TunnelDisabled, option.Config.MTU)

if option.Config.EnableIPSec {
if err := ipsec.LoadIPSecKeysFile(option.Config.IPSecKeyFile); err != nil {
spi, err := ipsec.LoadIPSecKeysFile(option.Config.IPSecKeyFile)
if err != nil {
return nil, nil, err
}
if option.Config.EnableIPv6 {
if err := ipsec.EnableIPv6Forwarding(); err != nil {
return nil, nil, err
}
}
node.SetIPsecKeyIdentity(spi)
}

nodeMngr, err := nodemanager.NewManager("all", dp.Node())
Expand Down
7 changes: 5 additions & 2 deletions daemon/k8s_watcher.go
Expand Up @@ -1511,10 +1511,12 @@ func (d *Daemon) updatePodHostIP(pod *types.Pod) (bool, error) {
return true, fmt.Errorf("no/invalid PodIP: %s", pod.StatusPodIP)
}

hostKey := node.GetIPsecKeyIdentity()

// Initial mapping of podIP <-> hostIP <-> identity. The mapping is
// later updated once the allocator has determined the real identity.
// If the endpoint remains unmanaged, the identity remains untouched.
selfOwned := ipcache.IPIdentityCache.Upsert(pod.StatusPodIP, hostIP, ipcache.Identity{
selfOwned := ipcache.IPIdentityCache.Upsert(pod.StatusPodIP, hostIP, hostKey, ipcache.Identity{
ID: identity.ReservedIdentityUnmanaged,
Source: ipcache.FromKubernetes,
})
Expand Down Expand Up @@ -1741,7 +1743,8 @@ func (d *Daemon) updateK8sNodeTunneling(k8sNodeOld, k8sNodeNew *types.Node) erro
}
}

selfOwned := ipcache.IPIdentityCache.Upsert(ciliumIPStrNew, hostIPNew, ipcache.Identity{
hostKey := node.GetIPsecKeyIdentity()
selfOwned := ipcache.IPIdentityCache.Upsert(ciliumIPStrNew, hostIPNew, hostKey, ipcache.Identity{
ID: identity.ReservedIdentityHost,
Source: ipcache.FromKubernetes,
})
Expand Down
6 changes: 4 additions & 2 deletions pkg/bpf/endpoint.go
Expand Up @@ -15,6 +15,7 @@
package bpf

import (
"fmt"
"net"
"unsafe"

Expand All @@ -34,7 +35,7 @@ type EndpointKey struct {
// represents both IPv6 and IPv4 (in the lowest four bytes)
IP types.IPv6 `align:"$union0"`
Family uint8 `align:"family"`
Pad1 uint8 `align:"pad4"`
Key uint8 `align:"key"`
Pad2 uint16 `align:"pad5"`
}

Expand All @@ -57,6 +58,7 @@ func NewEndpointKey(ip net.IP) EndpointKey {
result.Family = EndpointKeyIPv6
copy(result.IP[:], ip)
}
result.Key = 0

return result
}
Expand All @@ -75,7 +77,7 @@ func (k EndpointKey) ToIP() net.IP {
// String provides a string representation of the EndpointKey, including
// the IPsec key index, in the form "<ip>:<key>".
func (k EndpointKey) String() string {
	if ip := k.ToIP(); ip != nil {
		return fmt.Sprintf("%s:%d", ip.String(), k.Key)
	}
	return "nil"
}
3 changes: 2 additions & 1 deletion pkg/datapath/ipcache/listener.go
Expand Up @@ -77,7 +77,7 @@ func NewListener(d datapath) *BPFListener {
// IP->ID mapping will replace any existing contents; knowledge of the old pair
// is not required to upsert the new pair.
func (l *BPFListener) OnIPIdentityCacheChange(modType ipcache.CacheModification, cidr net.IPNet,
oldHostIP, newHostIP net.IP, oldID *identity.NumericIdentity, newID identity.NumericIdentity) {
oldHostIP, newHostIP net.IP, oldID *identity.NumericIdentity, newID identity.NumericIdentity, encryptKey uint8) {
scopedLog := log.WithFields(logrus.Fields{
logfields.IPAddr: cidr,
logfields.Identity: newID,
Expand All @@ -98,6 +98,7 @@ func (l *BPFListener) OnIPIdentityCacheChange(modType ipcache.CacheModification,
case ipcache.Upsert:
value := ipcacheMap.RemoteEndpointInfo{
SecurityIdentity: uint32(newID),
Key: encryptKey,
}

if newHostIP != nil {
Expand Down

0 comments on commit b698972

Please sign in to comment.