Skip to content

Commit

Permalink
pkt reassemble: fix kernel panic for ovs reassemble
Browse files Browse the repository at this point in the history
Ovs and kernel stack would add frag_queue to same netns_frags list.
As result, ovs and kernel may access the fraq_queue without correct
lock. Also the struct ipq may be different on kernel(older than 4.3),
which leads to invalid pointer access.

The fix creates specific netns_frags for ovs.

Signed-off-by: wangzhike <wangzhike@jd.com>
  • Loading branch information
wangzk320 authored and gvrose8192 committed Jun 23, 2017
1 parent eca6bb3 commit 875545f
Show file tree
Hide file tree
Showing 8 changed files with 138 additions and 123 deletions.
22 changes: 13 additions & 9 deletions datapath/datapath.c
Expand Up @@ -2297,6 +2297,8 @@ static int __net_init ovs_init_net(struct net *net)
INIT_LIST_HEAD(&ovs_net->dps);
INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
ovs_ct_init(net);
ovs_netns_frags_init(net);
ovs_netns_frags6_init(net);
return 0;
}

Expand Down Expand Up @@ -2332,6 +2334,8 @@ static void __net_exit ovs_exit_net(struct net *dnet)
struct net *net;
LIST_HEAD(head);

ovs_netns_frags6_exit(dnet);
ovs_netns_frags_exit(dnet);
ovs_ct_exit(dnet);
ovs_lock();
list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
Expand Down Expand Up @@ -2368,13 +2372,9 @@ static int __init dp_init(void)

pr_info("Open vSwitch switching datapath %s\n", VERSION);

err = compat_init();
if (err)
goto error;

err = action_fifos_init();
if (err)
goto error_compat_exit;
goto error;

err = ovs_internal_dev_rtnl_link_register();
if (err)
Expand All @@ -2392,10 +2392,14 @@ static int __init dp_init(void)
if (err)
goto error_vport_exit;

err = register_netdevice_notifier(&ovs_dp_device_notifier);
err = compat_init();
if (err)
goto error_netns_exit;

err = register_netdevice_notifier(&ovs_dp_device_notifier);
if (err)
goto error_compat_exit;

err = ovs_netdev_init();
if (err)
goto error_unreg_notifier;
Expand All @@ -2410,6 +2414,8 @@ static int __init dp_init(void)
ovs_netdev_exit();
error_unreg_notifier:
unregister_netdevice_notifier(&ovs_dp_device_notifier);
error_compat_exit:
compat_exit();
error_netns_exit:
unregister_pernet_device(&ovs_net_ops);
error_vport_exit:
Expand All @@ -2420,8 +2426,6 @@ static int __init dp_init(void)
ovs_internal_dev_rtnl_link_unregister();
error_action_fifos_exit:
action_fifos_exit();
error_compat_exit:
compat_exit();
error:
return err;
}
Expand All @@ -2431,13 +2435,13 @@ static void dp_cleanup(void)
dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
ovs_netdev_exit();
unregister_netdevice_notifier(&ovs_dp_device_notifier);
compat_exit();
unregister_pernet_device(&ovs_net_ops);
rcu_barrier();
ovs_vport_exit();
ovs_flow_exit();
ovs_internal_dev_rtnl_link_unregister();
action_fifos_exit();
compat_exit();
}

module_init(dp_init);
Expand Down
6 changes: 6 additions & 0 deletions datapath/datapath.h
Expand Up @@ -141,6 +141,12 @@ struct ovs_net {

/* Module reference for configuring conntrack. */
bool xt_label;

#ifdef HAVE_INET_FRAG_LRU_MOVE
struct net *net;
struct netns_frags ipv4_frags;
struct netns_frags nf_frags;
#endif
};

extern unsigned int ovs_net_id;
Expand Down
18 changes: 0 additions & 18 deletions datapath/linux/compat/include/net/inet_frag.h
Expand Up @@ -52,22 +52,4 @@ static inline int rpl_inet_frags_init(struct inet_frags *frags)
#define inet_frags_init rpl_inet_frags_init
#endif

#ifndef HAVE_CORRECT_MRU_HANDLING
/* We reuse the upstream inet_fragment.c common code for managing fragment
* stores, However we actually store the fragments within our own 'inet_frags'
* structures (in {ip_fragment,nf_conntrack_reasm}.c). When unloading the OVS
* kernel module, we need to flush all of the remaining fragments from these
* caches, or else we will panic with the following sequence of events:
*
* 1) A fragment for a packet arrives and is cached in inet_frags. This
* starts a timer to ensure the fragment does not hang around forever.
* 2) openvswitch module is unloaded.
* 3) The timer for the fragment fires, calling into backported OVS code
* to free the fragment.
* 4) BUG: unable to handle kernel paging request at ffffffffc03c01e0
*/
void rpl_inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
#define inet_frags_exit_net rpl_inet_frags_exit_net
#endif

#endif /* inet_frag.h */
4 changes: 4 additions & 0 deletions datapath/linux/compat/include/net/ip.h
Expand Up @@ -97,6 +97,8 @@ int rpl_ip_defrag(struct net *net, struct sk_buff *skb, u32 user);
#define ip_defrag rpl_ip_defrag
int __init rpl_ipfrag_init(void);
void rpl_ipfrag_fini(void);
void ovs_netns_frags_init(struct net *net);
void ovs_netns_frags_exit(struct net *net);

#else /* HAVE_CORRECT_MRU_HANDLING */

Expand Down Expand Up @@ -131,6 +133,8 @@ static inline int rpl_ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
* compat_{in,ex}it() can be no-ops. */
static inline int rpl_ipfrag_init(void) { return 0; }
static inline void rpl_ipfrag_fini(void) { }
static inline void ovs_netns_frags_init(struct net *net) { }
static inline void ovs_netns_frags_exit(struct net *net) { }
#endif /* HAVE_CORRECT_MRU_HANDLING */

#define ipfrag_init rpl_ipfrag_init
Expand Down
Expand Up @@ -28,9 +28,13 @@ int rpl_nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user);
*/
int __init rpl_nf_ct_frag6_init(void);
void rpl_nf_ct_frag6_cleanup(void);
void ovs_netns_frags6_init(struct net *net);
void ovs_netns_frags6_exit(struct net *net);
#else /* !OVS_NF_DEFRAG6_BACKPORT */
static inline int __init rpl_nf_ct_frag6_init(void) { return 0; }
static inline void rpl_nf_ct_frag6_cleanup(void) { }
static inline void ovs_netns_frags6_init(struct net *net) { }
static inline void ovs_netns_frags6_exit(struct net *net) { }
#endif /* OVS_NF_DEFRAG6_BACKPORT */
#define nf_ct_frag6_init rpl_nf_ct_frag6_init
#define nf_ct_frag6_cleanup rpl_nf_ct_frag6_cleanup
Expand Down
83 changes: 0 additions & 83 deletions datapath/linux/compat/inet_fragment.c
Expand Up @@ -27,88 +27,5 @@
#include <net/inet_frag.h>
#include <net/inet_ecn.h>

#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK
static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
{
return q->net->low_thresh == 0 ||
frag_mem_limit(q->net) >= q->net->low_thresh;
}

static unsigned int
inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
{
struct inet_frag_queue *fq;
struct hlist_node *n;
unsigned int evicted = 0;
HLIST_HEAD(expired);

spin_lock(&hb->chain_lock);

hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
if (!inet_fragq_should_evict(fq))
continue;

if (!del_timer(&fq->timer))
continue;

#ifdef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
hlist_add_head(&fq->list_evictor, &expired);
#else
hlist_del(&fq->list);
hlist_add_head(&fq->list, &expired);
#endif
++evicted;
}

spin_unlock(&hb->chain_lock);

#ifdef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
#else
hlist_for_each_entry_safe(fq, n, &expired, list)
#endif
f->frag_expire((unsigned long) fq);

return evicted;
}

void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
{
int thresh = nf->low_thresh;
unsigned int seq;
int i;

nf->low_thresh = 0;

evict_again:
local_bh_disable();
seq = read_seqbegin(&f->rnd_seqlock);

for (i = 0; i < INETFRAGS_HASHSZ ; i++)
inet_evict_bucket(f, &f->hash[i]);

local_bh_enable();
cond_resched();

if (read_seqretry(&f->rnd_seqlock, seq) ||
percpu_counter_sum(&nf->mem))
goto evict_again;

nf->low_thresh = thresh;
}
#else /* HAVE_INET_FRAGS_WITH_FRAGS_WORK */
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
{
int thresh = nf->low_thresh;

nf->low_thresh = 0;

local_bh_disable();
inet_frag_evictor(nf, f, true);
local_bh_enable();

nf->low_thresh = thresh;
}
#endif /* HAVE_INET_FRAGS_WITH_FRAGS_WORK */

#endif /* !HAVE_CORRECT_MRU_HANDLING */
66 changes: 57 additions & 9 deletions datapath/linux/compat/ip_fragment.c
Expand Up @@ -57,6 +57,8 @@
#include <net/inet_ecn.h>
#include <net/vrf.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
#include <net/netns/generic.h>
#include "datapath.h"

/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
* code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
Expand Down Expand Up @@ -107,6 +109,51 @@ struct ip4_create_arg {
int vif;
};

static struct netns_frags *get_netns_frags_from_net(struct net *net)
{
#ifdef HAVE_INET_FRAG_LRU_MOVE
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
return &(ovs_net->ipv4_frags);
#else
return &(net->ipv4.frags);
#endif
}

static struct net *get_net_from_netns_frags(struct netns_frags *frags)
{
struct net *net;
#ifdef HAVE_INET_FRAG_LRU_MOVE
struct ovs_net *ovs_net;

ovs_net = container_of(frags, struct ovs_net, ipv4_frags);
net = ovs_net->net;
#else
net = container_of(frags, struct net, ipv4.frags);
#endif
return net;
}

void ovs_netns_frags_init(struct net *net)
{
#ifdef HAVE_INET_FRAG_LRU_MOVE
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);

ovs_net->ipv4_frags.high_thresh = 4 * 1024 * 1024;
ovs_net->ipv4_frags.low_thresh = 3 * 1024 * 1024;
ovs_net->ipv4_frags.timeout = IP_FRAG_TIME;
inet_frags_init_net(&(ovs_net->ipv4_frags));
ovs_net->net = net;
#endif
}

void ovs_netns_frags_exit(struct net *net)
{
struct netns_frags *frags;

frags = get_netns_frags_from_net(net);
inet_frags_exit_net(frags, &ip4_frags);
}

static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
{
net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
Expand Down Expand Up @@ -158,9 +205,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
#endif
{
struct ipq *qp = container_of(q, struct ipq, q);
struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
frags);
struct net *net = container_of(ipv4, struct net, ipv4);
struct net *net = get_net_from_netns_frags(q->net);

const struct ip4_create_arg *arg = a;

Expand Down Expand Up @@ -219,7 +264,7 @@ static void ip_expire(unsigned long arg)
struct net *net;

qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
net = container_of(qp->q.net, struct net, ipv4.frags);
net = get_net_from_netns_frags(qp->q.net);

spin_lock(&qp->q.lock);

Expand Down Expand Up @@ -278,8 +323,10 @@ static void ip_expire(unsigned long arg)
static void ip_evictor(struct net *net)
{
int evicted;
struct netns_frags *frags;

evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
frags = get_netns_frags_from_net(net);
evicted = inet_frag_evictor(frags, &ip4_frags, false);
if (evicted)
IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
}
Expand All @@ -294,6 +341,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
struct inet_frag_queue *q;
struct ip4_create_arg arg;
unsigned int hash;
struct netns_frags *frags;

arg.iph = iph;
arg.user = user;
Expand All @@ -304,7 +352,8 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
#endif
hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);

q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
frags = get_netns_frags_from_net(net);
q = inet_frag_find(frags, &ip4_frags, &arg, hash);
if (IS_ERR_OR_NULL(q)) {
inet_frag_maybe_warn_overflow(q, pr_fmt());
return NULL;
Expand Down Expand Up @@ -333,7 +382,7 @@ static int ip_frag_too_far(struct ipq *qp)
if (rc) {
struct net *net;

net = container_of(qp->q.net, struct net, ipv4.frags);
net = get_net_from_netns_frags(qp->q.net);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
}

Expand Down Expand Up @@ -566,7 +615,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
struct net_device *dev)
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct net *net = get_net_from_netns_frags(qp->q.net);
struct iphdr *iph;
struct sk_buff *fp, *head = qp->q.fragments;
int len;
Expand Down Expand Up @@ -738,7 +787,6 @@ static int __net_init ipv4_frags_init_net(struct net *net)

static void __net_exit ipv4_frags_exit_net(struct net *net)
{
inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
}

static struct pernet_operations ip4_frags_ops = {
Expand Down

0 comments on commit 875545f

Please sign in to comment.