Skip to content

Commit

Permalink
Fix the design problem with delayed algorithm sync.
Browse files Browse the repository at this point in the history
Currently, if the immutable algorithm like bsearch or radix_lockless
 receives rtable update notification, it schedules algorithm rebuild.
This rebuild is executed by the callout after ~50 milliseconds.

It is possible that a script adding an interface address and then a route
 with the gateway bound to that address will fail. It can happen due
 to the fact that fib is not updated by the time the route addition
 request arrives.

Fix this by allowing synchronous algorithm rebuilds based on certain
 conditions. By default, these conditions assume:
1) less than net.route.algo.fib_sync_limit=100 routes
2) routes without gateway.

* Move algo instance build entirely under rib WLOCK.
 Rib lock is only used for control plane (except radix algo, but there
  are no rebuilds).
* Add rib_walk_ext_locked() function to allow RIB iteration with
 rib lock already held.
* Fix rare potential callout use-after-free for fds by binding fd
 callout to the relevant rib rmlock. In that case, callout_stop()
 under rib WLOCK guarantees no callout will be executed afterwards.

MFC after:	3 days
  • Loading branch information
AlexanderChernikov committed Jan 30, 2021
1 parent dd91630 commit 151ec79
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 42 deletions.
110 changes: 73 additions & 37 deletions sys/net/route/fib_algo.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,11 @@ SYSCTL_DECL(_net_route);
SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Fib algorithm lookups");

VNET_DEFINE(int, fib_sync_limit) = 100;
#define V_fib_sync_limit VNET(fib_sync_limit)
SYSCTL_INT(_net_route_algo, OID_AUTO, fib_sync_limit, CTLFLAG_RW | CTLFLAG_VNET,
&VNET_NAME(fib_sync_limit), 0, "Guarantee synchronous fib till route limit");

#ifdef INET6
VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
#define V_algo_fixed_inet6 VNET(algo_fixed_inet6)
Expand Down Expand Up @@ -465,7 +470,8 @@ static void
schedule_fd_rebuild(struct fib_data *fd, const char *reason)
{

FIB_MOD_LOCK();
RIB_WLOCK_ASSERT(fd->fd_rh);

if (!fd->fd_need_rebuild) {
fd->fd_need_rebuild = true;

Expand All @@ -477,30 +483,52 @@ schedule_fd_rebuild(struct fib_data *fd, const char *reason)
reason, fd->fd_failed_rebuilds);
schedule_callout(fd, callout_calc_delay_ms(fd));
}
FIB_MOD_UNLOCK();
}

/*
 * Notes that the RIB has changed and decides when to (re-)evaluate whether
 * a better lookup algorithm should be selected for this fib.
 *
 * Called with the RIB write lock held (asserted below); the per-module
 * FIB_MOD_LOCK is additionally taken around callout state manipulation.
 */
static void
schedule_algo_eval(struct fib_data *fd)
{

RIB_WLOCK_ASSERT(fd->fd_rh);

if (fd->fd_num_changes++ == 0) {
/* Start callout to consider switch */
FIB_MOD_LOCK();
/* Avoid re-arming if an evaluation callout is already queued */
if (!callout_pending(&fd->fd_callout))
schedule_callout(fd, ALGO_EVAL_DELAY_MS);
FIB_MOD_UNLOCK();
} else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) {
/* Reset callout to exec immediately */
FIB_MOD_LOCK();
/*
 * Only force immediate evaluation when no full rebuild is
 * already pending; a rebuild re-evaluates the algorithm anyway.
 */
if (!fd->fd_need_rebuild) {
fd->fd_force_eval = true;
schedule_callout(fd, 1);
}
FIB_MOD_UNLOCK();
}
}

static bool
need_immediate_rebuild(struct fib_data *fd, struct rib_cmd_info *rc)
{
struct nhop_object *nh;

if ((V_fib_sync_limit == 0) || (fd->fd_rh->rnh_prefixes <= V_fib_sync_limit))
return (true);

/* Sync addition/removal of interface routes */
switch (rc->rc_cmd) {
case RTM_ADD:
nh = rc->rc_nh_new;
if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
return (true);
break;
case RTM_DELETE:
nh = rc->rc_nh_old;
if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
return (true);
break;
}

return (false);
}

/*
* Rib subscription handler. Checks if the algorithm is ready to
* receive updates, handles nexthop refcounting and passes change
Expand Down Expand Up @@ -559,7 +587,15 @@ handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
* Algo is not able to apply the update.
* Schedule algo rebuild.
*/
schedule_fd_rebuild(fd, "algo requested rebuild");
if (!need_immediate_rebuild(fd, rc)) {
schedule_fd_rebuild(fd, "algo requested rebuild");
break;
}

fd->fd_need_rebuild = true;
FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
if (!rebuild_fd(fd))
schedule_fd_rebuild(fd, "sync rebuild failed");
break;
case FLM_ERROR:

Expand Down Expand Up @@ -678,7 +714,7 @@ sync_algo(struct fib_data *fd)
.result = FLM_SUCCESS,
};

rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w);
rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);

FD_PRINTF(LOG_INFO, fd,
"initial dump completed (rtable version: %d), result: %s",
Expand All @@ -702,6 +738,7 @@ schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
bool is_dead;

NET_EPOCH_ASSERT();
RIB_WLOCK_ASSERT(fd->fd_rh);

FIB_MOD_LOCK();
is_dead = fd->fd_dead;
Expand All @@ -718,27 +755,13 @@ schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
FD_PRINTF(LOG_INFO, fd, "DETACH");

if (fd->fd_rs != NULL)
rib_unsibscribe(fd->fd_rs);
rib_unsibscribe_locked(fd->fd_rs);

/*
* After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
* will be executed, hence no _new_ callout schedules will happen.
*
* There can be 2 possible scenarious here:
* 1) we're running inside a callout when we're deleting ourselves
* due to migration to a newer fd
* 2) we're running from rt_table_destroy() and callout is scheduled
* for execution OR is executing
*
* For (2) we need to wait for the callout termination, as the routing table
* will be destroyed after this function returns.
* For (1) we cannot call drain, but can ensure that this is the last invocation.
*/

if (in_callout)
callout_stop(&fd->fd_callout);
else
callout_drain(&fd->fd_callout);
callout_stop(&fd->fd_callout);

epoch_call(net_epoch_preempt, destroy_fd_instance_epoch,
&fd->fd_epoch_ctx);
Expand Down Expand Up @@ -774,7 +797,11 @@ fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
/* Pass 2: remove each entry */
NET_EPOCH_ENTER(et);
TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
if (!in_callout)
RIB_WLOCK(fd->fd_rh);
schedule_destroy_fd_instance(fd, in_callout);
if (!in_callout)
RIB_WUNLOCK(fd->fd_rh);
}
NET_EPOCH_EXIT(et);
}
Expand Down Expand Up @@ -870,7 +897,7 @@ try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
fd->fd_gen = ++fib_gen;
fd->fd_family = rh->rib_family;
fd->fd_fibnum = rh->rib_fibnum;
callout_init(&fd->fd_callout, 1);
callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
fd->fd_vnet = curvnet;
fd->fd_flm = flm;

Expand Down Expand Up @@ -908,8 +935,8 @@ try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,

/* Try to subscribe */
if (flm->flm_change_rib_item_cb != NULL) {
fd->fd_rs = rib_subscribe_internal(fd->fd_rh,
handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE, 0);
fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
if (fd->fd_rs == NULL) {
FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
return (FLM_REBUILD);
Expand Down Expand Up @@ -946,13 +973,14 @@ setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
{
struct fib_data *prev_fd, *new_fd;
struct epoch_tracker et;
enum flm_op_result result;

NET_EPOCH_ASSERT();
RIB_WLOCK_ASSERT(rh);

prev_fd = orig_fd;
new_fd = NULL;
for (int i = 0; i < FIB_MAX_TRIES; i++) {
NET_EPOCH_ENTER(et);
result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);

if ((result == FLM_SUCCESS) && attach)
Expand All @@ -962,7 +990,6 @@ setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
schedule_destroy_fd_instance(prev_fd, false);
prev_fd = NULL;
}
NET_EPOCH_EXIT(et);

RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
print_op_result(result));
Expand Down Expand Up @@ -990,14 +1017,12 @@ setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
if (result == FLM_ERROR)
flm_error_add(flm, rh->rib_fibnum);

NET_EPOCH_ENTER(et);
if ((prev_fd != NULL) && (prev_fd != orig_fd))
schedule_destroy_fd_instance(prev_fd, false);
if (new_fd != NULL) {
schedule_destroy_fd_instance(new_fd, false);
new_fd = NULL;
}
NET_EPOCH_EXIT(et);
}

*pfd = new_fd;
Expand All @@ -1013,12 +1038,15 @@ static void
rebuild_fd_callout(void *_data)
{
struct fib_data *fd = (struct fib_data *)_data;
struct epoch_tracker et;

FD_PRINTF(LOG_INFO, fd, "running callout rebuild");

NET_EPOCH_ENTER(et);
CURVNET_SET(fd->fd_vnet);
rebuild_fd(fd);
CURVNET_RESTORE();
NET_EPOCH_EXIT(et);
}

/*
Expand All @@ -1030,17 +1058,16 @@ rebuild_fd(struct fib_data *fd)
{
struct fib_data *fd_new, *fd_tmp;
struct fib_lookup_module *flm_new = NULL;
struct epoch_tracker et;
enum flm_op_result result;
bool need_rebuild = false;

NET_EPOCH_ASSERT();
RIB_WLOCK_ASSERT(fd->fd_rh);

FIB_MOD_LOCK();
need_rebuild = fd->fd_need_rebuild;
fd->fd_need_rebuild = false;
fd->fd_force_eval = false;
fd->fd_num_changes = 0;
FIB_MOD_UNLOCK();

/* First, check if we're still OK to use this algo */
if (!is_algo_fixed(fd->fd_rh))
Expand Down Expand Up @@ -1069,9 +1096,7 @@ rebuild_fd(struct fib_data *fd)
FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");

/* Remove old instance */
NET_EPOCH_ENTER(et);
schedule_destroy_fd_instance(fd, true);
NET_EPOCH_EXIT(et);

return (true);
}
Expand Down Expand Up @@ -1116,6 +1141,7 @@ set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl
char old_algo_name[32], algo_name[32];
struct rib_head *rh = NULL;
enum flm_op_result result;
struct epoch_tracker et;
int error;

/* Fetch current algo/rib for af/family */
Expand Down Expand Up @@ -1149,7 +1175,11 @@ set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl
}

fd = NULL;
NET_EPOCH_ENTER(et);
RIB_WLOCK(rh);
result = setup_fd_instance(flm, rh, NULL, &fd, true);
RIB_WUNLOCK(rh);
NET_EPOCH_EXIT(et);
fib_unref_algo(flm);
if (result != FLM_SUCCESS)
return (EINVAL);
Expand Down Expand Up @@ -1558,6 +1588,7 @@ fib_select_algo_initial(struct rib_head *rh)
struct fib_lookup_module *flm;
struct fib_data *fd = NULL;
enum flm_op_result result;
struct epoch_tracker et;
int error = 0;

flm = fib_check_best_algo(rh, NULL);
Expand All @@ -1567,7 +1598,12 @@ fib_select_algo_initial(struct rib_head *rh)
}
RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);

NET_EPOCH_ENTER(et);
RIB_WLOCK(rh);
result = setup_fd_instance(flm, rh, NULL, &fd, false);
RIB_WUNLOCK(rh);
NET_EPOCH_EXIT(et);

RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
if (result == FLM_SUCCESS) {

Expand Down
2 changes: 2 additions & 0 deletions sys/net/route/route_ctl.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ void rib_walk_ext(uint32_t fibnum, int af, bool wlock, rib_walktree_f_t *wa_f,
rib_walk_hook_f_t *hook_f, void *arg);
void rib_walk_ext_internal(struct rib_head *rnh, bool wlock,
rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg);
void rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f,
rib_walk_hook_f_t *hook_f, void *arg);

void rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f,
void *arg, bool report);
Expand Down
17 changes: 12 additions & 5 deletions sys/net/route/route_helpers.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,17 @@ __FBSDID("$FreeBSD$");
* RIB helper functions.
*/

/*
 * Iterates over all prefixes in the routing table @rnh, calling @wa_f with
 * @arg for each entry.  If @hook_f is non-NULL it is invoked before and
 * after the walk with RIB_WALK_HOOK_PRE/RIB_WALK_HOOK_POST.
 *
 * Assumes the caller already holds the appropriate rib lock.
 */
void
rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f,
    rib_walk_hook_f_t *hook_f, void *arg)
{
	bool have_hook = (hook_f != NULL);

	if (have_hook)
		hook_f(rnh, RIB_WALK_HOOK_PRE, arg);

	rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);

	if (have_hook)
		hook_f(rnh, RIB_WALK_HOOK_POST, arg);
}

/*
* Calls @wa_f with @arg for each entry in the table specified by
* @af and @fibnum.
Expand All @@ -86,11 +97,7 @@ rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f,
RIB_WLOCK(rnh);
else
RIB_RLOCK(rnh);
if (hook_f != NULL)
hook_f(rnh, RIB_WALK_HOOK_PRE, arg);
rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);
if (hook_f != NULL)
hook_f(rnh, RIB_WALK_HOOK_POST, arg);
rib_walk_ext_locked(rnh, wa_f, hook_f, arg);
if (wlock)
RIB_WUNLOCK(rnh);
else
Expand Down

0 comments on commit 151ec79

Please sign in to comment.