Skip to content

Commit

Permalink
pim: Retry mcast join if it fails.
Browse files Browse the repository at this point in the history
Races with interface creation/deletion can apparently cause the
multicast join to fail in the FEA because the VIF is not found there.
Retry the join when that happens.

Signed-off-by: Ben Greear <greearb@candelatech.com>
  • Loading branch information
greearb committed Sep 7, 2012
1 parent d8f1b1d commit 8c6d10c
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 22 deletions.
48 changes: 33 additions & 15 deletions xorp/pim/pim_vif.cc
Expand Up @@ -152,6 +152,8 @@ PimVif::PimVif(PimNode* pim_node, const Vif& vif)

set_default_config();

needs_join = false;

// Check for any cached configuration.
map<string, PVifPermInfo>::iterator i = perm_info.find(name());
if (i != perm_info.end()) {
Expand Down Expand Up @@ -335,6 +337,8 @@ PimVif::pim_mrt() const
/** System detected some change. */
void PimVif::notifyUpdated() {
int perm_started = -1;
string err_msg;

if (!wants_to_be_started) {
map<string, PVifPermInfo>::iterator i = perm_info.find(name());
if (i != perm_info.end()) {
Expand All @@ -345,7 +349,6 @@ void PimVif::notifyUpdated() {
XLOG_INFO("notifyUpdated, vif: %s wants-to-be-started: %i, perm-should-start: %i",
name().c_str(), (int)(wants_to_be_started), perm_started);
if (wants_to_be_started || (perm_started == 1)) {
string err_msg;
int rv = start(err_msg, "notifyUpdated, wants to be started");
if (rv == XORP_OK) {
XLOG_WARNING("notifyUpdated, successfully started pim_vif: %s",
Expand All @@ -356,8 +359,31 @@ void PimVif::notifyUpdated() {
name().c_str(), err_msg.c_str());
}
}
else {
// Maybe we need to (re)join?
if (needs_join) {
needs_join = false; // assume good things
try_join(err_msg);
}
}
}

/**
 * PimVif::try_join:
 * @error_msg: Set to a description of the failure when the join request
 * is rejected immediately.
 *
 * Ask the PIM node to join the ALL-PIM-ROUTERS multicast group on this vif.
 *
 * NOTE: even when this returns %XORP_OK the join can still fail; that is
 * not noticed until the XRL callback response comes back, and any fixup
 * is done at that point.
 *
 * Return value: %XORP_OK if the join request was issued, otherwise
 * %XORP_ERROR (with @error_msg filled in).
 **/
int PimVif::try_join(string& error_msg) {
    const IPvX all_pim_routers = IPvX::PIM_ROUTERS(family());

    if (pim_node()->join_multicast_group(name(), name(),
                                         pim_node()->ip_protocol_number(),
                                         all_pim_routers)
        == XORP_OK) {
        return XORP_OK;
    }

    // The request was refused up front: report why to the caller.
    error_msg = c_format("cannot join group %s on vif %s",
                         cstring(all_pim_routers), name().c_str());
    return XORP_ERROR;
}

/**
* PimVif::start:
Expand Down Expand Up @@ -399,7 +425,7 @@ int PimVif::start(string& error_msg, const char* dbg) {

if (! is_underlying_vif_up()) {
wants_to_be_started = true;
XLOG_WARNING("WARNING: Delaying start of pim-vif: %s because underlying vif is not up.",
XLOG_WARNING("Delaying start of pim-vif: %s because underlying vif is not up.",
name().c_str());
return XORP_OK;
}
Expand Down Expand Up @@ -460,19 +486,11 @@ int PimVif::start(string& error_msg, const char* dbg) {
return (XORP_ERROR);
}

if (! is_pim_register()) {
//
// Join the appropriate multicast groups: ALL-PIM-ROUTERS
//
const IPvX group = IPvX::PIM_ROUTERS(family());
if (pim_node()->join_multicast_group(name(),
name(),
pim_node()->ip_protocol_number(),
group)
!= XORP_OK) {
error_msg = c_format("cannot join group %s on vif %s",
cstring(group), name().c_str());
return (XORP_ERROR);
if (! is_pim_register()) {
needs_join = false;
if (try_join(error_msg) != XORP_OK) {
XLOG_WARNING("%s", error_msg.c_str());
needs_join = true;
}

pim_hello_start();
Expand Down
5 changes: 5 additions & 0 deletions xorp/pim/pim_vif.hh
Expand Up @@ -190,6 +190,10 @@ public:
*/
int start(string& error_msg, const char* dbg);

/** Try to join the PIM routers mcast group. */
int try_join(string& error_msg);
/** Record whether the PIM routers mcast group needs to be (re)joined. */
void setNeedsJoin(bool v)
{
    needs_join = v;
}

/** Attempt deferred start.
*/
void notifyUpdated();
Expand Down Expand Up @@ -782,6 +786,7 @@ private:
const IPvX& group_addr,
uint8_t group_mask_len);
bool wants_to_be_started; // as soon as we can, ie if the interface appears.
bool needs_join; // Need to (re)join the pim mcast group?
};

#endif // __PIM_PIM_VIF_HH__
25 changes: 21 additions & 4 deletions xorp/pim/xrl_pim_node.cc
Expand Up @@ -1543,11 +1543,28 @@ XrlPimNode::fea_client_send_join_leave_multicast_group_cb(
case COMMAND_FAILED:
//
// If a command failed because the other side rejected it, this is
// fatal.
// bad, but maybe we can retry....
//
XLOG_WARNING("Cannot %s a multicast group with the FEA: %s",
entry->operation_name(),
xrl_error.str().c_str());
XLOG_WARNING("Cannot %s a multicast group with the FEA, retries: %i: %s",
entry->operation_name(), entry->retries(),
xrl_error.str().c_str());
if (entry->is_join()) {
// Need to retry this later
if (entry->retries() > 1) {
// tell perm logic to retry
PimVif *pim_vif = PimNode::vif_find_by_name(entry->vif_name());
if (pim_vif) {
pim_vif->setNeedsJoin(true);
}
}
else {
// Just retry..maybe things are better now
XLOG_WARNING("Will retry failed XRL...\n");
entry->set_retries(entry->retries() + 1);
retry_xrl_task();
return;
}
}
break;

case NO_FINDER:
Expand Down
9 changes: 6 additions & 3 deletions xorp/pim/xrl_pim_node.hh
Expand Up @@ -2324,26 +2324,29 @@ private:
_vif_name(vif_name),
_ip_protocol(ip_protocol),
_group_address(group_address),
_is_join(is_join) {}
_is_join(is_join), _retries(0) {}

void dispatch() {
void dispatch() {
_xrl_pim_node.send_join_leave_multicast_group();
}
const char* operation_name() const {
const char* operation_name() const {
return ((_is_join)? "join" : "leave");
}
/** @return the stored interface name. */
const string& if_name() const
{
    return _if_name;
}
/** @return the stored vif name. */
const string& vif_name() const
{
    return _vif_name;
}
/** @return the stored IP protocol value. */
uint8_t ip_protocol() const
{
    return _ip_protocol;
}
/** @return the stored multicast group address. */
const IPvX& group_address() const
{
    return _group_address;
}
/** @return the stored join flag (operation_name() reports "join" when it is set, "leave" otherwise). */
bool is_join() const
{
    return _is_join;
}
/** @return the retry counter currently stored for this entry. */
int retries() const
{
    return _retries;
}
/** Overwrite the retry counter stored for this entry. */
void set_retries(int i)
{
    _retries = i;
}

private:
string _if_name;
string _vif_name;
uint8_t _ip_protocol;
IPvX _group_address;
bool _is_join;
int _retries;
};

/**
Expand Down

0 comments on commit 8c6d10c

Please sign in to comment.