Skip to content
Permalink
Browse files

Implement a CPU-affine TCP and UDP connection lookup data structure,

struct inpcbgroup.  pcbgroups, or "connection groups", supplement the
existing inpcbinfo connection hash table, which when pcbgroups are
enabled, might now be thought of more usefully as a per-protocol
4-tuple reservation table.

Connections are assigned to connection groups base on a hash of their
4-tuple; wildcard sockets require special handling, and are members
of all connection groups.  During a connection lookup, a
per-connection group lock is employed rather than the global pcbinfo
lock.  By aligning connection groups with input path processing,
connection groups take on an effective CPU affinity, especially when
aligned with RSS work placement (see a forthcoming commit for
details).  This eliminates cache line migration associated with
global, protocol-layer data structures in steady state TCP and UDP
processing (with the exception of protocol-layer statistics; further
commit to follow).

Elements of this approach were inspired by Willman, Rixner, and Cox's
2006 USENIX paper, "An Evaluation of Network Stack Parallelization
Strategies in Modern Operating Systems".  However, there are also
significant differences: we maintain the inpcb lock, rather than using
the connection group lock for per-connection state.

Likewise, the focus of this implementation is alignment with NIC
packet distribution strategies such as RSS, rather than pure software
strategies.  Despite that focus, software distribution is supported
through the parallel netisr implementation, and works well in
configurations where the number of hardware threads is greater than
the number of NIC input queues, such as in the RMI XLR threaded MIPS
architecture.

Another important difference is the continued maintenance of existing
hash tables as "reservation tables" -- these are useful both to
distinguish the resource allocation aspect of protocol name management
and the more common-case lookup aspect.  In configurations where
connection tables are aligned with hardware hashes, it is desirable to
use the traditional lookup tables for loopback or encapsulated traffic
rather than take the expense of hardware hashes that are hard to
implement efficiently in software (such as RSS Toeplitz).

Connection group support is enabled by compiling "options PCBGROUP"
into your kernel configuration; for the time being, this is an
experimental feature, and hence is not enabled by default.

Subject to the limited MFCability of change dependencies in inpcb,
and its change to the inpcbinfo init function signature, this change
in principle could be merged to FreeBSD 8.x.

Reviewed by:    bz
Sponsored by:   Juniper Networks, Inc.
  • Loading branch information...
rwatson committed Jun 6, 2011
1 parent 8faa1a4 commit 6e29aea1dbf128b84b885f9acc6396c69ab080ce
@@ -2748,6 +2748,7 @@ netinet/ip_gre.c optional gre inet
netinet/ip_id.c optional inet
netinet/in_mcast.c optional inet
netinet/in_pcb.c optional inet | inet6
netinet/in_pcbgroup.c optional inet pcbgroup | inet6 pcbgroup
netinet/in_proto.c optional inet | inet6 \
compile-with "${NORMAL_C} -I$S/contrib/pf"
netinet/in_rmx.c optional inet
@@ -2825,6 +2826,7 @@ netinet6/in6_gif.c optional gif inet6 | netgraph_gif inet6
netinet6/in6_ifattach.c optional inet6
netinet6/in6_mcast.c optional inet6
netinet6/in6_pcb.c optional inet6
netinet6/in6_pcbgroup.c optional inet6 pcbgroup
netinet6/in6_proto.c optional inet6
netinet6/in6_rmx.c optional inet6
netinet6/in6_src.c optional inet6
@@ -419,6 +419,7 @@ MROUTING opt_mrouting.h
NCP
NETATALK opt_atalk.h
NFSLOCKD
PCBGROUP opt_pcbgroup.h
RADIX_MPATH opt_mpath.h
ROUTETABLES opt_route.h
SLIP_IFF_OPTS opt_slip.h
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
#include "opt_ipsec.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_pcbgroup.h"

#include <sys/param.h>
#include <sys/systm.h>
@@ -212,7 +213,7 @@ void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini,
uint32_t inpcbzone_flags)
uint32_t inpcbzone_flags, u_int hashfields)
{

INP_INFO_LOCK_INIT(pcbinfo, name);
@@ -227,6 +228,9 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
&pcbinfo->ipi_hashmask);
pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
&pcbinfo->ipi_porthashmask);
#ifdef PCBGROUP
in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
#endif
pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR,
inpcbzone_flags);
@@ -246,6 +250,9 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
pcbinfo->ipi_porthashmask);
#ifdef PCBGROUP
in_pcbgroup_destroy(pcbinfo);
#endif
uma_zdestroy(pcbinfo->ipi_zone);
INP_HASH_LOCK_DESTROY(pcbinfo);
INP_INFO_LOCK_DESTROY(pcbinfo);
@@ -1053,7 +1060,8 @@ in_pcbdetach(struct inpcb *inp)
* in_pcbref() bumps the reference count on an inpcb in order to maintain
* stability of an inpcb pointer despite the inpcb lock being released. This
* is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
* but where the inpcb lock is already held.
* but where the inpcb lock may already held, or when acquiring a reference
* via a pcbgroup.
*
* in_pcbref() should be used only to provide brief memory stability, and
* must always be followed by a call to INP_WLOCK() and in_pcbrele() to
@@ -1223,6 +1231,9 @@ in_pcbdrop(struct inpcb *inp)
}
INP_HASH_WUNLOCK(inp->inp_pcbinfo);
inp->inp_flags &= ~INP_INHASHLIST;
#ifdef PCBGROUP
in_pcbgroup_remove(inp);
#endif
}
}

@@ -1472,6 +1483,148 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
}
#undef INP_LOOKUP_MAPPED_PCB_COST

#ifdef PCBGROUP
/*
* Lookup PCB in hash list, using pcbgroup tables.
*/
static struct inpcb *
in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
u_int lport_arg, int lookupflags, struct ifnet *ifp)
{
struct inpcbhead *head;
struct inpcb *inp, *tmpinp;
u_short fport = fport_arg, lport = lport_arg;

/*
* First look for an exact match.
*/
tmpinp = NULL;
INP_GROUP_LOCK(pcbgroup);
head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
pcbgroup->ipg_hashmask)];
LIST_FOREACH(inp, head, inp_pcbgrouphash) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_faddr.s_addr == faddr.s_addr &&
inp->inp_laddr.s_addr == laddr.s_addr &&
inp->inp_fport == fport &&
inp->inp_lport == lport) {
/*
* XXX We should be able to directly return
* the inp here, without any checks.
* Well unless both bound with SO_REUSEPORT?
*/
if (prison_flag(inp->inp_cred, PR_IP4))
goto found;
if (tmpinp == NULL)
tmpinp = inp;
}
}
if (tmpinp != NULL) {
inp = tmpinp;
goto found;
}

/*
* Then look for a wildcard match, if requested.
*/
if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
struct inpcb *local_wild = NULL, *local_exact = NULL;
#ifdef INET6
struct inpcb *local_wild_mapped = NULL;
#endif
struct inpcb *jail_wild = NULL;
struct inpcbhead *head;
int injail;

/*
* Order of socket selection - we always prefer jails.
* 1. jailed, non-wild.
* 2. jailed, wild.
* 3. non-jailed, non-wild.
* 4. non-jailed, wild.
*/
head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
0, pcbinfo->ipi_wildmask)];
LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_faddr.s_addr != INADDR_ANY ||
inp->inp_lport != lport)
continue;

/* XXX inp locking */
if (ifp && ifp->if_type == IFT_FAITH &&
(inp->inp_flags & INP_FAITH) == 0)
continue;

injail = prison_flag(inp->inp_cred, PR_IP4);
if (injail) {
if (prison_check_ip4(inp->inp_cred,
&laddr) != 0)
continue;
} else {
if (local_exact != NULL)
continue;
}

if (inp->inp_laddr.s_addr == laddr.s_addr) {
if (injail)
goto found;
else
local_exact = inp;
} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#ifdef INET6
/* XXX inp locking, NULL check */
if (inp->inp_vflag & INP_IPV6PROTO)
local_wild_mapped = inp;
else
#endif /* INET6 */
if (injail)
jail_wild = inp;
else
local_wild = inp;
}
} /* LIST_FOREACH */
inp = jail_wild;
if (inp == NULL)
inp = local_exact;
if (inp == NULL)
inp = local_wild;
#ifdef INET6
if (inp == NULL)
inp = local_wild_mapped;
#endif /* defined(INET6) */
if (inp != NULL)
goto found;
} /* if (lookupflags & INPLOOKUP_WILDCARD) */
INP_GROUP_UNLOCK(pcbgroup);
return (NULL);

found:
in_pcbref(inp);
INP_GROUP_UNLOCK(pcbgroup);
if (lookupflags & INPLOOKUP_WLOCKPCB) {
INP_WLOCK(inp);
if (in_pcbrele_wlocked(inp))
return (NULL);
} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
INP_RLOCK(inp);
if (in_pcbrele_rlocked(inp))
return (NULL);
} else
panic("%s: locking bug", __func__);
return (inp);
}
#endif /* PCBGROUP */

/*
* Lookup PCB in hash list, using pcbinfo tables. This variation assumes
* that the caller has locked the hash list, and will not perform any further
@@ -1636,17 +1789,30 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
/*
* Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
* from which a pre-calculated hash value may be extracted.
*
* Possibly more of this logic should be in in_pcbgroup.c.
*/
struct inpcb *
in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
{
#if defined(PCBGROUP)
struct inpcbgroup *pcbgroup;
#endif

KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
("%s: invalid lookup flags %d", __func__, lookupflags));
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
("%s: LOCKPCB not set", __func__));

#if defined(PCBGROUP)
if (in_pcbgroup_enabled(pcbinfo)) {
pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
fport);
return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
laddr, lport, lookupflags, ifp));
}
#endif
return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
lookupflags, ifp));
}
@@ -1656,12 +1822,28 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
struct ifnet *ifp, struct mbuf *m)
{
#ifdef PCBGROUP
struct inpcbgroup *pcbgroup;
#endif

KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
("%s: invalid lookup flags %d", __func__, lookupflags));
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
("%s: LOCKPCB not set", __func__));

#ifdef PCBGROUP
if (in_pcbgroup_enabled(pcbinfo)) {
pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
m->m_pkthdr.flowid);
if (pcbgroup != NULL)
return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
fport, laddr, lport, lookupflags, ifp));
pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
fport);
return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
laddr, lport, lookupflags, ifp));
}
#endif
return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
lookupflags, ifp));
}
@@ -1670,8 +1852,8 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
/*
* Insert PCB onto various hash lists.
*/
int
in_pcbinshash(struct inpcb *inp)
static int
in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
{
struct inpcbhead *pcbhash;
struct inpcbporthead *pcbporthash;
@@ -1721,9 +1903,38 @@ in_pcbinshash(struct inpcb *inp)
LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
inp->inp_flags |= INP_INHASHLIST;
#ifdef PCBGROUP
if (do_pcbgroup_update)
in_pcbgroup_update(inp);
#endif
return (0);
}

/*
* For now, there are two public interfaces to insert an inpcb into the hash
* lists -- one that does update pcbgroups, and one that doesn't. The latter
* is used only in the TCP syncache, where in_pcbinshash is called before the
* full 4-tuple is set for the inpcb, and we don't want to install in the
* pcbgroup until later.
*
* XXXRW: This seems like a misfeature. in_pcbinshash should always update
* connection groups, and partially initialised inpcbs should not be exposed
* to either reservation hash tables or pcbgroups.
*/
int
in_pcbinshash(struct inpcb *inp)
{

return (in_pcbinshash_internal(inp, 1));
}

int
in_pcbinshash_nopcbgroup(struct inpcb *inp)
{

return (in_pcbinshash_internal(inp, 0));
}

/*
* Move PCB to the proper hash bucket when { faddr, fport } have been
* changed. NOTE: This does not handle the case of the lport changing (the
@@ -1755,6 +1966,13 @@ in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)

LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);

#ifdef PCBGROUP
if (m != NULL)
in_pcbgroup_update_mbuf(inp, m);
else
in_pcbgroup_update(inp);
#endif
}

void
@@ -1791,6 +2009,9 @@ in_pcbremlists(struct inpcb *inp)
}
LIST_REMOVE(inp, inp_list);
pcbinfo->ipi_count--;
#ifdef PCBGROUP
in_pcbgroup_remove(inp);
#endif
}

/*

0 comments on commit 6e29aea

Please sign in to comment.
You can’t perform that action at this time.