Skip to content

Commit

Permalink
This commit brings in a new refactored TCP stack called Rack.
Browse files Browse the repository at this point in the history
Rack includes the following features:
 - A different SACK processing scheme (the old sack structures are not used).
 - RACK (Recent ACKnowledgment), where counting dup-acks is no longer done;
        instead, time is used to know when to retransmit. (see the I-D)
 - TLP (Tail Loss Probe), where we probe for tail losses to try to avoid
        taking a retransmit time-out. (see the I-D)
 - Burst mitigation using TCPHPTS
 - PRR (Proportional Rate Reduction), see RFC 6937.

Once built into your kernel, you can select this stack either with the
TCP_FUNCTION_BLK socket option, giving "rack" as the stack name, or by
setting the global sysctl so that the default stack is rack.

Note that any connection that does not support SACK will be kicked
back to the base FreeBSD stack (currently known as "default").

To build this into your kernel, you will need to enable the following
in your kernel configuration:
   makeoptions WITH_EXTRA_TCP_STACKS=1
   options TCPHPTS

Sponsored by:	Netflix Inc.
Differential Revision:		https://reviews.freebsd.org/D15525
  • Loading branch information
rrs authored and rrs committed Jun 7, 2018
1 parent 59bb646 commit e4ec942
Show file tree
Hide file tree
Showing 19 changed files with 10,766 additions and 25 deletions.
49 changes: 49 additions & 0 deletions sys/kern/uipc_sockbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -1283,6 +1283,55 @@ sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff)
return (ret);
}

/*
 * Pick the mbuf from which a copy at send offset "off" should start and
 * store in *moff the offset to apply relative to the returned mbuf.
 *
 * The cached send pointer is never advanced here; it is only seeded with
 * the head of the buffer (offset 0) when it has not been set yet.
 */
struct mbuf *
sbsndptr_noadv(struct sockbuf *sb, uint32_t off, uint32_t *moff)
{

	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));

	if (sb->sb_sndptr == NULL) {
		/* Nothing cached yet: seed the cache and start at the head. */
		sb->sb_sndptr = sb->sb_mb;
		sb->sb_sndptroff = 0;
		*moff = off;
		return (sb->sb_mb);
	}
	if (sb->sb_sndptroff > off) {
		/* The cache lies beyond the request: start at the head. */
		*moff = off;
		return (sb->sb_mb);
	}

	/* The cache is at or before the request: start from the cache. */
	*moff = off - sb->sb_sndptroff;
	return (sb->sb_sndptr);
}

void
sbsndptr_adv(struct sockbuf *sb, struct mbuf *mb, uint32_t len)
{
/*
* A small copy was done, advance forward the sb_sbsndptr to cover
* it.
*/
struct mbuf *m;

if (mb != sb->sb_sndptr) {
/* Did not copyout at the same mbuf */
return;
}
m = mb;
while (m && (len > 0)) {
if (len >= m->m_len) {
len -= m->m_len;
if (m->m_next) {
sb->sb_sndptroff += m->m_len;
sb->sb_sndptr = m->m_next;
}
m = m->m_next;
} else {
len = 0;
}
}
}

/*
* Return the first mbuf and the mbuf data offset for the provided
* send offset without changing the "sb_sndptroff" field.
Expand Down
2 changes: 2 additions & 0 deletions sys/modules/tcp/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@ SYSDIR?=${SRCTOP}/sys

SUBDIR= \
${_tcp_fastpath} \
${_tcp_rack} \
${_tcpmd5} \

.if ${MK_EXTRA_TCP_STACKS} != "no" || defined(ALL_MODULES)
_tcp_fastpath= fastpath
_tcp_rack= rack
.endif

.if (${MK_INET_SUPPORT} != "no" || ${MK_INET6_SUPPORT} != "no") || \
Expand Down
24 changes: 24 additions & 0 deletions sys/modules/tcp/rack/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#
# $FreeBSD$
#

# Builds the kernel module for the "rack" TCP stack.

# Look for the sources in netinet/tcp_stacks, three directories up.
.PATH: ${.CURDIR}/../../../netinet/tcp_stacks

# The module name is derived from the stack name (tcp_rack).
STACKNAME= rack
KMOD= tcp_${STACKNAME}
SRCS= rack.c sack_filter.c

# opt_*.h option headers added to the source list.
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
SRCS+= opt_tcpdebug.h
SRCS+= opt_kern_tls.h

#
# Enable full debugging
#
#CFLAGS += -g

# Hand the module name, stack name and a stack alias to the compiler as
# preprocessor definitions.
CFLAGS+= -DMODNAME=${KMOD}
CFLAGS+= -DSTACKNAME=${STACKNAME}
CFLAGS+= -DSTACKALIAS=rack_18q21

.include <bsd.kmod.mk>
56 changes: 56 additions & 0 deletions sys/netinet/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ struct tcphdr {
device */
#define TCP_CONGESTION 64 /* get/set congestion control algorithm */
#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */
#define TCP_DELACK 72 /* socket option for delayed ack */
#define TCP_KEEPINIT 128 /* N, time to establish connection */
#define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */
#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */
Expand All @@ -184,6 +185,61 @@ struct tcphdr {
#define TCP_PCAP_OUT 2048 /* number of output packets to keep */
#define TCP_PCAP_IN 4096 /* number of input packets to keep */
#define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */
/*
 * Options for Rack and BBR.
 *
 * The values below are consecutive from 1051 to 1102, except that 1091
 * is not assigned in this list.
 */
#define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */
#define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */
#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacing reduction factor (divisor) */
#define TCP_RACK_PACE_MAX_SEG 1054 /* Max segments in a pace */
#define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */
#define TCP_RACK_PROP_RATE 1056 /* The proportional reduction rate */
#define TCP_RACK_PRR_SENDALOT 1057 /* Allow PRR to send more than one seg */
#define TCP_RACK_MIN_TO 1058 /* Minimum time between rack t-o's in ms */
#define TCP_RACK_EARLY_RECOV 1059 /* Should recovery happen early (bool) */
#define TCP_RACK_EARLY_SEG 1060 /* If early recovery max segments */
#define TCP_RACK_REORD_THRESH 1061 /* RACK reorder threshold (shift amount) */
#define TCP_RACK_REORD_FADE 1062 /* Does reordering fade after ms time */
#define TCP_RACK_TLP_THRESH 1063 /* RACK TLP threshold i.e. srtt+(srtt/N) */
#define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. rack-rtt + reord + N */
#define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */
#define TCP_RACK_SESS_CWV 1066 /* Enable RFC7661 cwnd validation on sess */
#define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */
#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer */
#define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */
#define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */
#define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */
#define TCP_BBR_PROBE_RTT_INT 1072 /* How long in useconds between probe-rtt */
#define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */
#define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */
#define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */
#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */
#define TCP_BBR_LOWGAIN_HALF 1077 /* Do we halfstep lowgain down */
#define TCP_BBR_LOWGAIN_FD 1078 /* Do we force a drain when lowgain in place */
#define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */
#define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */
#define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */
#define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override hpts settings 0/1/3 */
#define TCP_BBR_UNLIMITED 1083 /* Does BBR, in non-recovery not use cwnd */
#define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */
#define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */
#define TCP_BBR_PACE_PER_SEC 1086
#define TCP_BBR_PACE_DEL_TAR 1087
#define TCP_BBR_PACE_SEG_MAX 1088
#define TCP_BBR_PACE_SEG_MIN 1089
#define TCP_BBR_PACE_CROSS 1090
#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */
#define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */
#define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshold */
#define TCP_RACK_TLP_USE 1095
#define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */
#define TCP_BBR_EXTRA_GAIN 1097
#define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */
#define TCP_BBR_RETRAN_WTSO 1099
#define TCP_DATA_AFTER_CLOSE 1100
#define TCP_BBR_PROBE_RTT_GAIN 1101
#define TCP_BBR_PROBE_RTT_LEN 1102


/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR

Expand Down
2 changes: 1 addition & 1 deletion sys/netinet/tcp_log_buf.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ struct tcp_log_bbr {
uint16_t flex7;
uint8_t bbr_state;
uint8_t bbr_substate;
uint8_t inpacer;
uint8_t inhpts;
uint8_t ininput;
uint8_t use_lt_bw;
uint8_t flex8;
Expand Down
145 changes: 139 additions & 6 deletions sys/netinet/tcp_output.c
Original file line number Diff line number Diff line change
Expand Up @@ -143,18 +143,13 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto_lowat, CTLFLAG_VNET | CTLFLAG_R
tcp_timer_active((tp), TT_PERSIST), \
("neither rexmt nor persist timer is set"))

#ifdef TCP_HHOOK
static void inline hhook_run_tcp_est_out(struct tcpcb *tp,
struct tcphdr *th, struct tcpopt *to,
uint32_t len, int tso);
#endif
static void inline cc_after_idle(struct tcpcb *tp);

#ifdef TCP_HHOOK
/*
* Wrapper for the TCP established output helper hook.
*/
static void inline
void
hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
struct tcpopt *to, uint32_t len, int tso)
{
Expand Down Expand Up @@ -1851,6 +1846,144 @@ tcp_addoptions(struct tcpopt *to, u_char *optp)
return (optlen);
}

/*
 * This is a copy of m_copym(), taking the TSO segment size/limit
 * constraints into account, and advancing the sndptr as it goes.
 *
 * Parameters:
 *   m        - first mbuf of the chain to copy from.
 *   off0     - byte offset into the chain at which the copy starts
 *              (asserted to be non-negative).
 *   plen     - in: number of bytes requested, or M_COPYALL; out: rewritten
 *              with the number of bytes actually covered whenever the copy
 *              is cut short by the segment limit or by the end of the
 *              chain.  It is left untouched otherwise.
 *   seglimit - remaining segment budget; 0 disables the limit checks.
 *   segsize  - segment size used to estimate how many segments each mbuf
 *              consumes.
 *   sb       - optional socket buffer (may be NULL); when the mbuf that
 *              sb_sndptr points at is consumed to its end, sb_sndptr and
 *              sb_sndptroff are moved on to the next mbuf.
 *
 * Returns the head of the new chain.  NULL is returned when an mbuf or
 * packet header allocation fails (the partially built chain is freed),
 * and also when no mbuf was copied at all (zero length, or the loop
 * stopped before copying the first mbuf).
 */
struct mbuf *
tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
int32_t seglimit, int32_t segsize, struct sockbuf *sb)
{
struct mbuf *n, **np;
struct mbuf *top;
int32_t off = off0;
int32_t len = *plen;
int32_t fragsize;
int32_t len_cp = 0;
int32_t *pkthdrlen;
uint32_t mlen, frags;
bool copyhdr;


KASSERT(off >= 0, ("tcp_m_copym, negative off %d", off));
KASSERT(len >= 0, ("tcp_m_copym, negative len %d", len));
/*
 * Only a copy that starts at the very beginning of a packet header
 * mbuf carries the packet header over to the new chain.
 */
if (off == 0 && m->m_flags & M_PKTHDR)
copyhdr = true;
else
copyhdr = false;
/*
 * Skip the mbufs that lie entirely before the starting offset.  If the
 * socket buffer's send pointer sits on a skipped mbuf, move it along
 * and account for the skipped bytes.
 */
while (off > 0) {
KASSERT(m != NULL, ("tcp_m_copym, offset > size of mbuf chain"));
if (off < m->m_len)
break;
off -= m->m_len;
if ((sb) && (m == sb->sb_sndptr)) {
sb->sb_sndptroff += m->m_len;
sb->sb_sndptr = m->m_next;
}
m = m->m_next;
}
np = &top;
top = NULL;
/*
 * Set once a header mbuf has been copied, so that its packet length
 * can be corrected if the copy is truncated later on.
 */
pkthdrlen = NULL;
while (len > 0) {
if (m == NULL) {
/*
 * Ran off the end of the chain, which is only legal for
 * M_COPYALL.  Report what was actually copied.
 */
KASSERT(len == M_COPYALL,
("tcp_m_copym, length > size of mbuf chain"));
*plen = len_cp;
if (pkthdrlen != NULL)
*pkthdrlen = len_cp;
break;
}
/* Bytes taken from this mbuf, capped by what is still wanted. */
mlen = min(len, m->m_len - off);
if (seglimit) {
/*
 * For M_NOMAP mbufs, add 3 segments
 * + 1 in case we are crossing page boundaries
 * + 2 in case the TLS hdr/trailer are used
 * It is cheaper to just add the segments
 * than it is to take the cache miss to look
 * at the mbuf ext_pgs state in detail.
 */
if (m->m_flags & M_NOMAP) {
fragsize = min(segsize, PAGE_SIZE);
frags = 3;
} else {
fragsize = segsize;
frags = 0;
}

/* Break if we really can't fit anymore. */
if ((frags + 1) >= seglimit) {
*plen = len_cp;
if (pkthdrlen != NULL)
*pkthdrlen = len_cp;
break;
}

/*
 * Reduce size if you can't copy the whole
 * mbuf. If we can't copy the whole mbuf, also
 * adjust len so the loop will end after this
 * mbuf.
 */
if ((frags + howmany(mlen, fragsize)) >= seglimit) {
mlen = (seglimit - frags - 1) * fragsize;
len = mlen;
*plen = len_cp + len;
if (pkthdrlen != NULL)
*pkthdrlen = *plen;
}
/*
 * Charge this mbuf against the remaining segment budget; every
 * mbuf costs at least one segment.
 */
frags += howmany(mlen, fragsize);
if (frags == 0)
frags++;
seglimit -= frags;
KASSERT(seglimit > 0,
("%s: seglimit went too low", __func__));
}
/* Allocate the destination mbuf; only the first may get a header. */
if (copyhdr)
n = m_gethdr(M_NOWAIT, m->m_type);
else
n = m_get(M_NOWAIT, m->m_type);
/* Link it in before the NULL check so nospace frees the whole chain. */
*np = n;
if (n == NULL)
goto nospace;
if (copyhdr) {
if (!m_dup_pkthdr(n, m, M_NOWAIT))
goto nospace;
/*
 * Preset the packet length; pkthdrlen lets the truncation
 * paths above correct it on a later iteration.
 */
if (len == M_COPYALL)
n->m_pkthdr.len -= off0;
else
n->m_pkthdr.len = len;
pkthdrlen = &n->m_pkthdr.len;
copyhdr = false;
}
n->m_len = mlen;
len_cp += n->m_len;
if (m->m_flags & M_EXT) {
/*
 * External storage: point at the source data and let
 * mb_dupcl() (defined elsewhere) attach n to it --
 * presumably by reference rather than by copying.
 */
n->m_data = m->m_data + off;
mb_dupcl(n, m);
} else
bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
(u_int)n->m_len);

/*
 * This mbuf was consumed to its end: move the send pointer on to
 * the next mbuf, but never off the end of the chain.
 */
if (sb && (sb->sb_sndptr == m) &&
((n->m_len + off) >= m->m_len) && m->m_next) {
sb->sb_sndptroff += m->m_len;
sb->sb_sndptr = m->m_next;
}
/* Only the first copied mbuf starts at a non-zero offset. */
off = 0;
if (len != M_COPYALL) {
len -= n->m_len;
}
m = m->m_next;
np = &n->m_next;
}
return (top);
nospace:
/* Allocation failed: release whatever was built so far. */
m_freem(top);
return (NULL);
}

void
tcp_sndbuf_autoscale(struct tcpcb *tp, struct socket *so, uint32_t sendwin)
{
Expand Down
2 changes: 1 addition & 1 deletion sys/netinet/tcp_stacks/fastpath.c
Original file line number Diff line number Diff line change
Expand Up @@ -2392,7 +2392,7 @@ struct tcp_function_block __tcp_fastack = {
static int
tcp_addfastpaths(module_t mod, int type, void *data)
{
int err=0;
int err = 0;

switch (type) {
case MOD_LOAD:
Expand Down
Loading

0 comments on commit e4ec942

Please sign in to comment.