kernel 2.6.36: improve sack handling and resource usage
upstream commits:
967c05a tcp: enforce tcp_min_snd_mss in tcp_mtu_probing()
5f3e2bf tcp: add tcp_min_snd_mss sysctl
f070ef2 tcp: tcp_fragment() should apply sane memory limits
3b4929f tcp: limit payload size of sacked skbs
f331981 tcp: pass previous skb to tcp_shifted_skb()
themiron committed Jun 20, 2019
1 parent 5e4d494 commit dbd4abf
Showing 30 changed files with 183 additions and 27 deletions.
@@ -158,6 +158,14 @@ tcp_base_mss - INTEGER
 	Path MTU discovery (MTU probing). If MTU probing is enabled,
 	this is the initial MSS used by the connection.
 
+tcp_min_snd_mss - INTEGER
+	TCP SYN and SYNACK messages usually advertise an ADVMSS option,
+	as described in RFC 1122 and RFC 6691.
+	If this ADVMSS option is smaller than tcp_min_snd_mss,
+	it is silently capped to tcp_min_snd_mss.
+
+	Default : 48 (at least 8 bytes of payload per segment)
+
 tcp_congestion_control - STRING
 	Set the congestion control algorithm to be used for new
 	connections. The algorithm "reno" is always available, but

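Not part of the patch, just an illustration of the documented behavior: a minimal C sketch of the capping rule with the default floor. The helper name is hypothetical.

/* illustration only -- hypothetical helper, not kernel code */
#include <stdio.h>

#define TCP_MIN_SND_MSS      48   /* default floor added by this patch */
#define MAX_TCP_OPTION_SPACE 40   /* worst-case TCP option bytes */

static int effective_snd_mss(int advmss)
{
	/* an absurdly small advertised MSS is silently raised to the floor */
	return advmss < TCP_MIN_SND_MSS ? TCP_MIN_SND_MSS : advmss;
}

int main(void)
{
	/* 48 total - 40 option bytes = at least 8 payload bytes per segment */
	printf("mss=%d\n", effective_snd_mss(8));   /* prints mss=48 */
	return 0;
}
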
@@ -230,6 +230,7 @@ enum
 	LINUX_MIB_TCPMINTTLDROP, /* RFC 5082 */
 	LINUX_MIB_TCPDEFERACCEPTDROP,
 	LINUX_MIB_IPRPFILTER, /* IP Reverse Path Filter (rp_filter) */
+	LINUX_MIB_TCPWQUEUETOOBIG, /* TCPWqueueTooBig */
 	__LINUX_MIB_MAX
 };

@@ -488,6 +488,9 @@ static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
 	return (struct tcp_timewait_sock *)sk;
 }
 
+int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount,
+		  int shiftlen);
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_TCP_H */

3 changes: 3 additions & 0 deletions release/src-rt-6.x.4708/linux/linux-2.6.36/include/net/tcp.h
@@ -53,6 +53,8 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 
 #define MAX_TCP_HEADER (128 + MAX_HEADER)
 #define MAX_TCP_OPTION_SPACE 40
+#define TCP_MIN_SND_MSS 48
+#define TCP_MIN_GSO_SIZE (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE)
 
 /*
  * Never offer a window over 32767 without using window scaling. Some

@@ -240,6 +242,7 @@ extern int sysctl_tcp_tso_win_divisor;
 extern int sysctl_tcp_abc;
 extern int sysctl_tcp_mtu_probing;
 extern int sysctl_tcp_base_mss;
+extern int sysctl_tcp_min_snd_mss;
 extern int sysctl_tcp_workaround_signed_windows;
 extern int sysctl_tcp_slow_start_after_idle;
 extern int sysctl_tcp_max_ssthresh;

1 change: 1 addition & 0 deletions release/src-rt-6.x.4708/linux/linux-2.6.36/net/ipv4/proc.c
@@ -253,6 +253,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
 	SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
 	SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
+	SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG),
 	SNMP_MIB_SENTINEL
 };

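The new counter surfaces with the other TcpExt statistics; a short sketch (not from the patch) that dumps those rows from procfs:

/* illustration only: show the TcpExt rows of /proc/net/netstat,
 * where TCPWqueueTooBig appears once this patch is running
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/net/netstat", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strncmp(line, "TcpExt:", 7) == 0)
			fputs(line, stdout);   /* header row + value row */
	fclose(f);
	return 0;
}
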
@@ -26,6 +26,8 @@ static int zero;
 static int tcp_retr1_max = 255;
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
+static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS;
+static int tcp_min_snd_mss_max = 65535;
 
 /* Update system visible IP port range */
 static void set_local_port_range(int range[2])

@@ -504,6 +506,15 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "tcp_min_snd_mss",
+		.data		= &sysctl_tcp_min_snd_mss,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &tcp_min_snd_mss_min,
+		.extra2		= &tcp_min_snd_mss_max,
+	},
 	{
 		.procname	= "tcp_workaround_signed_windows",
 		.data		= &sysctl_tcp_workaround_signed_windows,

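Once this entry is registered the knob appears as /proc/sys/net/ipv4/tcp_min_snd_mss, clamped by proc_dointvec_minmax to [48, 65535]. A minimal userspace read, for illustration only:

/* illustration only: read the new sysctl from userspace */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_min_snd_mss", "r");
	int val;

	if (!f)
		return 1;
	if (fscanf(f, "%d", &val) == 1)
		printf("tcp_min_snd_mss = %d\n", val);   /* 48 unless tuned */
	fclose(f);
	return 0;
}
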
1 change: 1 addition & 0 deletions release/src-rt-6.x.4708/linux/linux-2.6.36/net/ipv4/tcp.c
@@ -3263,6 +3263,7 @@ void __init tcp_init(void)
 	int i, max_share, cnt;
 	unsigned long jiffy = jiffies;
 
+	BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
 
 	percpu_counter_init(&tcp_sockets_allocated, 0);

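Aside (not from the patch): the BUILD_BUG_ON fails the kernel build if the floor ever stops leaving room for payload beyond a full option block. A sketch of the same guard in plain C11:

/* illustration only: equivalent compile-time check in plain C11 */
#define MAX_TCP_OPTION_SPACE 40
#define TCP_MIN_SND_MSS      48

_Static_assert(TCP_MIN_SND_MSS > MAX_TCP_OPTION_SPACE,
	       "min send MSS must leave room for payload past the options");

int main(void) { return 0; }
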
32 changes: 25 additions & 7 deletions release/src-rt-6.x.4708/linux/linux-2.6.36/net/ipv4/tcp_input.c
@@ -1376,13 +1376,13 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 	return sacked;
 }
 
-static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
+static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+			   struct sk_buff *skb,
 			   struct tcp_sacktag_state *state,
 			   unsigned int pcount, int shifted, int mss,
 			   int dup_sack)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
 
 	BUG_ON(!pcount);
 
@@ -1396,6 +1396,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 
 	skb_shinfo(prev)->gso_segs += pcount;
 	BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
+	WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
 	skb_shinfo(skb)->gso_segs -= pcount;
 
 	/* When we're adding to gso_segs == 1, gso_size will be zero,
@@ -1463,6 +1464,21 @@ static int skb_can_shift(struct sk_buff *skb)
 	return !skb_headlen(skb) && skb_is_nonlinear(skb);
 }
 
+int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
+		  int pcount, int shiftlen)
+{
+	/* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
+	 * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
+	 * to make sure not storing more than 65535 * 8 bytes per skb,
+	 * even if current MSS is bigger.
+	 */
+	if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
+		return 0;
+	if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
+		return 0;
+	return skb_shift(to, from, shiftlen);
+}
+
 /* Try collapsing SACK blocks spanning across multiple skbs to a single
  * skb.
  */
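The comment's arithmetic, spelled out (illustration, not patch code): tcp_gso_segs is a 16-bit field, so with the 8-byte minimum gso_size an skb may carry at most 65535 * 8 bytes.

/* illustration only: the bound enforced by tcp_skb_shift() above */
#include <assert.h>

#define MAX_TCP_OPTION_SPACE 40
#define TCP_MIN_SND_MSS      48
#define TCP_MIN_GSO_SIZE     (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE)

int main(void)
{
	assert(TCP_MIN_GSO_SIZE == 8);              /* min payload/segment */
	assert(65535 * TCP_MIN_GSO_SIZE == 524280); /* max bytes per skb */
	return 0;
}
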
@@ -1474,6 +1490,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *prev;
 	int mss;
+	int next_pcount;
 	int pcount = 0;
 	int len;
 	int in_sack;
@@ -1564,9 +1581,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 		}
 	}
 
-	if (!skb_shift(prev, skb, len))
+	if (!tcp_skb_shift(prev, skb, pcount, len))
 		goto fallback;
-	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
+	if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
 		goto out;
 
 	/* Hole filled allows collapsing with the next as well, this is very
@@ -1583,9 +1600,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 		goto out;
 
 	len = skb->len;
-	if (skb_shift(prev, skb, len)) {
-		pcount += tcp_skb_pcount(skb);
-		tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
+	next_pcount = tcp_skb_pcount(skb);
+	if (tcp_skb_shift(prev, skb, next_pcount, len)) {
+		pcount += next_pcount;
+		tcp_shifted_skb(sk, prev, skb, state, next_pcount, len, mss, 0);
 	}
 
 out:

@@ -59,6 +59,7 @@ int sysctl_tcp_tso_win_divisor __read_mostly = 3;
 
 int sysctl_tcp_mtu_probing __read_mostly = 0;
 int sysctl_tcp_base_mss __read_mostly = 512;
+int sysctl_tcp_min_snd_mss __read_mostly = TCP_MIN_SND_MSS;
 
 /* By default, RFC2861 behavior. */
 int sysctl_tcp_slow_start_after_idle __read_mostly = 1;

@@ -1006,6 +1007,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	if (nsize < 0)
 		nsize = 0;
 
+	if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf)) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
+		return -ENOMEM;
+	}
+
 	if (skb_cloned(skb) &&
 	    skb_is_nonlinear(skb) &&
 	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
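Sketch (not patch code) of the overload test above: tcp_fragment() now refuses to split an skb once the bytes already queued on the socket exceed twice its send buffer, so a flood of tiny SACKed segments can no longer balloon the retransmit queue.

/* illustration only: the predicate guarding tcp_fragment() above */
#include <stdio.h>

static int write_queue_too_big(unsigned int wmem_queued, unsigned int sndbuf)
{
	return (wmem_queued >> 1) > sndbuf;   /* queued > 2 * sndbuf */
}

int main(void)
{
	/* hypothetical numbers: 512 KiB queued vs. a 128 KiB send buffer */
	printf("%d\n", write_queue_too_big(512 * 1024, 128 * 1024)); /* 1 */
	return 0;
}
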
@@ -1159,8 +1165,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)
 	mss_now -= icsk->icsk_ext_hdr_len;
 
 	/* Then reserve room for full set of TCP options and 8 bytes of data */
-	if (mss_now < 48)
-		mss_now = 48;
+	mss_now = max(mss_now, sysctl_tcp_min_snd_mss);
 
 	/* Now subtract TCP options size, not including SACKs */
 	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

@@ -127,6 +127,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
 		mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
 		mss = min(sysctl_tcp_base_mss, mss);
 		mss = max(mss, 68 - tp->tcp_header_len);
+		mss = max(mss, sysctl_tcp_min_snd_mss);
 		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
 		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 	}

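Sketch (not patch code) of the resulting clamp chain in tcp_mtu_probing(): halve the MSS at the current search floor, cap it at tcp_base_mss, then floor it at both the RFC minimum and the new tcp_min_snd_mss.

/* illustration only: the mss clamp chain from tcp_mtu_probing() above */
#include <stdio.h>

static int probe_mss(int search_low_mss, int base_mss,
		     int tcp_header_len, int min_snd_mss)
{
	int mss = search_low_mss >> 1;

	if (mss > base_mss)
		mss = base_mss;
	if (mss < 68 - tcp_header_len)
		mss = 68 - tcp_header_len;
	if (mss < min_snd_mss)
		mss = min_snd_mss;        /* floor added by this patch */
	return mss;
}

int main(void)
{
	/* hypothetical values after the probe search window collapsed */
	printf("%d\n", probe_mss(60, 512, 20, 48));   /* 30 -> floored to 48 */
	return 0;
}
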
