Skip to content

Commit 614e831

Browse files
edumazetdavem330
authored andcommitted
tcp: add support for usec resolution in TCP TS values
Back in 2015, Van Jacobson suggested using usec resolution in TCP TS values. This has been implemented in our private kernels. Goals were: 1) better observability of delays in networking stacks. 2) better disambiguation of events based on TSval/ecr values. 3) building block for congestion control modules needing usec resolution. Back then we implemented a scheme based on private SYN options to negotiate the feature. For upstream submission, we chose to use a route attribute, because this feature is probably going to be used in private networks [1] [2]. Example: ip route add 10/8 ... features tcp_usec_ts Note that RFC 7323 recommends a "timestamp clock frequency in the range 1 ms to 1 sec per tick.", but also mentions "the maximum acceptable clock frequency is one tick every 59 ns." [1] Unfortunately RFC 7323 5.5 (Outdated Timestamps) suggests invalidating TS.Recent values after a flow has been idle for more than 24 days. This is the part making usec_ts a problem for peers following this recommendation for long-lived idle flows. [2] Attempts to standardize usec ts went nowhere: https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf https://datatracker.ietf.org/doc/draft-wang-tcpm-low-latency-opt/ Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent af77214 commit 614e831

File tree

11 files changed

+82
-33
lines changed

11 files changed

+82
-33
lines changed

include/linux/tcp.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ struct tcp_request_sock {
152152
u64 snt_synack; /* first SYNACK sent time */
153153
bool tfo_listener;
154154
bool is_mptcp;
155+
s8 req_usec_ts;
155156
#if IS_ENABLED(CONFIG_MPTCP)
156157
bool drop_req;
157158
#endif
@@ -257,7 +258,8 @@ struct tcp_sock {
257258
u8 compressed_ack;
258259
u8 dup_ack_counter:2,
259260
tlp_retrans:1, /* TLP is a retransmission */
260-
unused:5;
261+
tcp_usec_ts:1, /* TSval values in usec */
262+
unused:4;
261263
u32 chrono_start; /* Start time in jiffies of a TCP chrono */
262264
u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
263265
u8 chrono_type:2, /* current chronograph type */

include/net/inet_timewait_sock.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,8 @@ struct inet_timewait_sock {
6767
/* And these are ours. */
6868
unsigned int tw_transparent : 1,
6969
tw_flowlabel : 20,
70-
tw_pad : 3, /* 3 bits hole */
70+
tw_usec_ts : 1,
71+
tw_pad : 2, /* 2 bits hole */
7172
tw_tos : 8;
7273
u32 tw_txhash;
7374
u32 tw_priority;

include/net/tcp.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -825,6 +825,8 @@ static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp)
825825

826826
static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp)
827827
{
828+
if (tp->tcp_usec_ts)
829+
return tp->tcp_mstamp;
828830
return tcp_time_stamp_ms(tp);
829831
}
830832

@@ -852,12 +854,12 @@ static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb)
852854

853855
static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw)
854856
{
855-
return tcp_clock_ts(false) + tcptw->tw_ts_offset;
857+
return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset;
856858
}
857859

858860
static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
859861
{
860-
return tcp_clock_ts(false) + treq->ts_off;
862+
return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off;
861863
}
862864

863865
#define tcp_flag_byte(th) (((u_int8_t *)th)[13])

net/ipv4/syncookies.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,9 @@ u64 cookie_init_timestamp(struct request_sock *req, u64 now)
8484
if (ts > ts_now)
8585
ts -= (1UL << TSBITS);
8686

87-
return ts * (NSEC_PER_SEC / TCP_TS_HZ);
87+
if (tcp_rsk(req)->req_usec_ts)
88+
return ts * NSEC_PER_USEC;
89+
return ts * NSEC_PER_MSEC;
8890
}
8991

9092

@@ -304,6 +306,8 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
304306
treq->af_specific = af_ops;
305307

306308
treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
309+
treq->req_usec_ts = -1;
310+
307311
#if IS_ENABLED(CONFIG_MPTCP)
308312
treq->is_mptcp = sk_is_mptcp(sk);
309313
if (treq->is_mptcp) {

net/ipv4/tcp.c

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3629,10 +3629,16 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
36293629
tp->fastopen_no_cookie = val;
36303630
break;
36313631
case TCP_TIMESTAMP:
3632-
if (!tp->repair)
3632+
if (!tp->repair) {
36333633
err = -EPERM;
3634-
else
3635-
WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(false));
3634+
break;
3635+
}
3636+
/* val is an opaque field,
3637+
* and low order bit contains usec_ts enable bit.
3638+
* Its a best effort, and we do not care if user makes an error.
3639+
*/
3640+
tp->tcp_usec_ts = val & 1;
3641+
WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts));
36363642
break;
36373643
case TCP_REPAIR_WINDOW:
36383644
err = tcp_repair_set_window(tp, optval, optlen);
@@ -4143,7 +4149,11 @@ int do_tcp_getsockopt(struct sock *sk, int level,
41434149
break;
41444150

41454151
case TCP_TIMESTAMP:
4146-
val = tcp_clock_ts(false) + READ_ONCE(tp->tsoffset);
4152+
val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset);
4153+
if (tp->tcp_usec_ts)
4154+
val |= 1;
4155+
else
4156+
val &= ~1;
41474157
break;
41484158
case TCP_NOTSENT_LOWAT:
41494159
val = READ_ONCE(tp->notsent_lowat);

net/ipv4/tcp_input.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,8 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
698698
u32 delta, delta_us;
699699

700700
delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr;
701+
if (tp->tcp_usec_ts)
702+
return delta;
701703

702704
if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
703705
if (!delta)
@@ -2452,7 +2454,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
24522454
const struct sk_buff *skb)
24532455
{
24542456
return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
2455-
tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(false, skb));
2457+
tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
24562458
}
24572459

24582460
/* Nothing was retransmitted or returned timestamp is less
@@ -7045,6 +7047,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
70457047
req->syncookie = want_cookie;
70467048
tcp_rsk(req)->af_specific = af_ops;
70477049
tcp_rsk(req)->ts_off = 0;
7050+
tcp_rsk(req)->req_usec_ts = -1;
70487051
#if IS_ENABLED(CONFIG_MPTCP)
70497052
tcp_rsk(req)->is_mptcp = 0;
70507053
#endif

net/ipv4/tcp_ipv4.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
296296
rt = NULL;
297297
goto failure;
298298
}
299+
tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
299300
/* OK, now commit destination to socket. */
300301
sk->sk_gso_type = SKB_GSO_TCPV4;
301302
sk_setup_caps(sk, &rt->dst);

net/ipv4/tcp_minisocks.c

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
300300
tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
301301
tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
302302
tcptw->tw_ts_offset = tp->tsoffset;
303+
tw->tw_usec_ts = tp->tcp_usec_ts;
303304
tcptw->tw_last_oow_ack_time = 0;
304305
tcptw->tw_tx_delay = tp->tcp_tx_delay;
305306
tw->tw_txhash = sk->sk_txhash;
@@ -554,21 +555,29 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
554555
newtp->max_window = newtp->snd_wnd;
555556

556557
if (newtp->rx_opt.tstamp_ok) {
558+
newtp->tcp_usec_ts = treq->req_usec_ts;
557559
newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent);
558560
newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
559561
newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
560562
} else {
563+
newtp->tcp_usec_ts = 0;
561564
newtp->rx_opt.ts_recent_stamp = 0;
562565
newtp->tcp_header_len = sizeof(struct tcphdr);
563566
}
564567
if (req->num_timeout) {
565-
newtp->undo_marker = treq->snt_isn;
566-
newtp->retrans_stamp = div_u64(treq->snt_synack,
567-
USEC_PER_SEC / TCP_TS_HZ);
568568
newtp->total_rto = req->num_timeout;
569+
newtp->undo_marker = treq->snt_isn;
570+
if (newtp->tcp_usec_ts) {
571+
newtp->retrans_stamp = treq->snt_synack;
572+
newtp->total_rto_time = (u32)(tcp_clock_us() -
573+
newtp->retrans_stamp) / USEC_PER_MSEC;
574+
} else {
575+
newtp->retrans_stamp = div_u64(treq->snt_synack,
576+
USEC_PER_SEC / TCP_TS_HZ);
577+
newtp->total_rto_time = tcp_clock_ms() -
578+
newtp->retrans_stamp;
579+
}
569580
newtp->total_rto_recoveries = 1;
570-
newtp->total_rto_time = tcp_clock_ms() -
571-
newtp->retrans_stamp;
572581
}
573582
newtp->tsoffset = treq->ts_off;
574583
#ifdef CONFIG_TCP_MD5SIG

net/ipv4/tcp_output.c

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -799,7 +799,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
799799

800800
if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) {
801801
opts->options |= OPTION_TS;
802-
opts->tsval = tcp_skb_timestamp_ts(false, skb) + tp->tsoffset;
802+
opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset;
803803
opts->tsecr = tp->rx_opt.ts_recent;
804804
remaining -= TCPOLEN_TSTAMP_ALIGNED;
805805
}
@@ -884,7 +884,8 @@ static unsigned int tcp_synack_options(const struct sock *sk,
884884
}
885885
if (likely(ireq->tstamp_ok)) {
886886
opts->options |= OPTION_TS;
887-
opts->tsval = tcp_skb_timestamp_ts(false, skb) + tcp_rsk(req)->ts_off;
887+
opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) +
888+
tcp_rsk(req)->ts_off;
888889
opts->tsecr = READ_ONCE(req->ts_recent);
889890
remaining -= TCPOLEN_TSTAMP_ALIGNED;
890891
}
@@ -943,7 +944,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
943944

944945
if (likely(tp->rx_opt.tstamp_ok)) {
945946
opts->options |= OPTION_TS;
946-
opts->tsval = skb ? tcp_skb_timestamp_ts(false, skb) + tp->tsoffset : 0;
947+
opts->tsval = skb ? tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) +
948+
tp->tsoffset : 0;
947949
opts->tsecr = tp->rx_opt.ts_recent;
948950
size += TCPOLEN_TSTAMP_ALIGNED;
949951
}
@@ -3379,7 +3381,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
33793381

33803382
/* Save stamp of the first (attempted) retransmit. */
33813383
if (!tp->retrans_stamp)
3382-
tp->retrans_stamp = tcp_skb_timestamp_ts(false, skb);
3384+
tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb);
33833385

33843386
if (tp->undo_retrans < 0)
33853387
tp->undo_retrans = 0;
@@ -3665,6 +3667,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
36653667
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
36663668

36673669
memset(&opts, 0, sizeof(opts));
3670+
if (tcp_rsk(req)->req_usec_ts < 0)
3671+
tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
36683672
now = tcp_clock_ns();
36693673
#ifdef CONFIG_SYN_COOKIES
36703674
if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))

net/ipv4/tcp_timer.c

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,18 @@
2626
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
2727
{
2828
struct inet_connection_sock *icsk = inet_csk(sk);
29-
u32 elapsed, start_ts, user_timeout;
29+
const struct tcp_sock *tp = tcp_sk(sk);
30+
u32 elapsed, user_timeout;
3031
s32 remaining;
3132

32-
start_ts = tcp_sk(sk)->retrans_stamp;
3333
user_timeout = READ_ONCE(icsk->icsk_user_timeout);
3434
if (!user_timeout)
3535
return icsk->icsk_rto;
36-
elapsed = tcp_time_stamp_ts(tcp_sk(sk)) - start_ts;
36+
37+
elapsed = tcp_time_stamp_ts(tp) - tp->retrans_stamp;
38+
if (tp->tcp_usec_ts)
39+
elapsed /= USEC_PER_MSEC;
40+
3741
remaining = user_timeout - elapsed;
3842
if (remaining <= 0)
3943
return 1; /* user timeout has passed; fire ASAP */
@@ -212,12 +216,13 @@ static bool retransmits_timed_out(struct sock *sk,
212216
unsigned int boundary,
213217
unsigned int timeout)
214218
{
215-
unsigned int start_ts;
219+
struct tcp_sock *tp = tcp_sk(sk);
220+
unsigned int start_ts, delta;
216221

217222
if (!inet_csk(sk)->icsk_retransmits)
218223
return false;
219224

220-
start_ts = tcp_sk(sk)->retrans_stamp;
225+
start_ts = tp->retrans_stamp;
221226
if (likely(timeout == 0)) {
222227
unsigned int rto_base = TCP_RTO_MIN;
223228

@@ -226,7 +231,12 @@ static bool retransmits_timed_out(struct sock *sk,
226231
timeout = tcp_model_timeout(sk, boundary, rto_base);
227232
}
228233

229-
return (s32)(tcp_time_stamp_ts(tcp_sk(sk)) - start_ts - timeout) >= 0;
234+
if (tp->tcp_usec_ts) {
235+
/* delta maybe off up to a jiffy due to timer granularity. */
236+
delta = tp->tcp_mstamp - start_ts + jiffies_to_usecs(1);
237+
return (s32)(delta - timeout * USEC_PER_MSEC) >= 0;
238+
}
239+
return (s32)(tcp_time_stamp_ts(tp) - start_ts - timeout) >= 0;
230240
}
231241

232242
/* A write timeout has occurred. Process the after effects. */
@@ -468,20 +478,18 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
468478
}
469479

470480
static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
471-
const struct sk_buff *skb)
481+
const struct sk_buff *skb,
482+
u32 rtx_delta)
472483
{
473484
const struct tcp_sock *tp = tcp_sk(sk);
474485
const int timeout = TCP_RTO_MAX * 2;
475-
u32 rcv_delta, rtx_delta;
486+
u32 rcv_delta;
476487

477488
rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp;
478489
if (rcv_delta <= timeout)
479490
return false;
480491

481-
rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp_ts(tp) -
482-
(tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb)));
483-
484-
return rtx_delta > timeout;
492+
return msecs_to_jiffies(rtx_delta) > timeout;
485493
}
486494

487495
/**
@@ -534,7 +542,11 @@ void tcp_retransmit_timer(struct sock *sk)
534542
struct inet_sock *inet = inet_sk(sk);
535543
u32 rtx_delta;
536544

537-
rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(false, skb));
545+
rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?:
546+
tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
547+
if (tp->tcp_usec_ts)
548+
rtx_delta /= USEC_PER_MSEC;
549+
538550
if (sk->sk_family == AF_INET) {
539551
net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
540552
&inet->inet_daddr, ntohs(inet->inet_dport),
@@ -551,7 +563,7 @@ void tcp_retransmit_timer(struct sock *sk)
551563
rtx_delta);
552564
}
553565
#endif
554-
if (tcp_rtx_probe0_timed_out(sk, skb)) {
566+
if (tcp_rtx_probe0_timed_out(sk, skb, rtx_delta)) {
555567
tcp_write_err(sk);
556568
goto out;
557569
}

0 commit comments

Comments
 (0)