Skip to content

Commit 41063e9

Browse files
committed
ipv4: Early TCP socket demux.
Input packet processing for local sockets involves two major demuxes. One for the route and one for the socket. But we can optimize this down to one demux for certain kinds of local sockets. Currently we only do this for established TCP sockets, but it could, at least in theory, be expanded to other kinds of connections. If a TCP socket is established then its identity is fully specified. This means that whatever input route was used during the three-way handshake must work equally well for the rest of the connection since the keys will not change. Once we move to established state, we cache the receive packet's input route to use later. Like the existing cached route in sk->sk_dst_cache used for output packets, we have to check for route invalidations using dst->obsolete and dst->ops->check(). Early demux occurs outside of a socket-locked section, so when a route invalidation occurs we defer the fixup of sk->sk_rx_dst until we are actually inside of established state packet processing and thus have the socket locked. Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent f9242b6 commit 41063e9

File tree

10 files changed

+110
-24
lines changed

10 files changed

+110
-24
lines changed

include/net/inet_hashtables.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -379,10 +379,10 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
379379
const __be16 sport,
380380
const __be16 dport)
381381
{
382-
struct sock *sk;
382+
struct sock *sk = skb_steal_sock(skb);
383383
const struct iphdr *iph = ip_hdr(skb);
384384

385-
if (unlikely(sk = skb_steal_sock(skb)))
385+
if (sk)
386386
return sk;
387387
else
388388
return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo,

include/net/protocol.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737

3838
/* This is used to register protocols. */
3939
struct net_protocol {
40+
int (*early_demux)(struct sk_buff *skb);
4041
int (*handler)(struct sk_buff *skb);
4142
void (*err_handler)(struct sk_buff *skb, u32 info);
4243
int (*gso_send_check)(struct sk_buff *skb);

include/net/sock.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,7 @@ struct sock {
319319
unsigned long sk_flags;
320320
struct dst_entry *sk_dst_cache;
321321
spinlock_t sk_dst_lock;
322+
struct dst_entry *sk_rx_dst;
322323
atomic_t sk_wmem_alloc;
323324
atomic_t sk_omem_alloc;
324325
int sk_sndbuf;
@@ -1426,6 +1427,7 @@ extern struct sk_buff *sock_rmalloc(struct sock *sk,
14261427
gfp_t priority);
14271428
extern void sock_wfree(struct sk_buff *skb);
14281429
extern void sock_rfree(struct sk_buff *skb);
1430+
extern void sock_edemux(struct sk_buff *skb);
14291431

14301432
extern int sock_setsockopt(struct socket *sock, int level,
14311433
int op, char __user *optval,

include/net/tcp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ extern void tcp_v4_err(struct sk_buff *skb, u32);
325325

326326
extern void tcp_shutdown (struct sock *sk, int how);
327327

328+
extern int tcp_v4_early_demux(struct sk_buff *skb);
328329
extern int tcp_v4_rcv(struct sk_buff *skb);
329330

330331
extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);

net/core/sock.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1465,6 +1465,11 @@ void sock_rfree(struct sk_buff *skb)
14651465
}
14661466
EXPORT_SYMBOL(sock_rfree);
14671467

1468+
void sock_edemux(struct sk_buff *skb)
1469+
{
1470+
sock_put(skb->sk);
1471+
}
1472+
EXPORT_SYMBOL(sock_edemux);
14681473

14691474
int sock_i_uid(struct sock *sk)
14701475
{

net/ipv4/af_inet.c

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
157157

158158
kfree(rcu_dereference_protected(inet->inet_opt, 1));
159159
dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
160+
dst_release(sk->sk_rx_dst);
160161
sk_refcnt_debug_dec(sk);
161162
}
162163
EXPORT_SYMBOL(inet_sock_destruct);
@@ -1518,14 +1519,15 @@ static const struct net_protocol igmp_protocol = {
15181519
#endif
15191520

15201521
static const struct net_protocol tcp_protocol = {
1521-
.handler = tcp_v4_rcv,
1522-
.err_handler = tcp_v4_err,
1523-
.gso_send_check = tcp_v4_gso_send_check,
1524-
.gso_segment = tcp_tso_segment,
1525-
.gro_receive = tcp4_gro_receive,
1526-
.gro_complete = tcp4_gro_complete,
1527-
.no_policy = 1,
1528-
.netns_ok = 1,
1522+
.early_demux = tcp_v4_early_demux,
1523+
.handler = tcp_v4_rcv,
1524+
.err_handler = tcp_v4_err,
1525+
.gso_send_check = tcp_v4_gso_send_check,
1526+
.gso_segment = tcp_tso_segment,
1527+
.gro_receive = tcp4_gro_receive,
1528+
.gro_complete = tcp4_gro_complete,
1529+
.no_policy = 1,
1530+
.netns_ok = 1,
15291531
};
15301532

15311533
static const struct net_protocol udp_protocol = {

net/ipv4/ip_input.c

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -323,19 +323,32 @@ static int ip_rcv_finish(struct sk_buff *skb)
323323
* how the packet travels inside Linux networking.
324324
*/
325325
if (skb_dst(skb) == NULL) {
326-
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
327-
iph->tos, skb->dev);
328-
if (unlikely(err)) {
329-
if (err == -EHOSTUNREACH)
330-
IP_INC_STATS_BH(dev_net(skb->dev),
331-
IPSTATS_MIB_INADDRERRORS);
332-
else if (err == -ENETUNREACH)
333-
IP_INC_STATS_BH(dev_net(skb->dev),
334-
IPSTATS_MIB_INNOROUTES);
335-
else if (err == -EXDEV)
336-
NET_INC_STATS_BH(dev_net(skb->dev),
337-
LINUX_MIB_IPRPFILTER);
338-
goto drop;
326+
const struct net_protocol *ipprot;
327+
int protocol = iph->protocol;
328+
int err;
329+
330+
rcu_read_lock();
331+
ipprot = rcu_dereference(inet_protos[protocol]);
332+
err = -ENOENT;
333+
if (ipprot && ipprot->early_demux)
334+
err = ipprot->early_demux(skb);
335+
rcu_read_unlock();
336+
337+
if (err) {
338+
err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
339+
iph->tos, skb->dev);
340+
if (unlikely(err)) {
341+
if (err == -EHOSTUNREACH)
342+
IP_INC_STATS_BH(dev_net(skb->dev),
343+
IPSTATS_MIB_INADDRERRORS);
344+
else if (err == -ENETUNREACH)
345+
IP_INC_STATS_BH(dev_net(skb->dev),
346+
IPSTATS_MIB_INNOROUTES);
347+
else if (err == -EXDEV)
348+
NET_INC_STATS_BH(dev_net(skb->dev),
349+
LINUX_MIB_IPRPFILTER);
350+
goto drop;
351+
}
339352
}
340353
}
341354

net/ipv4/tcp_input.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5518,6 +5518,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
55185518
struct tcp_sock *tp = tcp_sk(sk);
55195519
int res;
55205520

5521+
if (sk->sk_rx_dst) {
5522+
struct dst_entry *dst = sk->sk_rx_dst;
5523+
if (unlikely(dst->obsolete)) {
5524+
if (dst->ops->check(dst, 0) == NULL) {
5525+
dst_release(dst);
5526+
sk->sk_rx_dst = NULL;
5527+
}
5528+
}
5529+
}
5530+
if (unlikely(sk->sk_rx_dst == NULL))
5531+
sk->sk_rx_dst = dst_clone(skb_dst(skb));
5532+
55215533
/*
55225534
* Header prediction.
55235535
* The code loosely follows the one in the famous
@@ -5729,8 +5741,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
57295741

57305742
tcp_set_state(sk, TCP_ESTABLISHED);
57315743

5732-
if (skb != NULL)
5744+
if (skb != NULL) {
5745+
sk->sk_rx_dst = dst_clone(skb_dst(skb));
57335746
security_inet_conn_established(sk, skb);
5747+
}
57345748

57355749
/* Make sure socket is routed, for correct metrics. */
57365750
icsk->icsk_af_ops->rebuild_header(sk);

net/ipv4/tcp_ipv4.c

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1671,6 +1671,52 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
16711671
}
16721672
EXPORT_SYMBOL(tcp_v4_do_rcv);
16731673

1674+
int tcp_v4_early_demux(struct sk_buff *skb)
1675+
{
1676+
struct net *net = dev_net(skb->dev);
1677+
const struct iphdr *iph;
1678+
const struct tcphdr *th;
1679+
struct sock *sk;
1680+
int err;
1681+
1682+
err = -ENOENT;
1683+
if (skb->pkt_type != PACKET_HOST)
1684+
goto out_err;
1685+
1686+
if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1687+
goto out_err;
1688+
1689+
iph = ip_hdr(skb);
1690+
th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1691+
1692+
if (th->doff < sizeof(struct tcphdr) / 4)
1693+
goto out_err;
1694+
1695+
if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
1696+
goto out_err;
1697+
1698+
sk = __inet_lookup_established(net, &tcp_hashinfo,
1699+
iph->saddr, th->source,
1700+
iph->daddr, th->dest,
1701+
skb->dev->ifindex);
1702+
if (sk) {
1703+
skb->sk = sk;
1704+
skb->destructor = sock_edemux;
1705+
if (sk->sk_state != TCP_TIME_WAIT) {
1706+
struct dst_entry *dst = sk->sk_rx_dst;
1707+
if (dst)
1708+
dst = dst_check(dst, 0);
1709+
if (dst) {
1710+
skb_dst_set_noref(skb, dst);
1711+
err = 0;
1712+
}
1713+
}
1714+
}
1715+
1716+
out_err:
1717+
return err;
1718+
}
1719+
16741720
/*
16751721
* From tcp_input.c
16761722
*/

net/ipv4/tcp_minisocks.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
445445
struct tcp_sock *oldtp = tcp_sk(sk);
446446
struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
447447

448+
newsk->sk_rx_dst = dst_clone(skb_dst(skb));
449+
448450
/* TCP Cookie Transactions require space for the cookie pair,
449451
* as it differs for each connection. There is no need to
450452
* copy any s_data_payload stored at the original socket.

0 commit comments

Comments
 (0)