Skip to content

Commit 62fa8a8

Browse files
committed
net: Implement read-only protection and COW'ing of metrics.
Routing metrics are now copy-on-write. Initially a route entry points its metrics at a read-only location. If a routing table entry exists, it will point there. Else it will point at the all zero metric place-holder called 'dst_default_metrics'. The writeability state of the metrics is stored in the low bits of the metrics pointer, we have two bits left to spare if we want to store more states. For the initial implementation, COW is implemented simply via kmalloc. However future enhancements will change this to place the writable metrics somewhere else, in order to increase sharing. Very likely this "somewhere else" will be the inetpeer cache. Note also that this means that metrics updates may transiently fail if we cannot COW the metrics successfully. But even by itself, this patch should decrease memory usage and increase cache locality especially for routing workloads. In those cases the read-only metric copies stay in place and never get written to. TCP workloads where metrics get updated, and those rare cases where PMTU triggers occur, will take a very slight performance hit. But that hit will be alleviated when the long-term writable metrics move to a more sharable location. Since the metrics storage went from a u32 array of RTAX_MAX entries to what is essentially a pointer, some retooling of the dst_entry layout was necessary. Most importantly, we need to preserve the alignment of the reference count so that it doesn't share cache lines with the read-mostly state, as per Eric Dumazet's alignment assertion checks. The only non-trivial bit here is the move of the 'flags' member into the writeable cacheline. This is OK since we are always accessing the flags around the same moment when we made a modification to the reference count. Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent b4e69ac commit 62fa8a8

File tree

9 files changed

+194
-46
lines changed

9 files changed

+194
-46
lines changed

include/net/dst.h

Lines changed: 77 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -40,24 +40,10 @@ struct dst_entry {
4040
struct rcu_head rcu_head;
4141
struct dst_entry *child;
4242
struct net_device *dev;
43-
short error;
44-
short obsolete;
45-
int flags;
46-
#define DST_HOST 0x0001
47-
#define DST_NOXFRM 0x0002
48-
#define DST_NOPOLICY 0x0004
49-
#define DST_NOHASH 0x0008
50-
#define DST_NOCACHE 0x0010
43+
struct dst_ops *ops;
44+
unsigned long _metrics;
5145
unsigned long expires;
52-
53-
unsigned short header_len; /* more space at head required */
54-
unsigned short trailer_len; /* space to reserve at tail */
55-
56-
unsigned int rate_tokens;
57-
unsigned long rate_last; /* rate limiting for ICMP */
58-
5946
struct dst_entry *path;
60-
6147
struct neighbour *neighbour;
6248
struct hh_cache *hh;
6349
#ifdef CONFIG_XFRM
@@ -68,17 +54,16 @@ struct dst_entry {
6854
int (*input)(struct sk_buff*);
6955
int (*output)(struct sk_buff*);
7056

71-
struct dst_ops *ops;
72-
73-
u32 _metrics[RTAX_MAX];
74-
57+
short error;
58+
short obsolete;
59+
unsigned short header_len; /* more space at head required */
60+
unsigned short trailer_len; /* space to reserve at tail */
7561
#ifdef CONFIG_IP_ROUTE_CLASSID
7662
__u32 tclassid;
7763
#else
7864
__u32 __pad2;
7965
#endif
8066

81-
8267
/*
8368
* Align __refcnt to a 64 bytes alignment
8469
* (L1_CACHE_SIZE would be too much)
@@ -93,6 +78,14 @@ struct dst_entry {
9378
atomic_t __refcnt; /* client references */
9479
int __use;
9580
unsigned long lastuse;
81+
unsigned long rate_last; /* rate limiting for ICMP */
82+
unsigned int rate_tokens;
83+
int flags;
84+
#define DST_HOST 0x0001
85+
#define DST_NOXFRM 0x0002
86+
#define DST_NOPOLICY 0x0004
87+
#define DST_NOHASH 0x0008
88+
#define DST_NOCACHE 0x0010
9689
union {
9790
struct dst_entry *next;
9891
struct rtable __rcu *rt_next;
@@ -103,10 +96,69 @@ struct dst_entry {
10396

10497
#ifdef __KERNEL__
10598

99+
extern u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old);
100+
101+
#define DST_METRICS_READ_ONLY 0x1UL
102+
#define __DST_METRICS_PTR(Y) \
103+
((u32 *)((Y) & ~DST_METRICS_READ_ONLY))
104+
#define DST_METRICS_PTR(X) __DST_METRICS_PTR((X)->_metrics)
105+
106+
/* Report whether the DST_METRICS_READ_ONLY bit is set in dst->_metrics. */
static inline bool dst_metrics_read_only(const struct dst_entry *dst)
107+
{
108+
/* _metrics is an unsigned long carrying the metrics pointer plus this flag bit. */
return dst->_metrics & DST_METRICS_READ_ONLY;
109+
}
110+
111+
extern void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old);
112+
113+
/*
 * Hand the metrics of @dst to __dst_destroy_metrics_generic(), but only
 * when the DST_METRICS_READ_ONLY bit is clear. Read-only metrics are
 * left untouched.
 */
static inline void dst_destroy_metrics_generic(struct dst_entry *dst)
114+
{
115+
/* Snapshot once so the flag test and the value passed on agree. */
unsigned long val = dst->_metrics;
116+
if (!(val & DST_METRICS_READ_ONLY))
117+
__dst_destroy_metrics_generic(dst, val);
118+
}
119+
120+
/*
 * Return a pointer through which the metrics of @dst may be written.
 *
 * If the metrics are currently flagged read-only, the result of the
 * dst_ops ->cow_metrics() callback is returned instead; the callbacks
 * visible in this change can return NULL, so callers must check.
 */
static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst)
121+
{
122+
unsigned long p = dst->_metrics;
123+
124+
if (p & DST_METRICS_READ_ONLY)
125+
return dst->ops->cow_metrics(dst, p);
126+
/* Flag bit is clear here; the mask is applied for uniformity. */
return __DST_METRICS_PTR(p);
127+
}
128+
129+
/* This may only be invoked before the entry has reached global
130+
* visibility.
131+
*/
132+
/*
 * Point @dst at @src_metrics with a plain store, recording @read_only in
 * the DST_METRICS_READ_ONLY bit of the stored value. No copy is made
 * and the previous value of dst->_metrics is simply overwritten.
 */
static inline void dst_init_metrics(struct dst_entry *dst,
133+
const u32 *src_metrics,
134+
bool read_only)
135+
{
136+
dst->_metrics = ((unsigned long) src_metrics) |
137+
(read_only ? DST_METRICS_READ_ONLY : 0);
138+
}
139+
140+
/*
 * Copy all RTAX_MAX metric values from @src into @dest.
 *
 * A writable pointer for @dest is obtained via dst_metrics_write_ptr();
 * if that yields NULL the copy is silently skipped.
 */
static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src)
141+
{
142+
u32 *dst_metrics = dst_metrics_write_ptr(dest);
143+
144+
if (dst_metrics) {
145+
/* Reading only needs the pointer with the flag bit masked off. */
u32 *src_metrics = DST_METRICS_PTR(src);
146+
147+
memcpy(dst_metrics, src_metrics, RTAX_MAX * sizeof(u32));
148+
}
149+
}
150+
151+
/*
 * Return the metrics array of @dst with the flag bit masked off.
 * The read-only state is not checked here and no copy is triggered.
 */
static inline u32 *dst_metrics_ptr(struct dst_entry *dst)
152+
{
153+
return DST_METRICS_PTR(dst);
154+
}
155+
106156
static inline u32
107157
dst_metric_raw(const struct dst_entry *dst, const int metric)
108158
{
109-
return dst->_metrics[metric-1];
159+
u32 *p = DST_METRICS_PTR(dst);
160+
161+
return p[metric-1];
110162
}
111163

112164
static inline u32
@@ -131,22 +183,10 @@ dst_metric_advmss(const struct dst_entry *dst)
131183

132184
static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val)
133185
{
134-
dst->_metrics[metric-1] = val;
135-
}
136-
137-
static inline void dst_import_metrics(struct dst_entry *dst, const u32 *src_metrics)
138-
{
139-
memcpy(dst->_metrics, src_metrics, RTAX_MAX * sizeof(u32));
140-
}
186+
u32 *p = dst_metrics_write_ptr(dst);
141187

142-
static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src)
143-
{
144-
dst_import_metrics(dest, src->_metrics);
145-
}
146-
147-
static inline u32 *dst_metrics_ptr(struct dst_entry *dst)
148-
{
149-
return dst->_metrics;
188+
if (p)
189+
p[metric-1] = val;
150190
}
151191

152192
static inline u32

include/net/dst_ops.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ struct dst_ops {
1818
struct dst_entry * (*check)(struct dst_entry *, __u32 cookie);
1919
unsigned int (*default_advmss)(const struct dst_entry *);
2020
unsigned int (*default_mtu)(const struct dst_entry *);
21+
u32 * (*cow_metrics)(struct dst_entry *, unsigned long);
2122
void (*destroy)(struct dst_entry *);
2223
void (*ifdown)(struct dst_entry *,
2324
struct net_device *dev, int how);

include/net/route.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949

5050
struct fib_nh;
5151
struct inet_peer;
52+
struct fib_info;
5253
struct rtable {
5354
struct dst_entry dst;
5455

@@ -69,6 +70,7 @@ struct rtable {
6970
/* Miscellaneous cached information */
7071
__be32 rt_spec_dst; /* RFC1122 specific destination */
7172
struct inet_peer *peer; /* long-living peer info */
73+
struct fib_info *fi; /* for client ref to shared metrics */
7274
};
7375

7476
static inline bool rt_is_input_route(struct rtable *rt)

net/core/dst.c

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,8 @@ int dst_discard(struct sk_buff *skb)
164164
}
165165
EXPORT_SYMBOL(dst_discard);
166166

167+
static const u32 dst_default_metrics[RTAX_MAX];
168+
167169
void *dst_alloc(struct dst_ops *ops)
168170
{
169171
struct dst_entry *dst;
@@ -180,6 +182,7 @@ void *dst_alloc(struct dst_ops *ops)
180182
dst->lastuse = jiffies;
181183
dst->path = dst;
182184
dst->input = dst->output = dst_discard;
185+
dst_init_metrics(dst, dst_default_metrics, true);
183186
#if RT_CACHE_DEBUG >= 2
184187
atomic_inc(&dst_total);
185188
#endif
@@ -282,6 +285,42 @@ void dst_release(struct dst_entry *dst)
282285
}
283286
EXPORT_SYMBOL(dst_release);
284287

288+
/*
 * Generic ->cow_metrics() implementation: replace the metrics of @dst
 * with a private kmalloc'ed copy.
 *
 * @old is the value of dst->_metrics the caller observed. Returns the
 * metrics pointer now usable for writing, or NULL if the allocation
 * failed or if dst->_metrics changed underneath us to another value
 * that is still flagged read-only.
 */
u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
289+
{
290+
u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
291+
292+
if (p) {
293+
u32 *old_p = __DST_METRICS_PTR(old);
294+
unsigned long prev, new;
295+
296+
memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
297+
298+
/* Stored without DST_METRICS_READ_ONLY, i.e. marked writable. */
new = (unsigned long) p;
299+
prev = cmpxchg(&dst->_metrics, old, new);
300+
301+
if (prev != old) {
302+
/* Exchange did not happen: discard our copy, report what is installed. */
kfree(p);
303+
p = __DST_METRICS_PTR(prev);
304+
if (prev & DST_METRICS_READ_ONLY)
305+
p = NULL;
306+
}
307+
}
308+
return p;
309+
}
310+
EXPORT_SYMBOL(dst_cow_metrics_generic);
311+
312+
/* Caller asserts that dst_metrics_read_only(dst) is false. */
313+
/*
 * Free the writable metrics of @dst and point the entry back at the
 * shared all-zero dst_default_metrics array.
 *
 * @old is the (writable) value of dst->_metrics the caller observed.
 * The array is only freed if the cmpxchg() succeeds, so only the caller
 * that performs the swap frees it.
 */
void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
314+
{
315+
unsigned long prev, new;
316+
317+
/*
 * dst_default_metrics is static const and was never kmalloc'ed, so it
 * must be installed with DST_METRICS_READ_ONLY set (as dst_alloc()
 * does). Without the flag, a later write would go into the const
 * array and a second destroy would kfree() it.
 */
new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY;
318+
prev = cmpxchg(&dst->_metrics, old, new);
319+
if (prev == old)
320+
kfree(__DST_METRICS_PTR(old));
321+
}
322+
EXPORT_SYMBOL(__dst_destroy_metrics_generic);
323+
285324
/**
286325
* skb_dst_set_noref - sets skb dst, without a reference
287326
* @skb: buffer

net/decnet/dn_route.c

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ static int dn_dst_gc(struct dst_ops *ops);
112112
static struct dst_entry *dn_dst_check(struct dst_entry *, __u32);
113113
static unsigned int dn_dst_default_advmss(const struct dst_entry *dst);
114114
static unsigned int dn_dst_default_mtu(const struct dst_entry *dst);
115+
static void dn_dst_destroy(struct dst_entry *);
115116
static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
116117
static void dn_dst_link_failure(struct sk_buff *);
117118
static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu);
@@ -133,11 +134,18 @@ static struct dst_ops dn_dst_ops = {
133134
.check = dn_dst_check,
134135
.default_advmss = dn_dst_default_advmss,
135136
.default_mtu = dn_dst_default_mtu,
137+
.cow_metrics = dst_cow_metrics_generic,
138+
.destroy = dn_dst_destroy,
136139
.negative_advice = dn_dst_negative_advice,
137140
.link_failure = dn_dst_link_failure,
138141
.update_pmtu = dn_dst_update_pmtu,
139142
};
140143

144+
/*
 * ->destroy() callback of dn_dst_ops: hands @dst to
 * dst_destroy_metrics_generic() and does nothing else.
 */
static void dn_dst_destroy(struct dst_entry *dst)
145+
{
146+
dst_destroy_metrics_generic(dst);
147+
}
148+
141149
static __inline__ unsigned dn_hash(__le16 src, __le16 dst)
142150
{
143151
__u16 tmp = (__u16 __force)(src ^ dst);
@@ -814,14 +822,14 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
814822
{
815823
struct dn_fib_info *fi = res->fi;
816824
struct net_device *dev = rt->dst.dev;
825+
unsigned int mss_metric;
817826
struct neighbour *n;
818-
unsigned int metric;
819827

820828
if (fi) {
821829
if (DN_FIB_RES_GW(*res) &&
822830
DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
823831
rt->rt_gateway = DN_FIB_RES_GW(*res);
824-
dst_import_metrics(&rt->dst, fi->fib_metrics);
832+
dst_init_metrics(&rt->dst, fi->fib_metrics, true);
825833
}
826834
rt->rt_type = res->type;
827835

@@ -834,10 +842,10 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
834842

835843
if (dst_metric(&rt->dst, RTAX_MTU) > rt->dst.dev->mtu)
836844
dst_metric_set(&rt->dst, RTAX_MTU, rt->dst.dev->mtu);
837-
metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS);
838-
if (metric) {
845+
mss_metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS);
846+
if (mss_metric) {
839847
unsigned int mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->dst));
840-
if (metric > mss)
848+
if (mss_metric > mss)
841849
dst_metric_set(&rt->dst, RTAX_ADVMSS, mss);
842850
}
843851
return 0;

net/ipv4/route.c

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,13 +152,44 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
152152
{
153153
}
154154

155+
/*
 * IPv4 ->cow_metrics() implementation: replace the metrics of @dst with
 * a private kmalloc'ed copy, and on success drop the route's fib_info
 * reference.
 *
 * @old is the value of dst->_metrics the caller observed. Returns the
 * metrics pointer now usable for writing, or NULL if the allocation
 * failed or if dst->_metrics changed underneath us to another value
 * that is still flagged read-only.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
156+
{
157+
u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
158+
159+
if (p) {
160+
u32 *old_p = __DST_METRICS_PTR(old);
161+
unsigned long prev, new;
162+
163+
memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
164+
165+
/* Stored without DST_METRICS_READ_ONLY, i.e. marked writable. */
new = (unsigned long) p;
166+
prev = cmpxchg(&dst->_metrics, old, new);
167+
168+
if (prev != old) {
169+
/* Exchange did not happen: discard our copy, report what is installed. */
kfree(p);
170+
p = __DST_METRICS_PTR(prev);
171+
if (prev & DST_METRICS_READ_ONLY)
172+
p = NULL;
173+
} else {
174+
struct rtable *rt = (struct rtable *) dst;
175+
176+
/*
 * Our private copy is installed. rt->fi is described as the
 * "client ref to shared metrics"; presumably it is no longer
 * needed once the route stops pointing at fi->fib_metrics,
 * so it is released here.
 */
if (rt->fi) {
177+
fib_info_put(rt->fi);
178+
rt->fi = NULL;
179+
}
180+
}
181+
}
182+
return p;
183+
}
184+
155185
static struct dst_ops ipv4_dst_ops = {
156186
.family = AF_INET,
157187
.protocol = cpu_to_be16(ETH_P_IP),
158188
.gc = rt_garbage_collect,
159189
.check = ipv4_dst_check,
160190
.default_advmss = ipv4_default_advmss,
161191
.default_mtu = ipv4_default_mtu,
192+
.cow_metrics = ipv4_cow_metrics,
162193
.destroy = ipv4_dst_destroy,
163194
.ifdown = ipv4_dst_ifdown,
164195
.negative_advice = ipv4_negative_advice,
@@ -1441,6 +1472,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
14411472

14421473
if (rt->peer)
14431474
atomic_inc(&rt->peer->refcnt);
1475+
if (rt->fi)
1476+
atomic_inc(&rt->fi->fib_clntref);
14441477

14451478
if (arp_bind_neighbour(&rt->dst) ||
14461479
!(rt->dst.neighbour->nud_state &
@@ -1720,6 +1753,11 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
17201753
struct rtable *rt = (struct rtable *) dst;
17211754
struct inet_peer *peer = rt->peer;
17221755

1756+
dst_destroy_metrics_generic(dst);
1757+
if (rt->fi) {
1758+
fib_info_put(rt->fi);
1759+
rt->fi = NULL;
1760+
}
17231761
if (peer) {
17241762
rt->peer = NULL;
17251763
inet_putpeer(peer);
@@ -1824,7 +1862,9 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
18241862
if (FIB_RES_GW(*res) &&
18251863
FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
18261864
rt->rt_gateway = FIB_RES_GW(*res);
1827-
dst_import_metrics(dst, fi->fib_metrics);
1865+
rt->fi = fi;
1866+
atomic_inc(&fi->fib_clntref);
1867+
dst_init_metrics(dst, fi->fib_metrics, true);
18281868
#ifdef CONFIG_IP_ROUTE_CLASSID
18291869
dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
18301870
#endif
@@ -2752,6 +2792,9 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
27522792
rt->peer = ort->peer;
27532793
if (rt->peer)
27542794
atomic_inc(&rt->peer->refcnt);
2795+
rt->fi = ort->fi;
2796+
if (rt->fi)
2797+
atomic_inc(&rt->fi->fib_clntref);
27552798

27562799
dst_free(new);
27572800
}

0 commit comments

Comments
 (0)